## Naive Bayes

In [39]:
import pandas as pd

In [40]:
df = pd.read_csv('data/derecha_econ_news.csv')

In [41]:
inflation = df.loc[df.title.str.contains('inflacion|inflación|emisión|salarios|salario|indec', na=False), 'body']
inflation.dropna(inplace=True)
inflation = inflation.reset_index()
inflation.insert(loc=1, column='topics', value='inflation')

In [42]:
exchange = df.loc[df.title.str.contains('dolar|dólar|dolares|dólares|dolarización|cambio|cepo', na=False), 'body']
exchange.dropna(inplace=True)
exchange = exchange.reset_index()
exchange.insert(loc=1, column='topics', value='exchange')

In [43]:
corpus = inflation.append(exchange)
corpus = corpus.drop_duplicates(subset='index',keep=False)
corpus.shape

(61, 3)

#### Train - Test Split

In [51]:
from sklearn.model_selection import train_test_split

In [75]:
train,test,y_train,y_test = train_test_split(corpus.iloc[:,2], corpus.iloc[:,1], train_size=0.75); 

In [76]:
train.shape, test.shape

((45,), (16,))

#### Tokenization

In [164]:
import re
import unidecode
from nltk.tokenize import word_tokenize
import numpy as np
from nltk.corpus import stopwords

In [165]:
stopwords_sp = stopwords.words('spanish')

In [221]:
def tokenizer(texto):

    alphanumeric = re.sub(r'([^\s\w]|_)+', '', texto).lower()
    no_accents = unidecode.unidecode(alphanumeric)
    
    tockens = word_tokenize(no_accents)
    
    tockens_clean = [tocken for tocken in tockens if tocken not in stopwords_sp and tocken.isalpha()]
    
    terminos = tockens_clean

    return terminos

In [222]:
def token_matrix_maker(data, vocabulario):
    matriz = np.zeros(shape = (len(data), len(vocabulario)), dtype='int')
    for i, documento in enumerate(data):
        for termino in documento:
            matriz[i, vocabulario.index(termino)] += 1
    return matriz

In [230]:
X_train = [tokenizer(documento) for documento in train]
vocabulario = sorted(list(set([word for group in [tokenizer(documento) for documento in corpus.iloc[:,2]] for word in group])))

In [231]:
X_train_matriz = token_matrix_maker(X_train, vocabulario)

In [232]:
X_test = [tokenizer(documento) for documento in test]

In [233]:
X_test_matriz = token_matrix_maker(X_test, vocabulario)

Vectorización con TF-IDF

In [234]:
from sklearn.feature_extraction.text import TfidfTransformer

In [235]:
X_train_matriz_tfidf = TfidfTransformer().fit_transform(X_train_matriz);

In [236]:
X_test_matriz_tfidf = TfidfTransformer().fit_transform(X_test_matriz);

Modelo + CountVectorizer

In [237]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [238]:
multi_nb = MultinomialNB()

multi_nb.fit(X_train_matriz, y_train.values)

MultinomialNB()

In [239]:
y_pred = multi_nb.predict(X_test_matriz)

In [240]:
from sklearn.metrics import accuracy_score

In [241]:
accuracy_score(y_pred, y_test.values)

0.875

Modelo + TfIDf

In [242]:
multi_nb_2 = MultinomialNB()

multi_nb_2.fit(X_test_matriz, y_train.values)

MultinomialNB()

In [244]:
y_pred = multi_nb_2.predict(X_test_matriz)

In [245]:
from sklearn.metrics import accuracy_score

In [246]:
accuracy_score(y_pred, y_test.values)

0.625