## Naive Bayes

In [1]:
import pandas as pd
import pickle

In [2]:
data = pickle.load(open('data/df_clean_izq.pkl','rb'))

In [3]:
data.sample(5)

Unnamed: 0,topics,date,flyer,lead,headline,body,url
11,inflation,\nMiércoles 30 de septiembre,COSTO DE VIDA,La cifra surge del Índice de salarios de julio...,Los salarios perdieron casi 5 % contra la infl...,El Indec publicó este miércoles el Indice de s...,https://www.laizquierdadiario.com/Economia/Los...
46,exchange,\nMiércoles 21 de octubre,TENSIÓN CAMBIARIA,Este miércoles anota una nueva jornada de suba...,Los especuladores no dan tregua: el dólar fina...,La cotización de la divisa norteamericana sigu...,https://www.laizquierdadiario.com/Economia/Los...
50,exchange,\nViernes 9 de octubre,TIPO DE CAMBIO,La divisa paralela se vende a $ 167 y acumula ...,Nuevo récord: el dólar blue vuelve a subir,La tensión cambiaria no se detiene. El dólar b...,https://www.laizquierdadiario.com/Economia/Nue...
5,inflation,\nMartes 17 de noviembre,INFORME INDEC,"El Indec informó que la Canasta Básica Total, ...",Imposible llegar a fin de mes: el costo de vid...,El Indec informó este martes que la Canasta Bá...,https://www.laizquierdadiario.com/Economia/Imp...
57,exchange,\nViernes 24 de julio,TIPO DE CAMBIO,Las restricciones para comprar el billete ofic...,El dólar blue saltó a $ 140 y marcó un nuevo r...,El dólar blue aumentó casi $ 10 en una semana....,https://www.laizquierdadiario.com/Economia/El-...


In [4]:
df_inflation = data.loc[data.topics == 'inflation']
df_inflation.shape

(34, 7)

In [5]:
df_exchange = data.loc[data.topics == 'exchange']
df_exchange.shape

(26, 7)

In [6]:
corpus = list(df_inflation.body.values) + list(df_exchange.body.values)
len(corpus)

60

In [7]:
target = list(df_inflation.topics.values) + list(df_exchange.topics.values)

In [8]:
data = pd.DataFrame([corpus, target]).transpose()
data

Unnamed: 0,0,1
0,Se trata del mismo aumento que aplicó Massalin...,inflation
1,"La petrolera Raizen, dueña de la marca Shell, ...",inflation
2,Los precios de los combustibles comercializado...,inflation
3,¿Cuántos días te dura el salario del mes? Segu...,inflation
4,El Indec informó este jueves que el Índice de ...,inflation
5,El Indec informó este martes que la Canasta Bá...,inflation
6,Este jueves la Dirección de Estadísticas e Inv...,inflation
7,La suba de precios no se detiene. Según anunci...,inflation
8,Este miércoles el Indec publicó el índice de p...,inflation
9,El Indec publicó que la canasta de pobreza fue...,inflation


### Train - Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train,test,y_train,y_test = train_test_split(data.iloc[:,0], data.iloc[:,1], train_size=0.75); 

In [11]:
train.shape, test.shape

((45,), (15,))

### Tokenization + Vectorization
#### CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import re

In [13]:
stopwords_sp = stopwords.words('spanish')

In [14]:
vectorizer = CountVectorizer(stop_words = stopwords_sp, lowercase = True, strip_accents='unicode')
vectorizer.fit(list(re.sub(r'(\d|\$|\%|\+)', '', doc) for doc in corpus))

  'stop_words.' % sorted(inconsistent))


CountVectorizer(stop_words=['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los',
                            'del', 'se', 'las', 'por', 'un', 'para', 'con',
                            'no', 'una', 'su', 'al', 'lo', 'como', 'más',
                            'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí',
                            'porque', ...],
                strip_accents='unicode')

In [15]:
X_train = vectorizer.transform(train);

In [16]:
X_train

<45x2903 sparse matrix of type '<class 'numpy.int64'>'
	with 6235 stored elements in Compressed Sparse Row format>

In [17]:
X_train_matriz = X_train.todense()

In [18]:
X_test = vectorizer.transform(test);

In [19]:
X_test_matriz = X_train.todense()

#### Tf IDf Vectorizer

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
X_train_matriz_tfidf = TfidfTransformer().fit_transform(X_train_matriz);

In [22]:
X_test_matriz_tfidf = TfidfTransformer().fit_transform(X_test_matriz);

### Multinomial Naive Bayes + CountVectorizer

In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [24]:
multi_nb = MultinomialNB()

multi_nb.fit(X_train, y_train.values)

MultinomialNB()

In [25]:
y_pred = multi_nb.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
accuracy_score(y_pred, y_test.values)

1.0

In [28]:
confusion_matrix(y_pred, y_test.values)

array([[ 5,  0],
       [ 0, 10]])

### Multinomial Naive Bayes + TfIDf

In [29]:
multi_nb_2 = MultinomialNB()

multi_nb_2.fit(X_train_matriz_tfidf, y_train.values)

MultinomialNB()

In [30]:
y_pred = multi_nb_2.predict(X_test)

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
accuracy_score(y_pred, y_test.values)

1.0

In [35]:
confusion_matrix(y_pred, y_test.values)

array([[ 5,  0],
       [ 0, 10]])

### Tokenization + Vectorization (Manual)

#### CountVectorizer with pre-built tokens

In [36]:
cv_tokens = pickle.load(open('tokens/topic_tokens_izq_new.pkl', 'rb'))

In [37]:
vectorizer.fit(cv_tokens.drop('topics', axis=1))

CountVectorizer(stop_words=['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los',
                            'del', 'se', 'las', 'por', 'un', 'para', 'con',
                            'no', 'una', 'su', 'al', 'lo', 'como', 'más',
                            'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí',
                            'porque', ...],
                strip_accents='unicode')

In [38]:
X_train_2 = vectorizer.transform(train)

In [39]:
X_train_matriz_2 = X_train_2.todense()

In [40]:
X_test_2 = vectorizer.transform(test)

In [41]:
X_test_matriz_2 = X_test_2.todense()

#### TF-IDF Vectorizer with pre-built tokens

In [42]:
X_train_matriz_tfidf_2 = TfidfTransformer().fit_transform(X_train_matriz_2);

In [43]:
X_test_matriz_tfidf_2 = TfidfTransformer().fit_transform(X_test_matriz_2);

### Multinomial Naive Bayes + CountVectorizer (updated tokens)

In [44]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [45]:
multi_nb = MultinomialNB()

multi_nb.fit(X_train_2, y_train.values)

MultinomialNB()

In [46]:
y_pred_2 = multi_nb.predict(X_test_2)

In [47]:
from sklearn.metrics import accuracy_score

In [48]:
accuracy_score(y_pred_2, y_test.values)

1.0

In [49]:
confusion_matrix(y_pred_2, y_test.values)

array([[ 5,  0],
       [ 0, 10]])

### Multinomial Naive Bayes + Tf IDf Vectorizer (updated tokens)

In [50]:
multi_nb_2 = MultinomialNB()

multi_nb_2.fit(X_train_matriz_tfidf_2, y_train.values)

MultinomialNB()

In [51]:
y_pred_2 = multi_nb_2.predict(X_test_matriz_tfidf_2)

In [52]:
from sklearn.metrics import accuracy_score

In [53]:
accuracy_score(y_pred_2, y_test.values)

1.0

In [54]:
confusion_matrix(y_pred_2, y_test.values)

array([[ 5,  0],
       [ 0, 10]])