## Naive Bayes

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/izq_econ_news.csv')

In [3]:
df_inflation = df.loc[df.flyer.str.contains('INFLA|INDEC|SUBA|PRECIO', na=False),'body']
df_inflation = df_inflation.reset_index()
df_inflation.insert(loc=0, column='topics', value='inflation')
df_inflation.drop('index', axis=1,inplace=True)

In [4]:
df_exchange = df.loc[df.flyer.str.contains('BRECHA|CAMBI|RESERV', na=False), 'body']
df_exchange = df_exchange.reset_index()
df_exchange.insert(loc=0, column='topics', value='exchange')
df_exchange.drop('index', axis=1,inplace=True)

In [5]:
corpus = list(df_inflation.body.values) + list(df_exchange.body.values)
len(corpus)

29

In [6]:
df_exchange.topics.values

array(['exchange', 'exchange', 'exchange', 'exchange', 'exchange',
       'exchange', 'exchange', 'exchange', 'exchange', 'exchange',
       'exchange', 'exchange', 'exchange'], dtype=object)

In [7]:
target = list(df_inflation.topics.values) + list(df_exchange.topics.values)

In [8]:
data = pd.DataFrame([corpus, target]).transpose()
data

Unnamed: 0,0,1
0,El Indec informó que en octubre de 2020 las ex...,inflation
1,El Estimador Mensual de la Actividad Económica...,inflation
2,Se trata del mismo aumento que aplicó Massalin...,inflation
3,"La petrolera Raizen, dueña de la marca Shell, ...",inflation
4,Los precios de los combustibles comercializado...,inflation
5,¿Cuántos días te dura el salario del mes? Segu...,inflation
6,La Junta Interna de ATE Indec calculó la canas...,inflation
7,El Indec informó este jueves que el Índice de ...,inflation
8,El Indec informó este martes que la Canasta Bá...,inflation
9,Este jueves la Dirección de Estadísticas e Inv...,inflation


#### Train - Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train,test,y_train,y_test = train_test_split(data.iloc[:,0], data.iloc[:,1], train_size=0.75); 

In [11]:
train.shape, test.shape

((21,), (8,))

#### Tokenization

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import re

In [13]:
stopwords_sp = stopwords.words('spanish')

In [14]:
vectorizer = CountVectorizer(stop_words = stopwords_sp, lowercase = True, strip_accents='unicode')
vectorizer.fit(list(re.sub(r'(\d|\$|\%|\+)', '', doc) for doc in corpus))

  'stop_words.' % sorted(inconsistent))


CountVectorizer(stop_words=['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los',
                            'del', 'se', 'las', 'por', 'un', 'para', 'con',
                            'no', 'una', 'su', 'al', 'lo', 'como', 'más',
                            'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí',
                            'porque', ...],
                strip_accents='unicode')

In [15]:
X_train = vectorizer.transform(train);

In [31]:
X_train_matriz = X_train.todense()

In [32]:
X_test = vectorizer.transform(test);

In [33]:
X_test_matriz = X_train.todense()

Vectorización con TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

In [35]:
X_train_matriz_tfidf = TfidfTransformer().fit_transform(X_train_matriz);

In [36]:
X_test_matriz_tfidf = TfidfTransformer().fit_transform(X_test_matriz);

Modelo + CountVectorizer

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [39]:
multi_nb = MultinomialNB()

multi_nb.fit(X_train, y_train.values)

MultinomialNB()

In [40]:
y_pred = multi_nb.predict(X_test)

In [41]:
from sklearn.metrics import accuracy_score

In [43]:
accuracy_score(y_pred, y_test.values)

1.0

Modelo + TfIDf

In [44]:
multi_nb_2 = MultinomialNB()

multi_nb_2.fit(X_train_matriz_tfidf, y_train.values)

MultinomialNB()

In [45]:
y_pred = multi_nb_2.predict(X_test)

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
accuracy_score(y_pred, y_test.values)

1.0