In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

df_dataset = pd.read_csv('files/clean_text.csv')
df_dataset = df_dataset.drop(['Unnamed: 0'], axis=1)
df_dataset.head()

Unnamed: 0,tweet_content,keyword,user_location,classified
0,mds nunca liberar,['Nubank'],Goiânia,0.0
1,viciar bateria celular,['Coca-cola'],Balneário Camboriú,1.0
2,toda vez vejo lá conferir saldo certo juro tra...,['Nubank'],,0.0
3,pedir serasi,['Nubank'],"Rio de Janeiro, Brasil",1.0
4,kayofeer acontecer mãe quase ano nada ainda,['Nubank'],"Macaé, Brasil",1.0


In [2]:
# Bag of words
count_vectorizer = CountVectorizer(binary=True)

In [3]:
X = df_dataset['tweet_content']

In [4]:
y = df_dataset['classified']

## Separando em treino e teste

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train = count_vectorizer.fit_transform(X_train.values.astype('U'))
X_test = count_vectorizer.transform(X_test.values.astype('U'))

In [6]:
from sklearn.naive_bayes import MultinomialNB

#Criando o Modelo Naive Bayes 
naive_bayes = MultinomialNB()

#.......Treinando o Modelo.......
naive_bayes.fit(X_train, y_train)

#Fazendo as previsões
naive_bayes_pred = naive_bayes.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix

#Calculando a acurácia
acc = accuracy_score(naive_bayes_pred, y_test)

#Matriz de confusão 
cm = confusion_matrix(naive_bayes_pred, y_test)

print("Acurácia do modelo", acc*100)
print("\nMatriz de confusão: \n", cm)

Acurácia do modelo 62.59541984732825

Matriz de confusão: 
 [[25 18]
 [31 57]]


In [8]:
nova_frase = ["ódio da nubank quando tem notificação e é propaganda ao invés de ser dinheiro caindo na minha conta"] 
teste = count_vectorizer.transform(nova_frase)
pred = naive_bayes.predict(teste)
print(pred)

[0.]


In [9]:
nova_frase2 = ["Chegou o meu cartão do @nubank. Tô amando a embalagem dele Cara sorridente com os olhos em forma de coração"] 
teste2 = count_vectorizer.transform(nova_frase2)
pred2 = naive_bayes.predict(teste2)
print(pred2)

[1.]


## Separando datasets por Keyword

In [10]:
df_nubank = df_dataset[['Nubank' in x for x in df_dataset['keyword']]]

In [11]:
df_nike = df_dataset[['Nike' in x for x in df_dataset['keyword']]]

In [12]:
df_shein = df_dataset[['SHEIN' in x for x in df_dataset['keyword']]]

In [13]:
nubank_count_vectorizer = CountVectorizer(binary=True)

X_nubank = df_nubank['tweet_content']
y_nubank = df_nubank['classified']

X_train_nubank, X_test_nubank, y_train_nubank, y_test_nubank = train_test_split(X_nubank, y_nubank, test_size=0.3, 
                                                                                random_state=42, stratify=y_nubank)
X_train_nubank = nubank_count_vectorizer.fit_transform(X_train_nubank)
X_test_nubank = nubank_count_vectorizer.transform(X_test_nubank.values.astype('U'))

In [14]:
nike_count_vectorizer = CountVectorizer(binary=True)

X_nike = df_nike['tweet_content']
y_nike = df_nike['classified']

X_train_nike, X_test_nike, y_train_nike, y_test_nike = train_test_split(X_nike, y_nike, test_size=0.3, 
                                                                                random_state=42, stratify=y_nike)
X_train_nike = nike_count_vectorizer.fit_transform(X_train_nike.values.astype('U'))
X_test_nike = nike_count_vectorizer.transform(X_test_nike.values.astype('U'))

In [15]:
shein_count_vectorizer = CountVectorizer(binary=True)

X_shein = df_shein['tweet_content']
y_shein = df_shein['classified']

X_train_shein, X_test_shein, y_train_shein, y_test_shein = train_test_split(X_shein, y_shein, test_size=0.3, 
                                                                                random_state=42, stratify=y_shein)
X_train_shein = shein_count_vectorizer.fit_transform(X_train_shein.values.astype('U'))
X_test_shein = shein_count_vectorizer.transform(X_test_shein.values.astype('U'))

## Aplicando MultinomialNB - Nubank

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

#Criando o Modelo Naive Bayes 
naive_bayes_nubank = MultinomialNB()
#.......Treinando o Modelo.......
naive_bayes_nubank.fit(X_train_nubank, y_train_nubank)
#Fazendo as previsões
naive_bayes_pred_nubank = naive_bayes_nubank.predict(X_test_nubank)

f_medida_nubank = f1_score(y_test_nubank, naive_bayes_pred_nubank, average='micro')
cm_nubank = confusion_matrix(naive_bayes_pred_nubank, y_test_nubank)

print("F-medida:", f_medida_nubank*100)
print("\nMatriz de confusão: \n", cm_nubank)

F-medida: 58.2089552238806

Matriz de confusão: 
 [[28 19]
 [ 9 11]]


## Multinomial NB - Nike

In [17]:
#Criando o Modelo Naive Bayes 
naive_bayes_nike = MultinomialNB()
#.......Treinando o Modelo.......
naive_bayes_nike.fit(X_train_nike, y_train_nike)
#Fazendo as previsões
naive_bayes_pred_nike = naive_bayes_nike.predict(X_test_nike)

f_medida_nike = f1_score(y_test_nike, naive_bayes_pred_nike, average='micro')
cm_nike = confusion_matrix(naive_bayes_pred_nike, y_test_nike)

print("F-medida:", f_medida_nike*100)
print("\nMatriz de confusão: \n", cm_nike)

F-medida: 59.25925925925925

Matriz de confusão: 
 [[11  9]
 [13 21]]


## MultinomialNB - SHEIN

In [18]:
#Criando o Modelo Naive Bayes 
naive_bayes_shein = MultinomialNB()
#.......Treinando o Modelo.......
naive_bayes_shein.fit(X_train_shein, y_train_shein)
#Fazendo as previsões
naive_bayes_pred_shein = naive_bayes_shein.predict(X_test_shein)

f_medida_shein = f1_score(y_test_shein, naive_bayes_pred_shein, average='micro')
cm_shein = confusion_matrix(naive_bayes_pred_shein, y_test_shein)

print("F-medida:", f_medida_shein*100)
print("\nMatriz de confusão: \n", cm_shein)

F-medida: 65.21739130434783

Matriz de confusão: 
 [[ 5  3]
 [13 25]]


## LinearSVC - Nubank

In [19]:
from sklearn import svm

SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(X_train_nubank, y_train_nubank)
predictions_SVM_nubank = SVM.predict(X_test_nubank)

f_medida_nubank_svm = f1_score(y_test_nubank, predictions_SVM_nubank, average='micro')
cm_nubank_svm = confusion_matrix(predictions_SVM_nubank, y_test_nubank)

print("F-medida:", f_medida_nubank_svm*100)
print("\nMatriz de confusão: \n", cm_nubank_svm)

F-medida: 52.23880597014925

Matriz de confusão: 
 [[25 20]
 [12 10]]


## LinearSVC - Nike

In [20]:
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(X_train_nike, y_train_nike)
predictions_SVM_nike = SVM.predict(X_test_nike)

f_medida_nike_svm = f1_score(y_test_nike, predictions_SVM_nike, average='micro')
cm_nike_svm = confusion_matrix(predictions_SVM_nike, y_test_nike)

print("F-medida:", f_medida_nike_svm*100)
print("\nMatriz de confusão: \n", cm_nike_svm)

F-medida: 55.55555555555556

Matriz de confusão: 
 [[ 7  7]
 [17 23]]


## LinearSVC - SHEIN

In [21]:
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(X_train_shein, y_train_shein)
predictions_SVM_shein = SVM.predict(X_test_shein)

f_medida_shein_svm = f1_score(y_test_shein, predictions_SVM_shein, average='micro')
cm_shein_svm = confusion_matrix(predictions_SVM_shein, y_test_shein)

print("F-medida:", f_medida_shein_svm*100)
print("\nMatriz de confusão: \n", cm_shein_svm)

F-medida: 60.86956521739131

Matriz de confusão: 
 [[ 2  2]
 [16 26]]


## Random Forest - Nubank

In [22]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train_nubank, y_train_nubank)
predictions_rfc_nubank = rfc.predict(X_test_nubank)

f_medida_nubank_rfc = f1_score(y_test_nubank, predictions_rfc_nubank, average='micro')
cm_nubank_rfc = confusion_matrix(predictions_rfc_nubank, y_test_nubank)

print("F-medida:", f_medida_nubank_rfc*100)
print("\nMatriz de confusão: \n", cm_nubank_rfc)

F-medida: 59.70149253731343

Matriz de confusão: 
 [[34 24]
 [ 3  6]]


## Random Forest - Nike

In [23]:
rfc.fit(X_train_nike, y_train_nike)
predictions_rfc_nike = rfc.predict(X_test_nike)

f_medida_nike_rfc = f1_score(y_test_nike, predictions_rfc_nike, average='micro')
cm_nike_rfc = confusion_matrix(predictions_rfc_nike, y_test_nike)

print("F-medida:", f_medida_nike_rfc*100)
print("\nMatriz de confusão: \n", cm_nike_rfc)

F-medida: 53.70370370370371

Matriz de confusão: 
 [[ 2  3]
 [22 27]]


## Random Forest - SHEIN

In [24]:
rfc.fit(X_train_shein, y_train_shein)
predictions_rfc_shein = rfc.predict(X_test_shein)

f_medida_shein_rfc = f1_score(y_test_shein, predictions_rfc_shein, average='micro')
cm_shein_rfc = confusion_matrix(predictions_rfc_shein, y_test_shein)

print("F-medida:", f_medida_shein_rfc*100)
print("\nMatriz de confusão: \n", cm_shein_rfc)

F-medida: 60.86956521739131

Matriz de confusão: 
 [[ 0  0]
 [18 28]]


## KNN - Nubank

In [25]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train_nubank, y_train_nubank)

predictions_knn_nubank = knn.predict(X_test_nubank)

f_medida_nubank_knn = f1_score(y_test_nubank, predictions_knn_nubank, average='micro')
cm_nubank_knn = confusion_matrix(predictions_knn_nubank, y_test_nubank)

print("F-medida:", f_medida_nubank_knn*100)
print("\nMatriz de confusão: \n", cm_nubank_knn)

F-medida: 67.16417910447761

Matriz de confusão: 
 [[36 21]
 [ 1  9]]


## KNN - Nike

In [26]:
knn.fit(X_train_nike, y_train_nike)

predictions_knn_nike = knn.predict(X_test_nike)

f_medida_nike_knn = f1_score(y_test_nike, predictions_knn_nike, average='micro')
cm_nike_knn = confusion_matrix(predictions_knn_nike, y_test_nike)

print("F-medida:", f_medida_nike_knn*100)
print("\nMatriz de confusão: \n", cm_nike_knn)

F-medida: 59.25925925925925

Matriz de confusão: 
 [[ 5  3]
 [19 27]]


## KNN - SHEIN

In [27]:
knn.fit(X_train_shein, y_train_shein)

predictions_knn_shein = knn.predict(X_test_shein)

f_medida_shein_knn = f1_score(y_test_shein, predictions_knn_shein, average='micro')
cm_shein_knn = confusion_matrix(predictions_knn_shein, y_test_shein)

print("F-medida:", f_medida_shein_knn*100)
print("\nMatriz de confusão: \n", cm_shein_knn)

F-medida: 54.347826086956516

Matriz de confusão: 
 [[11 14]
 [ 7 14]]


# Aplicando os modelos no dataset inteiro

## Naive Bayes

In [28]:
#Criando o Modelo Naive Bayes 
naive_bayes = MultinomialNB()
#.......Treinando o Modelo.......
naive_bayes.fit(X_train, y_train)
#Fazendo as previsões
naive_bayes_pred = naive_bayes.predict(X_test)

f_medida = f1_score(y_test, naive_bayes_pred, average='micro')
cm = confusion_matrix(naive_bayes_pred, y_test)

print("F-medida:", f_medida*100)
print("\nMatriz de confusão: \n", cm)

F-medida: 62.59541984732825

Matriz de confusão: 
 [[25 18]
 [31 57]]


## LinearSVC

In [29]:
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(X_train, y_train)
predictions_SVM = SVM.predict(X_test)

f_medida_svm = f1_score(y_test, predictions_SVM, average='micro')
cm_svm = confusion_matrix(predictions_SVM, y_test)

print("F-medida:", f_medida_svm*100)
print("\nMatriz de confusão: \n", cm_svm)

F-medida: 57.25190839694656

Matriz de confusão: 
 [[23 23]
 [33 52]]


## RandomForest

In [30]:
rfc.fit(X_train, y_train)
predictions_rfc = rfc.predict(X_test)

f_medida_rfc = f1_score(y_test, predictions_rfc, average='micro')
cm_rfc = confusion_matrix(predictions_rfc, y_test)

print("F-medida:", f_medida_rfc*100)
print("\nMatriz de confusão: \n", cm_rfc)

F-medida: 61.832061068702295

Matriz de confusão: 
 [[17 11]
 [39 64]]


## KNN

In [38]:
"""
knn.fit(X_train, y_train)

predictions_knn = knn.predict(X_test)

f_medida_knn = f1_score(y_test, predictions_knn, average='micro')
cm_knn = confusion_matrix(predictions_knn, y_test)

print("F-medida:", f_medida_knn*100)
print("\nMatriz de confusão: \n", cm_knn)
"""

# Teste com pipeline e hiperparâmetros
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train = count_vectorizer.fit_transform(X_train.values.astype('U'))
X_test = count_vectorizer.transform(X_test.values.astype('U'))

pipeline = Pipeline([('clf', KNeighborsClassifier())])

parameters = {'clf__n_neighbors': [1,2,3,4,5], 'clf__weights' : ['uniform','distance']} # quais parâmetros e quais valores serão testados
clf = GridSearchCV(pipeline, parameters, cv=3) # clf vai armazenar qual foi a melhor configuração
clf.fit(X_train, y_train)

print(clf.best_params_)

scores = cross_val_score(clf.best_estimator_, X_test, y_test, cv=5)
print('Resultados:', scores)
print('Acurácia - %.2f +- %.2f' % (scores.mean() * 100, scores.std() * 100))

{'clf__n_neighbors': 1, 'clf__weights': 'uniform'}
Resultados: [0.37037037 0.61538462 0.61538462 0.61538462 0.73076923]
Acurácia - 58.95 +- 11.83


# Hiperparâmetros

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42, stratify=y)

In [33]:
X_train.shape

(65,)