In [13]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold 

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\giova\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Carregando o dataset COVID19BR

dados = pd.read_csv('covidbr_labeled.csv')
#dados.head()
X = dados['text']
y = dados['misinformation']

In [4]:
stop_por = set(stopwords.words("portuguese"))
def limpa_texto(string):
    text = string.lower().split()
    text = " ".join(text)
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text)  
    tx   = text.replace('&amp',' ')
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text)
    text = text.split()
    text = [w for w in text if not w in stop_por]
    text = " ".join(text)
    return text

In [5]:
X = X.map(lambda x: limpa_texto(x))

In [6]:
#Pode ser resumido nisto: 
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
# just send in all your docs here
fitted_vectorizer=tfidf_vectorizer.fit(X)
tfidf_vectorizer_vectors=fitted_vectorizer.transform(X)

# Testes com dataset COVID19BR - LinearSVC

In [8]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = LinearSVC() #LogisticRegression(solver= 'liblinear')

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

Matriz de confusão:
[[374.6  22.6]
 [ 66.4 116. ]]
Acurácia média: 0.8464385682806265
Precisão média: 0.8697571054779722
Recall médio  : 0.8464385682806265
F1 médio      : 0.8524964184254941


# Testes com dataset COVID19BR - Regressão Logística

In [7]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(solver= 'liblinear')

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

Matriz de confusão:
[[384.4  12.8]
 [ 98.8  83.6]]
Acurácia média: 0.8074557798820796
Precisão média: 0.8832438894530806
Recall médio  : 0.8074557798820796
F1 médio      : 0.8267398548712226


# Testes com dataset COVID19BR - GradientBoostingClassifier

In [9]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = GradientBoostingClassifier()

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

Matriz de confusão:
[[374.4  22.8]
 [ 80.6 101.8]]
Acurácia média: 0.8216038353880055
Precisão média: 0.8597999089872339
Recall médio  : 0.8216038353880055
F1 médio      : 0.8316575314413461


# Testes com dataset COVID19BR - Árvore de decisão

In [12]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = tree.DecisionTreeClassifier()

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

Matriz de confusão:
[[336.2  61. ]
 [ 66.  116.4]]
Acurácia média: 0.7808838068012625
Precisão média: 0.7835624852971241
Recall médio  : 0.7808838068012625
F1 médio      : 0.7811326809675332


# Testes com o dataset COVID19BR - Rede Neural MLP

In [14]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = MLPClassifier()

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

Matriz de confusão:
[[361.4  35.8]
 [ 56.6 125.8]]
Acurácia média: 0.8405705437436722
Precisão média: 0.8489225302879813
Recall médio  : 0.8405705437436722
F1 médio      : 0.8427621950236865


# Início do tutorial encontrado em:
https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y1Ccl3bMJPY

Idéia inicial do KFold: # https://www.askpython.com/python/examples/k-fold-cross-validation

In [20]:
#instancia CountVectorizer() 
cv=CountVectorizer() 
# estes passos geram 
# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(X) # o X são as postagens do meu arquivo

In [22]:
word_count_vector.shape

(2898, 21262)

In [23]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [None]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])

In [27]:
# count matrix 
count_vector=cv.transform(X) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [None]:
feature_names = cv.get_feature_names() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[0] 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

In [29]:
#from sklearn.feature_extraction.text import TfidfVectorizer 
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(X)

In [None]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), 
                  index=tfidf_vectorizer.get_feature_names(), 
                  columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)

In [6]:
#Pode ser resumido nisto: 
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
# just send in all your docs here
fitted_vectorizer=tfidf_vectorizer.fit(X)
tfidf_vectorizer_vectors=fitted_vectorizer.transform(X)

# Fim do tutorial

In [None]:
#Verificar o resultado da limpeza do texto
contador = 0
for i in X:
    contador += 1
    print(i)
    if contador > 5:
        stop