In [7]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold 

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\giova\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Carregando o dataset FAKEBR

dados = pd.read_csv('dataset_shuffle.csv')
#dados.head()
X = dados['text']
y = dados['fake_news']

In [4]:
stop_por = set(stopwords.words("portuguese"))
def limpa_texto(string):
    text = string.lower().split()
    text = " ".join(text)
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text)  
    tx   = text.replace('&amp',' ')
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text)
    text = text.split()
    text = [w for w in text if not w in stop_por]
    text = " ".join(text)
    return text

In [5]:
X = X.map(lambda x: limpa_texto(x))

In [6]:
#Pode ser resumido nisto: 
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
# just send in all your docs here
fitted_vectorizer=tfidf_vectorizer.fit(X)
tfidf_vectorizer_vectors=fitted_vectorizer.transform(X)

# Testes com dataset COVID19BR - LinearSVC

In [12]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = LinearSVC() #LogisticRegression(solver= 'liblinear')

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

Matriz de confusão:
[[673.4  46.6]
 [ 25.6 694.4]]
Acurácia média: 0.9498611111111112
Precisão média: 0.9482634299317942
Recall médio  : 0.9498611111111112
F1 médio      : 0.9449060352670392


  _warn_prf(average, modifier, msg_start, len(result))


# Testes com dataset COVID19BR - Regressão Logística

In [11]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(solver= 'liblinear')
 
#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

Matriz de confusão:
[[646.   74. ]
 [ 34.2 685.8]]
Acurácia média: 0.9248611111111111
Precisão média: 0.9238762276514183
Recall médio  : 0.9248611111111111
F1 médio      : 0.9157815642912717


  _warn_prf(average, modifier, msg_start, len(result))


# Testes com dataset COVID19BR - GradientBoostingClassifier

In [8]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = GradientBoostingClassifier()

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

  _warn_prf(average, modifier, msg_start, len(result))


Matriz de confusão:
[[671.6  48.4]
 [ 34.  686. ]]
Acurácia média: 0.9427777777777777
Precisão média: 0.9431580518678654
Recall médio  : 0.9427777777777777
F1 médio      : 0.9381010970831962


# Testes com dataset COVID19BR - Árvore de decisão

In [9]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = tree.DecisionTreeClassifier()

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     

  _warn_prf(average, modifier, msg_start, len(result))


Matriz de confusão:
[[635.4  84.6]
 [ 78.4 641.6]]
Acurácia média: 0.8868055555555555
Precisão média: 0.8758773319357054
Recall médio  : 0.8868055555555555
F1 médio      : 0.874887470619823


# Testes com o dataset COVID19BR - Rede Neural MLP

In [None]:
X_ = tfidf_vectorizer_vectors

#Implementando a validação cruzada
 
k = 5
kf = KFold(n_splits=k, random_state=None)
model = MLPClassifier()

#lista das matrizes de confusão de cada iteração 
lista_conf_matrix = []
lista_acc_score = []
lista_precision_score = []
lista_recall_score = []
lista_f1_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X_[train_index],X_[test_index]
    y_train , y_test = y[train_index],y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    #print(confusion_matrix(y_test, pred_values))
    conf_matrix = confusion_matrix(y_test, pred_values)
    lista_conf_matrix.append(conf_matrix)
    
    acc = accuracy_score(pred_values, y_test)    
    lista_acc_score.append(acc)

    prec = precision_score(pred_values, y_test, average = 'weighted')
    lista_precision_score.append(prec)
    
    recall = recall_score(pred_values, y_test, average = 'weighted')
    lista_recall_score.append(recall)
    
    f1 = f1_score(pred_values, y_test, average = 'weighted')     
    lista_f1_score.append(f1)
    
media_conf_matrix = np.mean(lista_conf_matrix, axis=0)
media_acc_score = sum(lista_acc_score)/k
media_prec_score = sum(lista_precision_score)/k
media_recall_score = sum(lista_recall_score)/k
media_f1_score = sum(lista_f1_score)/k

print('Matriz de confusão:')
print(media_conf_matrix)
#print('Acurácia de cada iteração : {}'.format(acc_score))
print('Acurácia média: {}'.format(media_acc_score))
print('Precisão média: {}'.format(media_prec_score))
print('Recall médio  : {}'.format(media_recall_score))
print('F1 médio      : {}'.format(media_f1_score))     