<a href="https://colab.research.google.com/github/fdsgusmao/projeto-final/blob/main/projeto-final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Download do corpus

In [4]:
import pandas as pd
import numpy as np
import glob

In [None]:
!apt install subversion

In [None]:
!svn checkout https://github.com/roneysco/Fake.br-Corpus/

## Estruturando o dataset

In [7]:
# Pegando todos os nomes dos arquivos txt
fake_folder_path = '/content/Fake.br-Corpus/trunk/full_texts/fake'
true_folder_path = '/content/Fake.br-Corpus/trunk/full_texts/true'
true_file_list = glob.glob(true_folder_path + "/*.txt")
fake_file_list = glob.glob(fake_folder_path + "/*.txt")

In [8]:
true_news_list = []
for i in range(0,len(true_file_list)):
  # Abrindo todos os txt's
  with open(true_file_list[i], 'r') as file:
      # Tratando o txt e colocando numa lista
      data = file.read().replace('\n', '')
      true_news_list.append(data)

fake_news_list = []
for i in range(0,len(fake_file_list)):
  # Abrindo todos os txt's
  with open(fake_file_list[i], 'r') as file:
      # Tratando o txt e colocando numa lista
      data = file.read().replace('\n', '')
      fake_news_list.append(data)

In [None]:
# Transformando as strings de notícias em um DataFrame do pandas
df1 = pd.DataFrame()
df1['news'] = true_news_list
df1['fake'] = 0

df2 = pd.DataFrame()
df2['news'] = fake_news_list
df2['fake'] = 1

# Concatenando os DataFrames de fake e real news
main_df = pd.DataFrame()
main_df = main_df.append(df1, ignore_index = True)
main_df = main_df.append(df2, ignore_index = True)

## Preprocessamento

In [None]:
# Importando o spacy e carregando o pipeline para fazer o preprocessamento
import spacy
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!python -m spacy download pt_core_news_sm
nlp = spacy.load('pt_core_news_sm', exclude=['tok2vec', 'parser', 'lemmatizer', 'attribute_ruler', 'ner'])

In [13]:
# Definindo a função de preprocessamento
def preprocessed(text):
  doc = nlp(text)
  preprocessedText = [t for t in doc if t.is_punct == 0 if t.is_stop == 0]
  preprocessedText = ' '.join(str(e) for e in preprocessedText)
  return preprocessedText

In [14]:
# Aplicando a função de preprocessamento em todas as notícias do DataFrame
main_df['preprocessed_news'] = main_df['news'].apply(preprocessed)

## Setup de treinamento

In [None]:
# Instalando o pacote transform
!pip3 install transformers

In [16]:
# Imports que serão utilizados no treinamento
import torch
from torch import optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [17]:
# Separação do dataset em treino, teste e validação
X_train,X_test,y_train,y_test=train_test_split(main_df.preprocessed_news,main_df.fake,test_size=0.2,random_state=42)
X_train,X_validate,y_train,y_validate=train_test_split(X_train,y_train,test_size=0.1,random_state=42)

trainset = pd.concat([X_train,y_train],axis=1).to_dict('records')
testset = pd.concat([X_test,y_test],axis=1).to_dict('records')
validateset = pd.concat([X_validate,y_validate],axis=1).to_dict('records')

from torch.utils.data import DataLoader
batch_size = 16

traindata = DataLoader(trainset, batch_size=batch_size, shuffle=True)
testdata = DataLoader(testset, batch_size=batch_size, shuffle=True)
validatedata = DataLoader(validateset, batch_size=batch_size, shuffle=True)

In [18]:
# Configurações de treino
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
nclasses = 2
nepochs = 3
batch_status = 32
learning_rate = 5e-5

max_length = 180

In [19]:
# Definindo a função de avaliação
def evaluate(model, testdata):
  model.eval()
  y_real, y_pred = [], []
  for batch_idx, inp in enumerate(testdata):
    texts, labels = inp['preprocessed_news'], inp['fake']
    
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
    output = model(**inputs)
                
    pred_labels = torch.argmax(output.logits, 1)
    
    y_real.extend(labels.tolist())
    y_pred.extend(pred_labels.tolist())
    
    if (batch_idx+1) % batch_status == 0:
      print('EVALUATION Progress:', round(batch_idx / len(testdata), 2), batch_idx)
  
  print(classification_report(y_real, y_pred, labels=[0, 1], target_names=['True', 'Fake']))
  f1 = f1_score(y_real, y_pred, average='weighted')
  acc = accuracy_score(y_real, y_pred)
  return f1, acc

In [20]:
# Definindo função de treinamento
def train(model, tokenizer):
  for epoch in range(nepochs):
    model.train()
    losses = []
    for batch_idx, inp in enumerate(traindata):
      texts, labels = inp['preprocessed_news'], inp['fake']

      # Classificação
      inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(device)
      output = model(**inputs, labels=labels.to(device))

      # Cálculo do Loss
      loss = output.loss
      losses.append(float(loss))

      # Backpropagation
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      # Display de progresso
      if (batch_idx+1) % batch_status == 0:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tTotal Loss: {:.6f}'.format(epoch, \
          batch_idx+1, len(traindata), 100. * batch_idx / len(traindata), 
          float(loss), round(sum(losses) / len(losses), 5)))
    
    f1, acc = evaluate(model, testdata)
    print('F1: ', f1, 'Accuracy: ', acc)

# Treinando o BERT

In [21]:
# Baixando o modelo
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=nclasses).to(device)

# Configurando o optimizador
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [22]:
train(model, tokenizer)

EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True       0.76      0.98      0.85       718
        Fake       0.97      0.69      0.80       722

    accuracy                           0.83      1440
   macro avg       0.86      0.83      0.83      1440
weighted avg       0.86      0.83      0.83      1440

F1:  0.8284350992265032 Accuracy:  0.8319444444444445
EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True       0.86      0.89      0.87       718
        Fake       0.89      0.85      0.87       722

    accuracy                           0.87      1440
   macro avg       0.87      0.87      0.87      1440
weighted avg       0.87      0.87      0.87      1440

F1:  0.8700934714065 Accuracy:  0.8701388888888889
EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True   

In [23]:
# Avaliando o modelo nas amostras de validação
evaluate(model, validatedata)

EVALUATION Progress: 0.86 31
              precision    recall  f1-score   support

        True       0.85      0.95      0.90       303
        Fake       0.93      0.82      0.87       273

    accuracy                           0.89       576
   macro avg       0.89      0.88      0.88       576
weighted avg       0.89      0.89      0.88       576



(0.88465576171875, 0.8854166666666666)

# Treinando o DistilBERT

In [27]:
# Baixando o modelo
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=nclasses).to(device)

# Configurando o optimizador
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

In [28]:
train(model, tokenizer)

EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True       0.89      0.88      0.88       718
        Fake       0.88      0.89      0.89       722

    accuracy                           0.88      1440
   macro avg       0.88      0.88      0.88      1440
weighted avg       0.88      0.88      0.88      1440

F1:  0.8847204431892616 Accuracy:  0.8847222222222222
EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True       0.91      0.90      0.91       718
        Fake       0.90      0.91      0.91       722

    accuracy                           0.91      1440
   macro avg       0.91      0.91      0.91      1440
weighted avg       0.91      0.91      0.91      1440

F1:  0.9076354589985667 Accuracy:  0.9076388888888889
EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True

In [29]:
# Avaliando o modelo nas amostras de validação
evaluate(model, validatedata)

EVALUATION Progress: 0.86 31
              precision    recall  f1-score   support

        True       0.92      0.91      0.91       303
        Fake       0.90      0.91      0.91       273

    accuracy                           0.91       576
   macro avg       0.91      0.91      0.91       576
weighted avg       0.91      0.91      0.91       576



(0.9097374961731199, 0.9097222222222222)

# Treinando o BERTimbau

In [24]:
# Baixando o modelo
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=nclasses).to(device)

# Configurando o optimizador
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

Downloading (…)okenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [25]:
train(model, tokenizer)

EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True       0.96      0.97      0.97       718
        Fake       0.97      0.96      0.97       722

    accuracy                           0.97      1440
   macro avg       0.97      0.97      0.97      1440
weighted avg       0.97      0.97      0.97      1440

F1:  0.9666657021437486 Accuracy:  0.9666666666666667
EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True       0.99      0.95      0.97       718
        Fake       0.95      0.99      0.97       722

    accuracy                           0.97      1440
   macro avg       0.97      0.97      0.97      1440
weighted avg       0.97      0.97      0.97      1440

F1:  0.9687336389408289 Accuracy:  0.96875
EVALUATION Progress: 0.34 31
EVALUATION Progress: 0.7 63
              precision    recall  f1-score   support

        True       0.98

In [26]:
# Avaliando o modelo nas amostras de validação
evaluate(model,validatedata)

EVALUATION Progress: 0.86 31
              precision    recall  f1-score   support

        True       0.99      0.95      0.97       303
        Fake       0.94      0.99      0.97       273

    accuracy                           0.97       576
   macro avg       0.97      0.97      0.97       576
weighted avg       0.97      0.97      0.97       576



(0.9670358804920455, 0.9670138888888888)