# Classificação de Fake News usando Transformadores


## Configuração


### Configurando importação e variável de ambiente


In [1]:
import os
import sys

# Put the parent of the current working directory at the front of the import
# path (presumably so the `applications` package imported below resolves).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Set TOKENIZERS_PARALLELISM to "false" before any tokenizer is created
# (presumably to avoid the tokenizers fork/parallelism warning -- confirm).
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

### Importando pacotes e dados


In [2]:
# Importando pacotes

# Type hinting
from typing import List, Any
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer

from applications.transformer.models.encoder_classifier import EncoderClassifier

In [3]:
# Load the genuine articles and tag every row as real
true_data = pd.read_csv('../datasets/news_dataset/true.csv').assign(real=True)
true_columns = true_data.loc[:, ['text', 'real']]

# Load the fabricated articles and tag every row as not real
fake_data = pd.read_csv('../datasets/news_dataset/fake.csv').assign(real=False)
fake_columns = fake_data.loc[:, ['text', 'real']]


# Stack both frames and shuffle the rows (frac=1.0 samples every row once).
# The original row labels are kept, so index values can repeat across the
# two source files.
data = pd.concat((true_columns, fake_columns)).sample(frac=1.0)

In [4]:
# Class balance: number of rows per value of the `real` label
data.loc[:, 'real'].value_counts()

real
False    23481
True     21417
Name: count, dtype: int64

### Criando device


In [5]:
# Pick the compute backend: CUDA if available, otherwise Apple MPS,
# otherwise fall back to the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
print(f'Using device: {device}')

Using device: mps


### Classes


In [6]:
# Dataset class
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news text per lookup.

    Tokenization happens lazily inside ``__getitem__``; nothing is encoded
    when the dataset is constructed.

    Args:
        texts: Raw texts, one per sample.
        labels: Class labels, aligned index-by-index with ``texts``.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: Length each encoded sample is padded or truncated to.
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        tokenizer_name: str,
        max_length: int = 512,
    ):
        super().__init__()
        self.tokenizer_name = tokenizer_name
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
        self.texts, self.labels = texts, labels

    def __len__(self) -> int:
        """Return the number of samples (taken from the label list)."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return the tokenizer outputs for one text plus its label.

        Every tensor produced by the tokenizer has its leading dimension
        squeezed away, and the label is added under the ``'labels'`` key
        as a ``torch.long`` scalar.
        """
        encoded = self.tokenizer(
            self.texts[index],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        sample = {}
        for name, tensor in encoded.items():
            # The tokenizer was called on a single text; drop dimension 0
            sample[name] = tensor.squeeze(0)
        sample['labels'] = torch.tensor(self.labels[index], dtype=torch.long)
        return sample

### Funções


In [7]:
# Tokenize a list of sentences
def tokenize_sequence(
    tokenizer: PreTrainedTokenizerBase, sequence: List[str], max_length: int
):
    """Encode ``sequence`` with ``tokenizer`` at a fixed length.

    Args:
        tokenizer: Callable tokenizer that receives the texts and the
            encoding options.
        sequence: Texts to encode.
        max_length: Length each text is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns when asked for ``'pt'`` tensors.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(sequence, **encode_options)

In [8]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    """Return the percentage of predictions that equal their targets.

    Args:
        y_true: Tensor of ground-truth class indices.
        y_pred: Tensor of predicted class indices, compared element-wise
            with ``y_true``.

    Returns:
        Accuracy as a float between 0 and 100, or ``0.0`` when ``y_pred``
        has no elements.
    """
    # Count every element rather than len(), which only sees dimension 0
    # and would report more than 100% for multi-dimensional inputs.
    total = y_pred.numel()
    if total == 0:
        # Nothing to score; avoid dividing by zero
        return 0.0
    correct = torch.eq(y_true, y_pred).sum().item()
    return (correct / total) * 100

In [9]:
# Training step (one pass over the training dataloader)
def train_step(
    estimator: nn.Module,
    loss_fn: nn.Module,
    optimizer: torch.optim.Optimizer,
    dataloader: DataLoader,
    device: str,
) -> None:
    """Train ``estimator`` for one pass over ``dataloader`` and print metrics.

    Each batch is expected to be a mapping with ``'input_ids'`` and
    ``'labels'`` entries; only ``'input_ids'`` is fed to the model. The
    printed loss and accuracy are the means of the per-batch values.

    Args:
        estimator: Model to optimize; moved to ``device`` before training.
        loss_fn: Callable taking ``(predictions, targets)`` and returning
            the loss tensor.
        optimizer: Optimizer that updates the parameters of ``estimator``.
        dataloader: Source of training batches.
        device: Device that the model and every batch are moved to.
    """
    train_loss, train_acc = 0.0, 0.0

    estimator.to(device)
    # Training mode is set once; nothing inside the loop switches it off
    estimator.train()

    for batch in tqdm(dataloader):
        X = batch['input_ids'].to(device)
        y = batch['labels'].to(device)

        y_pred = estimator(X)

        loss = loss_fn(y_pred, y)
        # Accumulate a plain float: summing the loss tensor itself would keep
        # a reference to every batch's autograd history for the whole epoch.
        train_loss += loss.item()
        train_acc += accuracy_fn(y, y_pred.argmax(dim=1))

        # Clear gradients left over from the previous batch, then update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss /= len(dataloader)
    train_acc /= len(dataloader)

    print(f'Train loss: {train_loss:.3f} | Train accuracy: {train_acc:.2f}%')

In [10]:
# Evaluation (test) step
def test_step(
    estimator: nn.Module,
    loss_fn: nn.Module,
    dataloader: DataLoader,
    device: str,
) -> None:
    """Evaluate ``estimator`` on ``dataloader`` and print the mean metrics.

    Runs under ``torch.inference_mode`` with the model in eval mode. Each
    batch is read through its ``'input_ids'`` and ``'labels'`` entries, and
    the printed loss and accuracy are the means of the per-batch values.

    Args:
        estimator: Model to evaluate; moved to ``device`` first.
        loss_fn: Callable taking ``(predictions, targets)`` and returning
            the loss.
        dataloader: Source of evaluation batches.
        device: Device that the model and every batch are moved to.
    """
    estimator.to(device)
    estimator.eval()

    loss_total, acc_total = 0, 0

    with torch.inference_mode():
        for batch in tqdm(dataloader):
            inputs = batch['input_ids'].to(device)
            targets = batch['labels'].to(device)

            logits = estimator(inputs)

            loss_total = loss_total + loss_fn(logits, targets)
            acc_total = acc_total + accuracy_fn(targets, logits.argmax(dim=1))

        # Average over the number of batches, still inside inference mode
        mean_loss = loss_total / len(dataloader)
        mean_acc = acc_total / len(dataloader)

    print(f'Test loss:  {mean_loss:.3f} | Test accuracy:  {mean_acc:.2f}%\n')

## EDA


In [11]:
# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,text,real
7478,March 15 saw yet another Super Tuesday battle ...,False
9398,WASHINGTON (Reuters) - Top U.S. State Departme...,True
270,WASHINGTON (Reuters) - The Senate on Thursday ...,True
13677,HARARE (Reuters) - Robert Mugabe s 37-year rul...,True
5156,A veteran has launched a GoFundMe fundraiser t...,False


In [12]:
# Handle missing values: drop, in place, every row with at least one null
data.dropna(axis=0, how='any', inplace=True)

In [13]:
# Average text size, measured as the number of space characters per text
np.mean([text.count(' ') for text in data['text'].to_list()])

np.float64(414.7604124905341)

## Criando dataset


In [14]:
# Split the data into train and test partitions
X = data['text']
y = data['real']

# stratify=y keeps the real/fake ratio the same in both partitions;
# random_state makes the split repeatable for a given row order of `data`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

# Convert the pandas Series to plain lists
X_train, X_test = X_train.tolist(), X_test.tolist()
y_train, y_test = y_train.tolist(), y_test.tolist()

In [15]:
# Hyperparameters
tokenizer_name: str = 'bert-base-uncased'

# Length every tokenized text is padded or truncated to
max_length: int = 200

# Two target classes: real and fake
num_classes: int = 2

In [16]:
# Build the datasets
train_dataset = NewsDataset(
    texts=X_train, labels=y_train, tokenizer_name=tokenizer_name, max_length=max_length
)
test_dataset = NewsDataset(
    texts=X_test, labels=y_test, tokenizer_name=tokenizer_name, max_length=max_length
)

# Build the dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Evaluation gains nothing from shuffling. A fixed order keeps the batch
# composition, and therefore the per-batch averaged metrics, repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Criando modelo


## Treinando modelo


In [17]:
# Instantiate the custom model
model = EncoderClassifier(
    vocab_size=train_dataset.tokenizer.vocab_size,
    n_layers=2,
    n_classes=num_classes,
    embed_dim=128,
    n_heads=4,
    # NOTE(review): confirm whether ff_hid_dim is an absolute width or a
    # multiplier of embed_dim; 4 is very small if it is an absolute width.
    ff_hid_dim=4,
    max_length=max_length,
    # Use the id of the padding *token*. `pad_token_type_id` is the segment
    # (token-type) id given to padding positions; it only matched for
    # bert-base-uncased because both values happen to be 0.
    pad_idx=train_dataset.tokenizer.pad_token_id,
    dropout=0.1,
    device=device,
).to(device)

In [18]:
# Loss function
loss_fn = nn.CrossEntropyLoss()

# Optimizer
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-2,
    weight_decay=0.01,
)

In [19]:
# Seed the torch RNG.
# NOTE: the model is instantiated in an earlier cell, so this seed does not
# cover its weight initialisation.
torch.manual_seed(42)

# Number of passes over the training data
epochs = 2

# Main loop: one training pass followed by one evaluation pass per epoch
for epoch in tqdm(range(epochs)):
    # Epoch banner: title line followed by a 90-character rule
    print('\n'.join((f'Epoch: {epoch + 1}', '-' * 90)))

    # Train step
    train_step(
        estimator=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
        dataloader=train_dataloader,
        device=device,
    )

    # Test step
    test_step(
        estimator=model,
        loss_fn=loss_fn,
        dataloader=test_dataloader,
        device=device,
    )

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 1
------------------------------------------------------------------------------------------


  0%|          | 0/1263 [00:00<?, ?it/s]

Train loss: 0.086 | Train accuracy: 96.63%


  0%|          | 0/141 [00:00<?, ?it/s]

Test loss:  0.019 | Test accuracy:  99.67%

Epoch: 2
------------------------------------------------------------------------------------------


  0%|          | 0/1263 [00:00<?, ?it/s]

Train loss: 0.034 | Train accuracy: 99.31%


  0%|          | 0/141 [00:00<?, ?it/s]

Test loss:  0.045 | Test accuracy:  98.98%

