# Pacotes

Instalação de pacotes necessários

In [65]:
# Optional one-off environment setup: uncomment the lines below and run the
# cell once if the packages need to be installed or upgraded.
# %pip install --upgrade accelerate
# NOTE(review): `torch` is upgraded on the last line of this cell; `PyTorch` is
# presumably not the intended pip package name -- confirm before uncommenting.
# %pip install --upgrade PyTorch
# %pip install --upgrade transformers
# %pip install --upgrade torch

In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_20newsgroups
import torch.nn.functional as F
from sklearn.metrics import balanced_accuracy_score , precision_score, recall_score, f1_score

In [68]:
# Pick the compute device: a CUDA GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Report which kind of device was selected.
print('GPU' if device.type == "cuda" else "CPU")

CPU


# Base

Importando bases para fine-tuning

In [70]:
# Select three categories from the 20 Newsgroups dataset.
categories = [
    'talk.politics.guns',
    'soc.religion.christian',
    'comp.sys.ibm.pc.hardware',
]
# Fetch all posts of those categories with a seeded shuffle; headers, footers
# and quoted text are removed from each post.
data = fetch_20newsgroups(
    categories=categories,
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
# Put the texts and their integer targets side by side in a DataFrame.
df = pd.DataFrame(dict(text=data.data, label=data.target))
# Preview the first three rows.
display(df.head(3))

Unnamed: 0,text,label
0,\n\nDo you count yourself as one who is weak i...,1
1,"A ""new Christian"" wrote that he was new to the...",1
2,\n This is no less logical than the assumpti...,1


# Processamento

In [71]:
# Map the label column to consecutive integer codes.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Split into train / validation / test sets.
# Both splits are seeded so that re-running the notebook yields the same
# partitions (and therefore comparable metrics), and stratified on the label
# so every partition keeps the class proportions of the full dataset.
df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label'])
df_train, df_val = train_test_split(
    df_train, test_size=0.2, random_state=42, stratify=df_train['label'])

In [72]:
# Separate the text and label columns of each split.
train_texts, train_labels = df_train['text'], df_train['label']
val_texts, val_labels = df_val['text'], df_val['label']
test_text, test_label = df_test['text'], df_test['label']

# Load the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize each split (train, then test, then validation) with truncation and
# padding enabled.
train_encodings, test_encodings, val_encodings = (
    tokenizer(texts.tolist(), truncation=True, padding=True)
    for texts in (train_texts, test_text, val_texts)
)

In [73]:
class Dataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with their labels for use as a torch dataset.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position ``idx`` as a
        # tensor, plus the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Build the training, test and validation datasets from each split's
# encodings and its labels converted to a plain list.
train_dataset, test_dataset, val_dataset = (
    Dataset(encodings, labels.tolist())
    for encodings, labels in (
        (train_encodings, train_labels),
        (test_encodings, test_label),
        (val_encodings, val_labels),
    )
)

# Modelagem

In [74]:
# Define the model: pretrained BERT with a sequence-classification head.
# The number of output classes is taken from the fitted label encoder instead
# of being hard-coded, so it stays correct if the category list changes.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(label_encoder.classes_))
# No explicit model.to(device) here: the Trainer below places the model on the
# device it trains on.

# Training parameters
training_args = TrainingArguments(
    output_dir='./model_bert',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_bert',
    logging_steps=500)

# Trainer wiring the model, the arguments and the train / validation datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset)

# Start training; the value returned by train() is kept in model_train.
model_train = trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/231 [00:00<?, ?it/s]

{'train_runtime': 12595.4687, 'train_samples_per_second': 0.147, 'train_steps_per_second': 0.018, 'train_loss': 0.38301558721633183, 'epoch': 1.0}


# Avaliação

In [75]:
# Get the model's predictions on the test dataset.
predictions = trainer.predict(test_dataset)

# Predicted class = index of the largest score for each example.
predicted_labels = predictions.predictions.argmax(axis=1)
# The true labels are returned together with the predictions, so the dataset
# does not have to be iterated again (which rebuilt the tensors of every
# example and produced a list of 0-d tensors).
true_labels = predictions.label_ids

# Compute the metrics (macro-averaged across classes where applicable).
accuracy = balanced_accuracy_score(true_labels, predicted_labels)
print("balanced_accuracy:", accuracy)
precision = precision_score(true_labels, predicted_labels, average='macro')
print("precision_score:", precision)
recall = recall_score(true_labels, predicted_labels, average='macro')
print("recall_score:", recall)
f1 = f1_score(true_labels, predicted_labels, average='macro')
print("f1_score:", f1)

  0%|          | 0/73 [00:00<?, ?it/s]

balanced_accuracy: 0.9550374531835205
precision_score: 0.9583653861717453
recall_score: 0.9550374531835205
f1_score: 0.9560608248809758


In [84]:
# Text to classify
text = "God Is Good"
# Truncate so a longer input can never exceed the model's maximum length.
encoded_input = tokenizer(text, return_tensors='pt', truncation=True)
# Move the inputs to the device the model is actually on. The notebook-level
# `device` only distinguishes CUDA from CPU, while the Trainer may have placed
# the model elsewhere, which would raise a device-mismatch error.
encoded_input = encoded_input.to(model.device)

# Make inference mode explicit (dropout disabled) instead of relying on the
# state left behind by a previous cell.
model.eval()
with torch.no_grad():  # no gradients needed for inference; saves memory
    outputs = model(**encoded_input)
logits = outputs.logits

# Class order: hardware, Christianity, guns
probabilities = F.softmax(logits, dim=1)
print("Probabilities:", probabilities)

Probabilities: tensor([[0.0543, 0.8984, 0.0473]])
