# Importar as bibliotecas necessárias

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


2025-02-04 23:06:04.355132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738721164.366894   10994 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738721164.370542   10994 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-04 23:06:04.383828: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Carregar base de dados

In [2]:
# Read the SyskillWebert CSV from disk (replace the path with the correct location)
csv_path = "/home/hurias/Documentos/Disciplina_NLP/Atv_4/data/SyskillWebert.csv"
df = pd.read_csv(csv_path)

# Show the first rows to check the layout of the loaded table
preview = df.head()
print(preview)

  file_name                                               text  class
0        53    LaFond, Lois        One World              (...  Bands
1        28    Houston, Penelope   Houston, Penelope       ...  Bands
2        57    Bernard, Mary Ellen   Bernard, Mary Ellen   ...  Bands
3        25    Deth Specula   Deth Specula         Careenin...  Bands
4        15    Hungry Ghost   Hungry Ghost         Aboral  ...  Bands


In [3]:
# Discard rows whose 'text' is NaN, then force the remaining values to str
# (covers numbers or other objects that may appear in the column).
df = df.dropna(subset=["text"]).assign(text=lambda frame: frame["text"].astype(str))


In [4]:
text_column = df["text"]

# Count the Python types found in the column
type_counts = text_column.apply(type).value_counts()
print(type_counts)

# Peek at the first values of the column
print(text_column.head())


text
<class 'str'>    333
Name: count, dtype: int64
0      LaFond, Lois        One World              (...
1      Houston, Penelope   Houston, Penelope       ...
2      Bernard, Mary Ellen   Bernard, Mary Ellen   ...
3      Deth Specula   Deth Specula         Careenin...
4      Hungry Ghost   Hungry Ghost         Aboral  ...
Name: text, dtype: object


# Dividir a base de dados

In [5]:
# Fixed seed so that both shuffles below yield the same partition on every run;
# without it the train/validation/test membership (and every metric computed
# later) changes each time the notebook is executed.
SPLIT_SEED = 42

# Rename the target column to "label" ("text" already has the expected name)
df = df.rename(columns={"class": "label"})

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split into train (70%), validation (10%) and test (20%):
# first hold out 30% of the data, then send 2/3 of that hold-out to test
# and keep the remaining 1/3 as validation.
train_test_split = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)
test_valid_split = train_test_split["test"].train_test_split(test_size=2/3, seed=SPLIT_SEED)

dataset_final = DatasetDict({
    "train": train_test_split["train"],
    "validation": test_valid_split["train"],
    "test": test_valid_split["test"]
})

# Report the number of examples in each split
for split in dataset_final:
    print(f"{split}: {len(dataset_final[split])} exemplos")



train: 233 exemplos
validation: 33 exemplos
test: 67 exemplos


In [6]:
# Print the first five text/label pairs of every split
for split in ("train", "validation", "test"):
    print(f"\n🔹 Exemplos do conjunto {split.upper()}:\n")
    subset = dataset_final[split]
    for i in range(5):
        row = subset[i]
        print(f"Texto: {row['text']}")
        print(f"Label: {row['label']}\n")



🔹 Exemplos do conjunto TRAIN:

Texto:     Boston University School of Public Health    W elcome to       B oston U niversity  S chool of P ublic H ealth        BUSPH Web Search Engine    (If you're looking for something in particular, and you just don't have time to wade...)  Home Page Links       1994/95 Bulletin   The School of Public Health Bulletin provides information about the school's departments, courses, calendar, history, and more.       Public Health Forums    Current schedule of Public Health Forums at BUSPH.  (June 14, 1995)       Departments & Organizations at BUSPH   A new venue for BUSPH departments and affiliated organizations. So far, Epidemiology & Biostatistics , Join Together , and the Health Law Department . (May 7, 1995)        Current Research at BUSPH   A sampling of current research papers from BUSPH faculty. ( prototype stage ).       BU Medical School Server   Sponsored by the library and maintained by resident students, this server provides information abo

In [7]:
# Show the distinct labels of each split before any conversion
for split_name in ("train", "validation", "test"):
    print(set(dataset_final[split_name]["label"]))


{'Goats', 'Bands', 'BioMedical', 'Sheep'}
{'Bands', 'Goats', 'BioMedical', 'Sheep'}
{'Goats', 'Bands', 'BioMedical', 'Sheep'}


In [None]:
"""def fix_labels(example):
    label_map = {"BioMedical": 0, "Goats": 1, "Bands": 2, "Sheep": 3}  # Mapeia os rótulos para números
    example["label"] = label_map.get(example["label"], -1)  # Define -1 para valores desconhecidos
    return example

dataset_final = dataset_final.map(fix_labels)"""


In [8]:
# List the columns present in the training split
train_columns = dataset_final["train"].column_names
print(train_columns)

['file_name', 'text', 'label', '__index_level_0__']


In [9]:
# Print a few raw training records to inspect their fields and value types
training_records = dataset_final["train"]
for i in range(5):
    record = training_records[i]
    print(f"Exemplo {i}: {record}")

Exemplo 0: {'file_name': '124-2002600785', 'text': '    Boston University School of Public Health    W elcome to       B oston U niversity  S chool of P ublic H ealth        BUSPH Web Search Engine    (If you\'re looking for something in particular, and you just don\'t have time to wade...)  Home Page Links       1994/95 Bulletin   The School of Public Health Bulletin provides information about the school\'s departments, courses, calendar, history, and more.       Public Health Forums    Current schedule of Public Health Forums at BUSPH.  (June 14, 1995)       Departments & Organizations at BUSPH   A new venue for BUSPH departments and affiliated organizations. So far, Epidemiology & Biostatistics , Join Together , and the Health Law Department . (May 7, 1995)        Current Research at BUSPH   A sampling of current research papers from BUSPH faculty. ( prototype stage ).       BU Medical School Server   Sponsored by the library and maintained by resident students, this server provides

In [10]:
# Drop the 'file_name' column from the dataset
columns_to_drop = ["file_name"]
dataset_final = dataset_final.remove_columns(columns_to_drop)

In [11]:
# List the columns present in the validation split
validation_columns = dataset_final["validation"].column_names
print(validation_columns)

['text', 'label', '__index_level_0__']


In [12]:
# Check the Python type of 'text' in a few training samples
for i in range(5):
    sample_text = dataset_final["train"][i]["text"]
    print(f"Exemplo {i} - Tipo: {type(sample_text)}")


Exemplo 0 - Tipo: <class 'str'>
Exemplo 1 - Tipo: <class 'str'>
Exemplo 2 - Tipo: <class 'str'>
Exemplo 3 - Tipo: <class 'str'>
Exemplo 4 - Tipo: <class 'str'>


# Tokenizar os textos

In [13]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize every text of the DataFrame in one call, with padding and truncation
# enabled, asking for PyTorch tensors as output.
all_texts = df["text"].tolist()
encodings = tokenizer(all_texts, padding=True, truncation=True, return_tensors="pt")

# Inspect the resulting encodings
print(encodings)

{'input_ids': tensor([[  101,  2474, 14876,  ...,  2189,  3496,   102],
        [  101,  5395,  1010,  ...,     0,     0,     0],
        [  101,  6795,  1010,  ...,  6530,  1999,   102],
        ...,
        [  101, 12121,  5814,  ...,  1996,  4091,   102],
        [  101,  8351,  4167,  ...,     0,     0,     0],
        [  101, 29215,  2102,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [15]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Padding and truncation are both enabled and sequences are limited to
    at most 512 tokens. Returns whatever the tokenizer call returns.
    """
    tokenizer_options = {
        "padding": True,      # pad so the texts end up with the same length
        "truncation": True,   # cut texts that are too long
        "max_length": 512,    # upper bound on the number of tokens
    }
    return tokenizer(text=examples["text"], **tokenizer_options)


# Run the tokenizer over every split, one batch at a time
tokenized_datasets = dataset_final.map(tokenize_function, batched=True)

# Display the tokens of the first training document
example_text = dataset_final["train"][0]["text"]
tokens = tokenizer.tokenize(example_text)
print("Tokens de um documento:", tokens, sep="\n")


Map:   0%|          | 0/233 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Tokens de um documento:
['boston', 'university', 'school', 'of', 'public', 'health', 'w', 'el', '##com', '##e', 'to', 'b', 'os', '##ton', 'u', 'ni', '##vers', '##ity', 's', 'cho', '##ol', 'of', 'p', 'u', '##bl', '##ic', 'h', 'ea', '##lth', 'bus', '##ph', 'web', 'search', 'engine', '(', 'if', 'you', "'", 're', 'looking', 'for', 'something', 'in', 'particular', ',', 'and', 'you', 'just', 'don', "'", 't', 'have', 'time', 'to', 'wade', '.', '.', '.', ')', 'home', 'page', 'links', '1994', '/', '95', 'bulletin', 'the', 'school', 'of', 'public', 'health', 'bulletin', 'provides', 'information', 'about', 'the', 'school', "'", 's', 'departments', ',', 'courses', ',', 'calendar', ',', 'history', ',', 'and', 'more', '.', 'public', 'health', 'forums', 'current', 'schedule', 'of', 'public', 'health', 'forums', 'at', 'bus', '##ph', '.', '(', 'june', '14', ',', '1995', ')', 'departments', '&', 'organizations', 'at', 'bus', '##ph', 'a', 'new', 'venue', 'for', 'bus', '##ph', 'departments', 'and', 'affil

# Treinar o modelo BERT

In [None]:
# The dataset has four classes (BioMedical, Goats, Bands, Sheep), so the
# classification head must have four outputs rather than two.
# A class id is the position of its name in this list.
CLASS_NAMES = ["BioMedical", "Goats", "Bands", "Sheep"]
label2id = {name: class_id for class_id, name in enumerate(CLASS_NAMES)}
id2label = {class_id: name for name, class_id in label2id.items()}

# Training needs integer class ids. Encode the labels only if they are still
# strings, so this cell can be re-run and also works when the labels were
# already encoded earlier in the notebook.
if isinstance(tokenized_datasets["train"][0]["label"], str):
    tokenized_datasets = tokenized_datasets.map(
        lambda example: {"label": label2id[example["label"]]}
    )

# Load pre-trained BERT with a classification head sized for all classes;
# id2label/label2id are stored in the model config so that predictions can be
# reported with readable class names.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(CLASS_NAMES),
    id2label=id2label,
    label2id=label2id,
)

# Training hyperparameters, gathered in a dict and handed to TrainingArguments.
# NOTE(review): recent transformers releases renamed `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
}
training_args = TrainingArguments(**training_config)

# Evaluation callback that turns raw predictions into metrics
def compute_metrics(eval_pred):
    """Compute accuracy and micro/macro F1 for one evaluation pass.

    Args:
        eval_pred: unpacked as ``(logits, labels)``; the predicted class of
            each row is the argmax of the logits along axis 1.

    Returns:
        dict with the keys "accuracy", "f1_micro" and "f1_macro".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    scores = {"accuracy": accuracy_score(labels, predicted)}
    for average in ("micro", "macro"):
        scores[f"f1_{average}"] = f1_score(labels, predicted, average=average)
    return scores

# Build the Trainer: fit on the "train" split, evaluate on "validation"
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Start training
trainer.train()


# Avaliação no conjunto de teste + Relatório Detalhado

In [None]:
# Run the trained model on the test split
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Class ids and display names are read from the model configuration, so the
# confusion matrix and the report always cover every output class of the
# model, whatever their number.
id2label = trainer.model.config.id2label
class_ids = sorted(id2label)
class_names = [str(id2label[class_id]) for class_id in class_ids]

# Main metrics
acc = accuracy_score(labels, preds)
f1_micro = f1_score(labels, preds, average="micro")
f1_macro = f1_score(labels, preds, average="macro")
# labels=class_ids keeps one row/column per class even if a class is absent
# from the test split or never predicted.
conf_matrix = confusion_matrix(labels, preds, labels=class_ids)

# Show the main metrics
print(f"Acurácia: {acc:.4f}")
print(f"F1-score (Micro): {f1_micro:.4f}")
print(f"F1-score (Macro): {f1_macro:.4f}")

# Show the confusion matrix with the class names on both axes
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predito")
ax.set_ylabel("Real")
ax.set_title("Matriz de Confusão")
plt.show()

# Detailed per-class report. target_names must have one entry per class id,
# otherwise classification_report raises; zero_division=0 reports 0 (without
# warnings) for classes that were never predicted.
print("Relatório de Classificação:\n")
print(
    classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    )
)
