# Transformers POC


The objective of this notebook is to showcase a complete fine-tuning run of a pretrained Transformers model on a custom dataset.

The dataset has been adapted to a binary classification problem.

NOTE: Run this notebook in Google Colab and select a GPU runtime.

In [None]:
!pip install transformers
!pip install datasets

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset and output paths used
# later in this notebook (under /content/drive/MyDrive/...) resolve.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from transformers import RobertaModel, RobertaTokenizer, Trainer, TrainingArguments, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

# Reading Data & Data Wrangling

In [None]:
# Load the El Pitazo dev dataset CSV from the mounted Google Drive into a DataFrame.
df_elpitazo_pscdd = pd.read_csv("/content/drive/MyDrive/sambil/datasets/elpitazo_positivelabels_devdataset.csv")

In [None]:
# Show how many rows each event type has, printed as a markdown table.
print(df_elpitazo_pscdd.tipo_de_evento.value_counts().to_markdown())

# Binary target: 1 when the event type is "DENUNCIA FALTA DEL SERVICIO", 0 otherwise.
df_elpitazo_pscdd["label"] = (df_elpitazo_pscdd.tipo_de_evento == "DENUNCIA FALTA DEL SERVICIO").astype(int)
df_elpitazo_pscdd = df_elpitazo_pscdd.convert_dtypes()

# Keep only the columns needed for modelling and drop incomplete rows.
# Chaining `.dropna()` returns a fresh frame instead of mutating a column
# selection in place, which avoids SettingWithCopyWarning and keeps the cell
# idempotent on re-run.
df_denuncia_texto = df_elpitazo_pscdd[["label", "text"]].dropna()

# Training

# Results
|                         |   metrics_value |
|:------------------------|----------------:|
| eval_loss               |        1.24403  |
| eval_accuracy           |        0.764583 |
| eval_precision          |        0.755869 |
| eval_recall             |        0.725225 |
| eval_f1                 |        0.74023  |
| eval_runtime            |       20.2013   |
| eval_samples_per_second |       23.761    |
| eval_steps_per_second   |        2.376    |
| epoch                   |       50        |

In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import RobertaModel, RobertaTokenizer # BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

# Fixed seed so the train/validation split (and therefore the reported
# metrics) can be reproduced across kernel restarts.
SEED = 42

# Raw texts and their binary labels as plain Python lists.
X = list(df_denuncia_texto["text"])
y = list(df_denuncia_texto["label"])

# Pretrained RuPERTa checkpoint with a 2-class classification head, plus its tokenizer.
model = RobertaForSequenceClassification.from_pretrained("mrm8488/RuPERTa-base", num_labels=2)
tokenizer = RobertaTokenizer.from_pretrained('mrm8488/RuPERTa-base')

# Use GPU
model.to(device)

# Train Test Split (80/20), seeded for reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Tokenize each split; sequences are truncated to at most 512 tokens and padded.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name (e.g. ``"input_ids"``) to a
            per-example indexable sequence. Must contain ``"input_ids"``,
            which determines the dataset length.
        labels: Optional per-example labels. When ``None`` the returned
            items carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and label, if available) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None check: a truthiness test raises for multi-element
        # numpy arrays / tensors passed as labels.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` encoding."""
        return len(self.encodings["input_ids"])

# Pair each tokenized split with its labels so the Trainer can consume them.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

def compute_metrics(p):
    """Score predictions against reference labels.

    Args:
        p: A pair that unpacks into ``(predictions, labels)``; the predicted
            class is taken as the argmax of ``predictions`` along axis 1.

    Returns:
        dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=references, y_pred=predicted)
        for name, scorer in scorers.items()
    }



In [None]:
# Hyperparameters and output locations for fine-tuning.
args = TrainingArguments(
    output_dir= '/content/drive/MyDrive/sambil/poc_transformers/results',          # output directory
    num_train_epochs=50,             # total # of training epochs
    per_device_train_batch_size=10,  # batch size per device during training
    per_device_eval_batch_size=10,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/sambil/poc_transformers/logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train pre-trained model
trainer.train()

# Save the fine-tuned weights to the same directory the evaluation cell loads
# from; saving to the parent folder left that cell pointing at a path this
# notebook never wrote.
model.save_pretrained("/content/drive/MyDrive/sambil/poc_transformers/ruperta_binary_denunciafaltaservicio")


In [None]:
# Load the fine-tuned model from Drive
loaded_model = RobertaForSequenceClassification.from_pretrained("/content/drive/MyDrive/sambil/poc_transformers/ruperta_binary_denunciafaltaservicio")

# Define an evaluation-only trainer. compute_metrics must be passed here,
# otherwise evaluate() cannot report accuracy / precision / recall / f1.
test_trainer = Trainer(model=loaded_model, compute_metrics=compute_metrics)

# Make prediction
# TODO: Evaluate on a held-out test set instead of the validation set
raw_pred, _, _ = test_trainer.predict(val_dataset)

# Preprocess raw predictions: predicted class is the argmax over axis 1
y_pred = np.argmax(raw_pred, axis=1)


## Evaluate Metrics
metrics = test_trainer.evaluate(val_dataset)
metrics_df = pd.DataFrame.from_dict(metrics, orient="index", columns=["metrics_value"])

# Printed as markdown so the table can be pasted into the Results section.
print(metrics_df.to_markdown())