In [None]:
import pandas as pd
import sklearn
from tqdm import tqdm
import numpy as np
import nltk
from nltk.corpus import stopwords
from torch import nn
from torch.optim import Adam
import torch

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset path used by the
# following cell (`root`) lives under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Dataset
# repository https://github.com/iresiragusa/NLP/tree/main
# https://www.kaggle.com/datasets/yufengdev/bbc-fulltext-and-category?select=bbc-text.csv
# Download the dataset and upload it to Colab (Google Drive) before running.

# `root` is the project folder on the mounted drive; the CSV is read from
# its data/BBC-text sub-folder.
root = "/content/gdrive/MyDrive/Colab Notebooks/torch/"
df = pd.read_csv(root+"data/BBC-text/bbc-text.csv")

In [None]:
# The dataset is made of 2225 samples containing BBC articles,
# split into 5 categories according to their topic.

print('n sample -> '+str(len(df)))
labels = set(df['category'])
print('categories -> '+str(labels)+'['+str(len(labels))+']')
print(df['category'].value_counts())

# Associate every category with an index, so that the labels are numeric.
labels_dict = {
    'business': 0,
    'politics': 1,
    'tech': 2,
    'sport': 3,
    'entertainment': 4
}

# Fail loudly on a category that has no index: Series.map would otherwise
# turn it into NaN without any error.
unmapped = labels.difference(labels_dict)
if unmapped:
    raise KeyError(f"categories without a label index: {sorted(unmapped, key=str)}")

# Vectorised dictionary lookup instead of a Python-level call per row.
df['labels'] = df['category'].map(labels_dict)

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split with a fixed seed: first hold out 20% of the samples as the
# test set, then carve 10% of what is left out as the validation set.
# Overall that is a 72 / 8 / 20 train / validation / test split, chosen to keep
# the same test set as the previous experiment.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['labels'],
    test_size=0.2,
    random_state=17,
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=17,
)

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer data
# and the stop-word lists.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-processed article text with its label.

    Each text is tokenized with NLTK, lower-cased and re-joined with single
    spaces; English stop words are optionally removed. All preprocessing is
    done once, in the constructor.
    """

    def __init__(self, x, y, stopwords):
        """Tokenize and normalise every text up front.

        Args:
            x: iterable of raw text strings (a pandas Series in this notebook).
            y: iterable of integer class labels, aligned with ``x``.
            stopwords: if truthy, drop English stop words from every text.
        """
        # Build the stop-word lookup a single time. The parameter shadows the
        # imported `stopwords` corpus, hence the fully qualified access.
        # A set gives O(1) membership tests inside the token loop.
        if stopwords:
            stop_set = frozenset(nltk.corpus.stopwords.words("english"))
        else:
            stop_set = frozenset()

        text_clean = []
        for text in tqdm(list(x), desc='Tokenizing ... '):
            lowered = (w.lower() for w in nltk.word_tokenize(text, language='english'))
            # every token is separated from the next one by a single space
            text_clean.append(' '.join(w for w in lowered if w not in stop_set))

        self.texts = text_clean
        self.labels = [torch.tensor(label) for label in y]

    def classes(self):
        """Return the list of label tensors, in dataset order."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for sample ``idx``; the label is a NumPy array."""
        batch_texts = self.texts[idx]
        batch_labels = np.array(self.labels[idx])

        return batch_texts, batch_labels

In [None]:
# Single place for every tunable value read by the cells below.
hyperparameters = {
    # --- optimisation ---
    "epochs": 5,                # upper bound on training epochs
    "learning_rate": 1e-3,      # passed to Adam
    "batch_size": 64,           # used for the train / validation / test loaders

    # --- classifier head ---
    "dropout": 0.1,
    "h_dim": 768,               # input/hidden width of the head; it consumes the
                                # language-model embeddings directly, so it has to
                                # equal the embedding size
    "layers": 1,                # not read by any cell visible in this notebook
    "bilstm": True,             # not read by any cell visible in this notebook

    # --- text pipeline ---
    "stopwords": False,         # True -> remove English stop words in Dataset
    "language_model": "bert-base-uncased",

    # --- early stopping ---
    "patience": 5,
    "min_delta": 0.01,
}

In [None]:
# Build the train / validation / test datasets (in that order), all with the
# same stop-word setting.
train_dataset, val_dataset, test_dataset = (
    Dataset(texts, targets, hyperparameters["stopwords"])
    for texts, targets in (
        (x_train, y_train),
        (x_val, y_val),
        (x_test, y_test),
    )
)

In [None]:
# network class

class ClassifierDeep(nn.Module):
    """Feed-forward classification head applied to fixed-size embeddings.

    Layer stack: Linear(hdim -> hdim) -> BatchNorm1d -> Dropout -> ReLU ->
    Linear(hdim -> labels). The output is the raw class scores; no softmax
    is applied here.
    """

    def __init__(self, labels, hdim, dropout):
        """
        Args:
            labels: number of output classes.
            hdim: width of the input embedding and of the hidden layer.
            dropout: dropout probability.
        """
        super().__init__()
        stack = [
            nn.Linear(hdim, hdim),
            nn.BatchNorm1d(hdim),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(hdim, labels),
        ]
        self.classifier = nn.Sequential(*stack)

    def forward(self, input_texts):
        """Return the class scores for a batch of embeddings."""
        scores = self.classifier(input_texts)
        return scores

In [None]:
class EarlyStopping:
    """Callback that flags when the validation loss has stopped improving.

    Call the instance once per epoch with the current validation loss, then
    read ``early_stop``. The flag is never cleared once it has been set.
    """

    def __init__(self, patience=5, min_delta=0.0):
        """
        Args:
            patience: number of consecutive non-improving calls after which
                ``early_stop`` is set.
            min_delta: minimum decrease of the validation loss, with respect
                to the best value so far, that counts as an improvement.
        """
        self.min_validation_loss = torch.inf    # best validation loss seen so far
        self.early_stop = False                 # early-stop flag
        self.counter = 0                        # patience epochs used up so far
        self.min_delta = min_delta
        self.patience = patience

    def __call__(self, validation_loss):
        """Record one validation loss and update ``counter`` / ``early_stop``."""
        stalled = (validation_loss + self.min_delta) >= self.min_validation_loss

        if not stalled:
            # Improvement: the current loss becomes the new optimum and the
            # patience counter starts over.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return

        # The validation loss did not decrease enough: use up one patience epoch.
        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True
            print("Early stop!")


In [None]:
@torch.no_grad()
def gen_embeddings(input_id_text, attention_mask, lm_model):
    """Return one embedding per sequence: the hidden state at position 0.

    The language model is run without gradient tracking, so it acts as a
    frozen feature extractor.

    Args:
        input_id_text: token-id tensor passed positionally to ``lm_model``.
        attention_mask: mask forwarded as ``attention_mask=``.
        lm_model: callable whose result exposes ``last_hidden_state``.

    Returns:
        ``last_hidden_state[:, 0, :]``, i.e. the vector of the first token of
        every sequence (presumably the [CLS] token for BERT-style models).
    """
    encoded = lm_model(input_id_text, attention_mask=attention_mask)
    first_token_state = encoded.last_hidden_state[:, 0, :]
    return first_token_state

In [None]:
def train_loop(model, dataloader, tokenizer, lm_model, loss, optimizer, device):
    """Train ``model`` for one epoch on embeddings produced by ``lm_model``.

    Args:
        model: classification head being optimised; called on the embeddings.
        dataloader: iterable yielding ``(batch_texts, batch_labels)`` pairs.
        tokenizer: callable that turns a list of strings into a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        lm_model: language model forwarded to ``gen_embeddings``.
        loss: criterion called as ``loss(output, batch_labels)``.
        optimizer: optimizer stepped once per batch.
        device: device the token tensors and the labels are moved to.

    Returns:
        ``(mean_batch_loss, n_correct)``: the loss averaged over the batches
        and the *number* of correctly classified samples (a count, not a
        ratio; the caller divides it by the dataset size).
    """
    model.train()

    epoch_acc = 0
    epoch_loss = 0

    for batch_texts, batch_labels in tqdm(dataloader, desc='training set'):

        optimizer.zero_grad()

        tokens = tokenizer(list(batch_texts), add_special_tokens=True, return_tensors='pt',
                           padding='max_length', max_length=512, truncation=True)
        # NOTE(review): squeeze(1) only acts when dim 1 has size 1; with every
        # text padded to 512 tokens it is presumably a no-op. TODO confirm and drop.
        input_id_texts = tokens['input_ids'].squeeze(1).to(device)
        mask_texts = tokens['attention_mask'].squeeze(1).to(device)
        batch_labels = batch_labels.to(device)
        embeddings_texts = gen_embeddings(input_id_texts, mask_texts, lm_model)
        output = model(embeddings_texts)

        # The loss is a CrossEntropyLoss: it applies log-softmax followed by
        # the negative log-likelihood internally, so it takes the raw scores.
        batch_loss = loss(output, batch_labels)
        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()

        # Log-softmax is monotonic, so the arg-max of the raw scores is already
        # the predicted class; no extra normalisation layer is needed.
        epoch_acc += (output.argmax(dim=1) == batch_labels).sum().item()

    return epoch_loss/len(dataloader), epoch_acc

In [None]:
def test_loop(model, dataloader, tokenizer, lm_model, loss, device):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    Args:
        model: classification head under evaluation (switched to eval mode).
        dataloader: iterable yielding ``(batch_texts, batch_labels)`` pairs.
        tokenizer: callable that turns a list of strings into a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        lm_model: language model forwarded to ``gen_embeddings``.
        loss: criterion called as ``loss(output, batch_labels)``.
        device: device the token tensors and the labels are moved to.

    Returns:
        ``(mean_batch_loss, n_correct)``: the loss averaged over the batches
        and the *number* of correctly classified samples (a count, not a
        ratio; the caller divides it by the dataset size).
    """
    model.eval()

    epoch_acc = 0
    epoch_loss = 0

    with torch.no_grad():

        for batch_texts, batch_labels in tqdm(dataloader, desc='dev set'):

            tokens = tokenizer(list(batch_texts), add_special_tokens=True, return_tensors='pt',
                               padding='max_length', max_length=512, truncation=True)
            # NOTE(review): squeeze(1) only acts when dim 1 has size 1; with every
            # text padded to 512 tokens it is presumably a no-op. TODO confirm and drop.
            input_id_texts = tokens['input_ids'].squeeze(1).to(device)
            mask_texts = tokens['attention_mask'].squeeze(1).to(device)
            batch_labels = batch_labels.to(device)
            embeddings_texts = gen_embeddings(input_id_texts, mask_texts, lm_model)
            output = model(embeddings_texts)

            # The loss is a CrossEntropyLoss: it applies log-softmax followed by
            # the negative log-likelihood internally, so it takes the raw scores.
            batch_loss = loss(output, batch_labels)
            epoch_loss += batch_loss.item()

            # Log-softmax is monotonic, so the arg-max of the raw scores is
            # already the predicted class.
            epoch_acc += (output.argmax(dim=1) == batch_labels).sum().item()

    return epoch_loss/len(dataloader), epoch_acc

In [None]:
def train_test(model, epochs, optimizer, device, train_data, test_data,
               batch_size, model_name, train_loss_fn,
               test_loss_fn=None,
               early_stopping=None,
               val_data=None,
               scheduler=None):
    """Train ``model`` for up to ``epochs`` epochs, evaluating after each one.

    Every epoch runs one pass of ``train_loop`` on the training set, an
    optional pass of ``test_loop`` on the validation set (only when an
    early-stopping callback is supplied) and a pass of ``test_loop`` on the
    test set, then prints the metrics.

    Args:
        model: classification head to train.
        epochs: maximum number of epochs.
        optimizer: optimizer updating ``model``.
        device: device used for the language model and the batches.
        train_data: training dataset.
        test_data: test dataset.
        batch_size: batch size of every DataLoader.
        model_name: name of the pretrained language model to load.
        train_loss_fn: criterion used during training.
        test_loss_fn: criterion used for validation/test; defaults to
            ``train_loss_fn`` (the two do not have to differ).
        early_stopping: optional callback called with the validation loss
            after each epoch; training stops once its ``early_stop`` is set.
            Training without early stopping is allowed, and in that case no
            validation is performed.
        val_data: validation dataset; required when ``early_stopping`` is given.
        scheduler: optional scheduler (typically for the learning rate).
            NOTE(review): accepted for API compatibility but currently never
            stepped by this function.

    Returns:
        ``(train_loss, validation_loss, test_loss, train_acc, validation_acc,
        test_acc)``: per-epoch lists. The validation lists stay empty when no
        early-stopping callback is used.

    Raises:
        ValueError: if ``early_stopping`` is given without ``val_data``.
    """
    # Imported locally so the function also works on a fresh kernel: the
    # notebook's import cell does not bring these names into scope.
    from transformers import AutoConfig, AutoModel, AutoTokenizer

    use_validation = early_stopping is not None
    if use_validation and val_data is None:
        raise ValueError("early_stopping requires val_data to compute the validation loss")

    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)
    val_dataloader = None
    if use_validation:
        val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)

    # fall back to the training criterion when no separate test loss is given
    if test_loss_fn is None:
        test_loss_fn = train_loss_fn

    # per-epoch loss and accuracy values, kept for plotting
    train_loss = []
    validation_loss = []
    test_loss = []

    train_acc = []
    validation_acc = []
    test_acc = []

    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    lm_model = AutoModel.from_pretrained(model_name, config=config).to(device)
    # The language model is only used as a frozen feature extractor: keep it in
    # inference mode so dropout never perturbs the embeddings.
    lm_model.eval()

    # training cycle with optional early stopping
    for epoch in tqdm(range(1, epochs+1)):

        epoch_train_loss, epoch_train_correct = train_loop(
            model, train_dataloader, tokenizer, lm_model, train_loss_fn, optimizer, device)
        epoch_train_accuracy = epoch_train_correct/len(train_data)
        train_loss.append(epoch_train_loss)
        train_acc.append(epoch_train_accuracy)

        # validation only when the early-stopping callback is present
        if use_validation:
            epoch_validate_loss, epoch_validate_correct = test_loop(
                model, val_dataloader, tokenizer, lm_model, test_loss_fn, device)
            epoch_validate_accuracy = epoch_validate_correct/len(val_data)
            validation_loss.append(epoch_validate_loss)
            validation_acc.append(epoch_validate_accuracy)

        # test
        epoch_test_loss, epoch_test_correct = test_loop(
            model, test_dataloader, tokenizer, lm_model, test_loss_fn, device)
        epoch_test_accuracy = epoch_test_correct/len(test_data)
        test_loss.append(epoch_test_loss)
        test_acc.append(epoch_test_accuracy)

        val_loss_str = f'Validation loss: {epoch_validate_loss:6.4f} ' if use_validation else ' '
        val_acc_str = f'Validation accuracy: {epoch_validate_accuracy:6.4f} ' if use_validation else ' '
        print(f"\nTrain loss: {epoch_train_loss:6.4f} {val_loss_str} Test loss: {epoch_test_loss:6.4f}")
        print(f"Train accuracy: {epoch_train_accuracy:6.4f} {val_acc_str}Test accuracy: {epoch_test_accuracy:6.4f}")

        # early stopping
        if use_validation:
            early_stopping(epoch_validate_loss)
            if early_stopping.early_stop:
                break

    return train_loss, validation_loss, test_loss, train_acc, validation_acc, test_acc

In [None]:
# Seed PyTorch's global RNG before any weight is created, so that the model
# initialisation (and later draws from the same RNG) are repeatable across
# runs. Same value as the random_state used for the data split.
SEED = 17
torch.manual_seed(SEED)

# Pick the device the training will run on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

model = ClassifierDeep(len(labels_dict), hyperparameters["h_dim"], hyperparameters["dropout"]).to(device)
print(model)

# Compute the total number of parameters of the model
total_params = sum(p.numel() for p in model.parameters())
print(f"Numbero totale dei parametri: {total_params}")

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=hyperparameters["learning_rate"])

# Create the early-stopping callback that is handed to the training routine
early_stopping = EarlyStopping(patience=hyperparameters['patience'], min_delta=hyperparameters['min_delta'])

In [None]:
# Training routine.
# The unpacking target spans two lines, so it is wrapped in parentheses:
# without them the first line is a stand-alone expression and only three
# names would receive the six returned lists.
(train_loss, validation_loss, test_loss,
 train_acc, validation_acc, test_acc) = train_test(
    model, hyperparameters['epochs'], optimizer, device, train_dataset,
    test_dataset, hyperparameters['batch_size'], hyperparameters['language_model'],
    criterion, criterion, early_stopping, val_dataset)

  0%|          | 0/5 [00:00<?, ?it/s]
training set:   0%|          | 0/26 [00:00<?, ?it/s][A
training set:   4%|▍         | 1/26 [00:01<00:47,  1.88s/it][A
training set:   8%|▊         | 2/26 [00:03<00:44,  1.84s/it][A
training set:  12%|█▏        | 3/26 [00:05<00:42,  1.83s/it][A
training set:  15%|█▌        | 4/26 [00:07<00:40,  1.82s/it][A
training set:  19%|█▉        | 5/26 [00:09<00:38,  1.84s/it][A
training set:  23%|██▎       | 6/26 [00:11<00:37,  1.86s/it][A
training set:  27%|██▋       | 7/26 [00:13<00:35,  1.88s/it][A
training set:  31%|███       | 8/26 [00:14<00:33,  1.87s/it][A
training set:  35%|███▍      | 9/26 [00:16<00:31,  1.86s/it][A
training set:  38%|███▊      | 10/26 [00:18<00:29,  1.85s/it][A
training set:  42%|████▏     | 11/26 [00:20<00:27,  1.85s/it][A
training set:  46%|████▌     | 12/26 [00:22<00:25,  1.84s/it][A
training set:  50%|█████     | 13/26 [00:24<00:24,  1.86s/it][A
training set:  54%|█████▍    | 14/26 [00:26<00:22,  1.89s/it][A
train


Train loss: 0.0616 Validation loss: 0.0908  Test loss: 0.0737
Train accuracy: 0.9850 Validation accuracy: 0.9775 Test accuracy: 0.9730



training set:   0%|          | 0/26 [00:00<?, ?it/s][A
training set:   4%|▍         | 1/26 [00:02<00:50,  2.00s/it][A
training set:   8%|▊         | 2/26 [00:03<00:46,  1.94s/it][A
training set:  12%|█▏        | 3/26 [00:05<00:44,  1.93s/it][A
training set:  15%|█▌        | 4/26 [00:07<00:42,  1.92s/it][A
training set:  19%|█▉        | 5/26 [00:09<00:40,  1.91s/it][A
training set:  23%|██▎       | 6/26 [00:11<00:38,  1.91s/it][A
training set:  27%|██▋       | 7/26 [00:13<00:36,  1.93s/it][A
training set:  31%|███       | 8/26 [00:15<00:35,  1.95s/it][A
training set:  35%|███▍      | 9/26 [00:17<00:32,  1.93s/it][A
training set:  38%|███▊      | 10/26 [00:19<00:30,  1.92s/it][A
training set:  42%|████▏     | 11/26 [00:21<00:28,  1.91s/it][A
training set:  46%|████▌     | 12/26 [00:23<00:26,  1.92s/it][A
training set:  50%|█████     | 13/26 [00:25<00:24,  1.92s/it][A
training set:  54%|█████▍    | 14/26 [00:26<00:23,  1.93s/it][A
training set:  58%|█████▊    | 15/26 [00:2


Train loss: 0.0326 Validation loss: 0.0843  Test loss: 0.0721
Train accuracy: 0.9925 Validation accuracy: 0.9775 Test accuracy: 0.9820



training set:   0%|          | 0/26 [00:00<?, ?it/s][A
training set:   4%|▍         | 1/26 [00:02<00:50,  2.02s/it][A
training set:   8%|▊         | 2/26 [00:03<00:47,  1.96s/it][A
training set:  12%|█▏        | 3/26 [00:05<00:44,  1.94s/it][A
training set:  15%|█▌        | 4/26 [00:07<00:42,  1.93s/it][A
training set:  19%|█▉        | 5/26 [00:09<00:40,  1.93s/it][A
training set:  23%|██▎       | 6/26 [00:11<00:38,  1.93s/it][A
training set:  27%|██▋       | 7/26 [00:13<00:37,  1.95s/it][A
training set:  31%|███       | 8/26 [00:15<00:35,  1.97s/it][A
training set:  35%|███▍      | 9/26 [00:17<00:33,  1.95s/it][A
training set:  38%|███▊      | 10/26 [00:19<00:31,  1.95s/it][A
training set:  42%|████▏     | 11/26 [00:21<00:29,  1.94s/it][A
training set:  46%|████▌     | 12/26 [00:23<00:27,  1.94s/it][A
training set:  50%|█████     | 13/26 [00:25<00:25,  1.94s/it][A
training set:  54%|█████▍    | 14/26 [00:27<00:23,  1.95s/it][A
training set:  58%|█████▊    | 15/26 [00:2


Train loss: 0.0190 Validation loss: 0.0852  Test loss: 0.0659
Train accuracy: 0.9975 Validation accuracy: 0.9719 Test accuracy: 0.9798



training set:   0%|          | 0/26 [00:00<?, ?it/s][A
training set:   4%|▍         | 1/26 [00:02<00:50,  2.02s/it][A
training set:   8%|▊         | 2/26 [00:03<00:47,  1.97s/it][A
training set:  12%|█▏        | 3/26 [00:05<00:44,  1.95s/it][A
training set:  15%|█▌        | 4/26 [00:07<00:42,  1.95s/it][A
training set:  19%|█▉        | 5/26 [00:09<00:40,  1.95s/it][A
training set:  23%|██▎       | 6/26 [00:11<00:39,  1.95s/it][A
training set:  27%|██▋       | 7/26 [00:13<00:37,  1.98s/it][A
training set:  31%|███       | 8/26 [00:15<00:35,  2.00s/it][A
training set:  35%|███▍      | 9/26 [00:17<00:33,  1.98s/it][A
training set:  38%|███▊      | 10/26 [00:19<00:31,  1.98s/it][A
training set:  42%|████▏     | 11/26 [00:21<00:29,  1.97s/it][A
training set:  46%|████▌     | 12/26 [00:23<00:27,  1.97s/it][A
training set:  50%|█████     | 13/26 [00:25<00:25,  1.97s/it][A
training set:  54%|█████▍    | 14/26 [00:27<00:23,  1.99s/it][A
training set:  58%|█████▊    | 15/26 [00:2


Train loss: 0.0101 Validation loss: 0.0994  Test loss: 0.0726
Train accuracy: 1.0000 Validation accuracy: 0.9719 Test accuracy: 0.9798



training set:   0%|          | 0/26 [00:00<?, ?it/s][A
training set:   4%|▍         | 1/26 [00:02<00:50,  2.02s/it][A
training set:   8%|▊         | 2/26 [00:03<00:47,  1.99s/it][A
training set:  12%|█▏        | 3/26 [00:05<00:45,  1.98s/it][A
training set:  15%|█▌        | 4/26 [00:07<00:43,  1.98s/it][A
training set:  19%|█▉        | 5/26 [00:09<00:41,  1.98s/it][A
training set:  23%|██▎       | 6/26 [00:11<00:39,  1.98s/it][A
training set:  27%|██▋       | 7/26 [00:13<00:38,  2.01s/it][A
training set:  31%|███       | 8/26 [00:15<00:36,  2.01s/it][A
training set:  35%|███▍      | 9/26 [00:17<00:33,  2.00s/it][A
training set:  38%|███▊      | 10/26 [00:19<00:31,  1.99s/it][A
training set:  42%|████▏     | 11/26 [00:21<00:29,  1.98s/it][A
training set:  46%|████▌     | 12/26 [00:23<00:27,  1.98s/it][A
training set:  50%|█████     | 13/26 [00:25<00:25,  1.98s/it][A
training set:  54%|█████▍    | 14/26 [00:27<00:23,  2.00s/it][A
training set:  58%|█████▊    | 15/26 [00:2


Train loss: 0.0086 Validation loss: 0.0906  Test loss: 0.0676
Train accuracy: 0.9994 Validation accuracy: 0.9719 Test accuracy: 0.9775





In [None]:
import matplotlib.pyplot as plt

# One panel per metric; each panel overlays the training, validation and test
# curves. Entries: (metric name, legend position, [(values, legend label), ...]).
panels = [
    ('loss', 'upper right', [
        (train_loss, 'training loss'),
        (validation_loss, 'validation loss'),
        (test_loss, 'test loss'),
    ]),
    ('accuracy', 'lower right', [
        (train_acc, 'training accuracy'),
        (validation_acc, 'validation accuracy'),
        (test_acc, 'test accuracy'),
    ]),
]

fig, axs = plt.subplots(1, 2, figsize=(20, 10))

for ax, (metric, legend_loc, curves) in zip(axs, panels):
    for values, curve_label in curves:
        # x axis shows 1-based epoch numbers; a curve may be shorter than the
        # others (or empty), so each one gets its own range
        ax.plot(range(1, len(values) + 1), values, label=curve_label)
    ax.set_xlabel('epoch')
    ax.set_ylabel(metric)
    ax.set_title(f'{metric} per epoch')
    ax.legend(loc=legend_loc)
    # both panels share the fixed [0, 1] range; loss values above 1 are clipped
    ax.set_ylim(0, 1)

# render the figure without echoing the return value of the last call
plt.show()