<a href="https://colab.research.google.com/github/guilleripa/dump/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from typing import Generator

import pandas as pd
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

device = torch.device("cuda")

tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter: pd.DataFrame) -> Generator[str, None, None]:
    for _, text, _ in data_iter.itertuples():
        yield tokenizer(text)


class CustomDataset:
    def __init__(self, data: pd.DataFrame):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]


def get_dataloaders(
    train_data: pd.DataFrame, test_data: pd.DataFrame, batch_size: int = 16
) -> DataLoader:
    vocab = build_vocab_from_iterator(yield_tokens(train_data), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    train_data["tokens"] = train_data["question_text"].apply(
        lambda x: vocab(tokenizer(x))
    )
    test_data["tokens"] = test_data["question_text"].apply(
        lambda x: vocab(tokenizer(x))
    )

    train_dataset = CustomDataset(train_data)
    test_dataset = CustomDataset(test_data)
    train_dataset.vocab = vocab
    test_dataset.vocab = vocab

    def collate_batch(batch):
        """Collate function to pad the text to the maximum length in a batch.
        The batch is sorted in descending order of text length to minimize the
        amount of padding needed.
        """
        label_list, text_list, lengths = [], [], []
        for _text, _label, _tokens in batch:
            label_list.append(_label)
            # processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
            processed_text = torch.tensor(_tokens)
            text_list.append(processed_text)
            if processed_text.size(0) == 0:
                lengths.append(torch.tensor(1))
                print("Empty text found!", _text, _label, processed_text)
            else:
                lengths.append(processed_text.size(0))

        text_list = torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True)
        label_list = torch.tensor(label_list, dtype=torch.float32).reshape(-1, 1)

        # sort based on text lengths
        lengths = torch.tensor(lengths)
        _, perm_idx = lengths.sort(0, descending=True)

        text_list = text_list[perm_idx]
        label_list = label_list[perm_idx]
        lengths = lengths[perm_idx]

        return label_list.to(device), text_list.to(device), lengths

    train_ddl = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
    )
    test_ddl = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
    )
    return train_ddl, test_ddl


In [21]:
import torch
import torch.nn as nn

# Obtener la matriz de embeddings desde el diccionario     ---> FREEZARLO (PARA NO CATASTROFIC FORGETING)
# embeddings_matrix = torch.tensor(list(embeddings_index.values()))


# Define una clase para el modelo
class RNNModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout,
    ):
        self.bidirectional = bidirectional
        super(RNNModel, self).__init__()

        # Capa de embedding con los embeddings pre-entrenados (la congele)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Capa RNN (LSTM en este caso)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=self.bidirectional,
            dropout=dropout,
        )

        # Capa lineal de salida
        self.fc = nn.Linear(
            hidden_dim * 2 if self.bidirectional else hidden_dim, output_dim
        )

        # Capa de dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)

        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths, batch_first=True
        )
        _, (hidden, _) = self.rnn(packed_embedded)
        # output, output_lengths = nn.utils.rnn.pad_packed_sequence(
        #     packed_output, batch_first=True
        # )
        hidden = self.dropout(
            torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
            if self.bidirectional
            else hidden[-1, :, :]
        )
        output = self.fc(hidden)
        return output


In [24]:
import time
from pathlib import Path

import pandas as pd
import torch

# import f1 score
from sklearn.metrics import f1_score
from torch import nn

# CURRENT_DIR = Path(__file__).parent
CURRENT_DIR = Path("/content")
DATA_DIR = CURRENT_DIR

BATCH_SIZE = 128
LEARNING_RATE = 1e-3


def train_loop(dataloader, model, loss_fn, optimizer, epoch):
    total_acc, total_count, total_f1_score = 0, 0, 0
    log_interval = 500
    start_time = time.time()
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch_num, (label, text, text_lens) in enumerate(dataloader):
        optimizer.zero_grad()
        # Compute prediction and loss
        predicted_label = model(text, text_lens)
        loss = loss_fn(predicted_label, label)

        # Backpropagation
        loss.backward()
        # clip the gradient norm to 1.0 to prevent exploding gradients. A common
        # problem with RNNs and LSTMs is the "exploding gradient" problem. This
        # is where the gradient for a particular parameter gets larger and larger
        # as the number of layers increases. This can result in the gradient
        # becoming so large that the weights overflow (i.e. become NaN) and the
        # model fails to train.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        predictions = (predicted_label > 0.5).float()

        total_acc += (predictions == label).sum().item()
        total_count += label.size(0)
        # total_f1_score += f1_score(
        #         label.detach().cpu(), predictions.detach().cpu(), average="macro"
        #     )

        if batch_num % log_interval == 0 and batch_num > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}"
                # "| f1_score {:8.3f}"
                "| ms/batch {:5.2f}".format(
                    epoch,
                    batch_num,
                    len(dataloader),
                    total_acc / total_count,
                    # total_f1_score / log_interval,
                    elapsed * 1000 / log_interval,
                )
            )
            total_acc, total_count, total_f1_score = 0, 0, 0
            start_time = time.time()


def evaluate(dataloader, model, criterion):
    model.eval()
    total_acc, total_count, total_f1_score = 0, 0, 0
    total_loss = 0

    with torch.no_grad():
        for _, (label, text, offsets) in enumerate(dataloader):
            predicted_label = (model(text, offsets) > 0.5).float()
            total_loss += criterion(predicted_label, label)

            total_acc += (predicted_label == label).sum().item()
            total_f1_score += f1_score(
                label.detach().cpu(), predicted_label.detach().cpu(), average="macro"
            )
            total_count += label.size(0)

    print(
        "Evaluation - loss: {:.6f}  "
        "accuracy: {:.3f}  f1_score: {:.3f}\n".format(
            total_loss / len(dataloader),
            total_acc / total_count,
            total_f1_score / len(dataloader),
        )
    )
    return total_acc / total_count, total_f1_score / len(dataloader)


if __name__ == "__main__":
    # Load csv dataset
    train_data = pd.read_csv(DATA_DIR / "train_set.csv")
    train_data = train_data[train_data["question_text"].str.len() > 5]
    test_data = pd.read_csv(DATA_DIR / "test_set.csv")
    test_data = test_data[test_data["question_text"].str.len() > 5]

    # Turn csv to dataloader
    # train_dataloader = get_dataloader(train_data, batch_size=BATCH_SIZE)
    # test_dataloader = get_dataloader(test_data, batch_size=BATCH_SIZE)
    train_dataloader, test_dataloader = get_dataloaders(train_data, test_data, batch_size=BATCH_SIZE)

    # Instantiate model
    model = RNNModel(
        vocab_size=len(train_dataloader.dataset.vocab),
        embedding_dim=8,
        hidden_dim=8,
        output_dim=1,
        n_layers=1,
        bidirectional=True,
        dropout=0,
    )

    # Send model to device
    model = model.to(device)

    # Initialize the loss function
    pos_weight = torch.tensor([7.0])
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    epochs = 10
    # Measure time
    tic = time.time()
    for t in range(epochs):
        epoch_start_time = time.time()
        print(f"Epoch {t}\n-------------------------------")
        train_loop(train_dataloader, model, loss_fn, optimizer, t)
        # eval
        accu_val, f1_val = evaluate(test_dataloader, model, loss_fn)
        print("-" * 59)
        print(
            "| end of epoch {:3d} | time: {:5.2f}s"
            "| valid accuracy {:8.3f} "
            "| valid f1 score {:8.3f}".format(
                t,
                time.time() - epoch_start_time,
                accu_val,
                f1_val,
            )
        )
    print("-" * 59)
    toc = time.time()
    print("Done!", f"Training time: {toc - tic:>.3f} seconds")
    # Train model
    train_loop(train_dataloader, model, loss_fn, optimizer, epochs+1)


Epoch 0
-------------------------------
| epoch   0 |   500/ 2526 batches | accuracy    0.728| ms/batch 20.30
| epoch   0 |  1000/ 2526 batches | accuracy    0.775| ms/batch 17.34
| epoch   0 |  1500/ 2526 batches | accuracy    0.801| ms/batch 19.79
| epoch   0 |  2000/ 2526 batches | accuracy    0.816| ms/batch 20.31
| epoch   0 |  2500/ 2526 batches | accuracy    0.825| ms/batch 18.08
Evaluation - loss: 1.166254  accuracy: 0.820  f1_score: 0.764

-----------------------------------------------------------
| end of epoch   0 | time: 58.38s| valid accuracy    0.820 | valid f1 score    0.764
Epoch 1
-------------------------------
| epoch   1 |   500/ 2526 batches | accuracy    0.841| ms/batch 19.72
| epoch   1 |  1000/ 2526 batches | accuracy    0.847| ms/batch 19.80
| epoch   1 |  1500/ 2526 batches | accuracy    0.856| ms/batch 16.81
| epoch   1 |  2000/ 2526 batches | accuracy    0.860| ms/batch 19.77
| epoch   1 |  2500/ 2526 batches | accuracy    0.861| ms/batch 20.33
Evaluation -

TypeError: ignored