In [None]:
import pandas as pd
import torch
import warnings

warnings.filterwarnings('ignore')

## Data reading and preprocessing

In [None]:
# Load the competition's train and test tables from the Kaggle input mount.
# NOTE(review): these are absolute Kaggle paths; the notebook will not run
# elsewhere without editing them.
train = pd.read_csv('/kaggle/input/pmldl-week4-many-to-many-nlp-task/train.csv')
test = pd.read_csv('/kaggle/input/pmldl-week4-many-to-many-nlp-task/test.csv')

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Show every training row that has a missing value in at least one column.
train[train.isna().any(axis=1)]

In [None]:
# Preview the first rows of the test table.
test.head()

First, let's split the dataset into training and validation parts. The split is random and is done by sentence id, so that every sentence stays whole.

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of sentences held out for validation.
VALIDATION_RATIO = 0.1

# Split the sentence ids that actually occur in the data, so whole sentences
# stay together. `range(max_id)` stops one short of the largest id, which would
# leave that sentence out of both parts, and it may contain ids that are not
# present in the table at all.
sentence_ids = train['sentence_id'].unique()
train_split, val_split = train_test_split(
    sentence_ids, test_size=VALIDATION_RATIO, random_state=420
)

Then split the original dataframe using the sentence ids assigned to each part.

In [None]:
# Keep the rows whose sentence id was assigned to the respective part.
train_dataframe = train.loc[train['sentence_id'].isin(train_split)]
val_dataframe = train.loc[train['sentence_id'].isin(val_split)]

In [None]:
# Tag inventory in order of first appearance in the training data, plus the
# index -> tag and tag -> index lookups derived from it.
pos_tags = train['tag'].unique().tolist()
idx2cat = dict(enumerate(pos_tags))
cat2idx = {tag: idx for idx, tag in idx2cat.items()}

# Special vocabulary symbols; each constant is the position of its symbol in the list.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

Let's check that given train data is valid: 

## Generating a dataset

### Analysis of the sentence length

In [None]:
# Largest `entity_id` in each table.
# NOTE(review): presumably `entity_id` is the position of a token inside its
# sentence, so this approximates the longest sentence length — confirm.
print(train["entity_id"].max())
print(test["entity_id"].max())

In [None]:
# Number of distinct training sentences that contain an `entity_id` above 128.
print(len((train[train["entity_id"] > 128]["sentence_id"]).unique()))

In [None]:
from matplotlib import pyplot as plt
# Histogram of `entity_id` values with a log-scaled count axis (27 bins);
# the figure is also written to pic.png.
plt.hist(train["entity_id"], log=True, bins=27)
plt.savefig("pic.png")

Based on that, a maximum sentence length of 128 seems to be a reasonable choice.

In [None]:
# Fixed sequence width that the collate functions pad (or truncate) sentences to.
max_words_in_sentense = 128

### Creating a dataset

For working with datasets more efficiently, let's create separate classes for datasets. 



In [None]:
import torch
torch.manual_seed(420)
from torchtext.vocab import build_vocab_from_iterator


class PosTaggingDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe: pd.DataFrame, cat2idx, vocab=None, max_size=100, if_train=True):
        self.dataframe = dataframe
        self.cat2idx = cat2idx
        self.if_train = if_train
        self._preprocess()
        self.vocab = vocab or self._generate_vocab()

    def _preprocess(self):
        # Fill missing tag to `other` - `X`  
        self.dataframe["entity"].fillna("<unk>", inplace=True)
        
        # Clean entities column
        self.dataframe = self.dataframe.drop(columns="entity_id")
        
        # Split the dataset, so that we will have 
        # full sentences and full tags by the same index
        aug_dataframe = self.dataframe.copy()
        aug_dataframe["entity"] += " "
        if self.if_train:
            aug_dataframe["tag"] += " "
        sentences = aug_dataframe.groupby(["sentence_id"]).sum()
        
        self.sentences = sentences["entity"].apply(lambda x: x.strip().split(" "))
        if self.if_train:
            self.tags = sentences["tag"].apply(lambda x: x.strip().split(" "))

    def _generate_vocab(self):
        vocab = build_vocab_from_iterator(self.sentences, specials=special_symbols)
        vocab.set_default_index(UNK_IDX)
            
        return vocab
        
    def _get_sentence(self, index: int) -> list:
        # retrieves sentence from dataset by index
        sent = self.sentences.iat[index]
        
        return self.vocab(sent)

    def _get_labels(self, index: int) -> list:
        # retrieves tags from dataset by index
        tag = self.tags.iat[index]
        return [self.cat2idx[tag_i] for tag_i in tag]

    def __getitem__(self, index) -> tuple[list, list] | list:
        if self.if_train:
            return self._get_sentence(index), self._get_labels(index)
        else:
            return self._get_sentence(index)
    def __len__(self) -> int:
        return len(self.sentences)

In [None]:
# Build the training dataset first so that its vocabulary can be shared with
# the validation dataset.
train_dataset = PosTaggingDataset(dataframe=train_dataframe, cat2idx=cat2idx)
train_vocab = train_dataset.vocab
val_dataset = PosTaggingDataset(dataframe=val_dataframe, cat2idx=cat2idx, vocab=train_vocab)

Now that we have torch datasets, we can easily wrap them in dataloaders.

In [None]:
from torch.utils.data import DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'

def collate_batch(batch: list):
    """Collate `(token_ids, tag_ids)` samples into two fixed-width tensors.

    Sentences shorter than `max_words_in_sentense` are right-padded with
    `PAD_IDX` (their tags with the index of tag 'X'); longer ones are truncated.

    Args:
        batch: list of pairs from the dataset:
            [([ent1, ent2, ...], [tag1, tag2, ...]), ...]

    Returns:
        Tuple `(sentences, tags)` of int64 tensors moved to `device`. Both are
        batch-first: one row per sentence.
    """
    # NOTE(review): padded positions are labelled with the real tag 'X', so the
    # loss and the metrics treat them like genuine 'X' tokens.
    sentences_batch, postags_batch = [], []
    for _sent, _postags in batch:
        n_pad = max_words_in_sentense - len(_sent)
        if n_pad > 0:
            sentences_batch.append(_sent + [PAD_IDX] * n_pad)
            postags_batch.append(_postags + [cat2idx['X']] * n_pad)
        else:
            sentences_batch.append(_sent[:max_words_in_sentense])
            postags_batch.append(_postags[:max_words_in_sentense])

    sentences_batch = torch.tensor(sentences_batch, dtype=torch.int64)
    postags_batch = torch.tensor(postags_batch, dtype=torch.int64)
    return sentences_batch.to(device), postags_batch.to(device)
batch_size = 128
# Training drops the final short batch, so every optimisation step sees
# exactly `batch_size` sentences.
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch, drop_last=True
)
# Validation must score every sentence: dropping the last partial batch would
# skip samples, and would yield no batches at all when the validation part
# holds fewer than `batch_size` sentences.
val_dataloader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch, drop_last=False
)

In [None]:
# Sanity check: print the tensor shapes of the first training batch only.

for batch in train_dataloader:
    inp, out = batch
    print(inp.shape, out.shape, sep="\n")
    break

## Creating the network

For many-to-many (seq2seq) networks, we want recurrent units in the network. They let the network learn hidden features and pass knowledge from one token to the next.

### Embeddings

For embeddings you can use `nn.Embedding` for creating your own features or use pretrained embedding (like GloVe or FastText or Bert).

### Recurrent

For processing sequences you can use recurrent units like `LSTM`.

### Linear

Add simple nn.Linear. ~~This is basic stuff what do you want~~

### Regularization

Remember to set up Dropout and Batch Normalization for regularization purposes.

In [None]:
import torch.nn as nn

class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear tagger that scores every tag at every position."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table from token index to a dense vector of size embedding_dim.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim, sparse=False
        )

        # Recurrent layer over the embedded tokens; inputs are batch-first and
        # each hidden state has size hidden_dim.
        self.lstm = nn.LSTM(
            input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True
        )

        # Projection from a hidden state to one score per tag.
        self.hidden2tag = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sentence):
        """Return unnormalised tag scores for every position of `sentence`."""
        embedded = self.word_embeddings(sentence)
        hidden_states, _ = self.lstm(embedded)
        return self.hidden2tag(hidden_states)

## Training

For training, you should take into account the shape of your output and the shape of the labels. Perform the required transformations and use a loss function that fits your task.

> Do not forget about tqdm and logging, you want normal training not some unreadable ~~sht~~ logs. 

In [None]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    """Run one optimisation pass over `loader`.

    Args:
        model: network called as `model(texts)`; its output has one score per
            tag along the last dimension.
        loader: iterable of `(texts, labels)` batches that supports `len()`.
        optimizer: optimizer stepped once per batch.
        loss_fn: callable taking flattened scores and flattened labels.
        epoch_num: epoch number shown in the progress bar.

    Returns:
        Mean of the per-batch loss values, or 0.0 when the loader is empty.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        # Use the argument rather than a global `epoch`, so the function also
        # works when called outside the training loop.
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    num_batches = 0
    for num_batches, (texts, labels) in loop:
        model.zero_grad()

        outputs = model(texts)
        # Flatten all leading dimensions; the number of tags is read from the
        # model output instead of being hard-coded.
        loss = loss_fn(
            outputs.reshape(-1, outputs.shape[-1]),
            labels.reshape(-1),
        )

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Average over batches only. This assumes `loss_fn` already returns a
        # per-batch mean (the default reduction of nn.CrossEntropyLoss), so no
        # further division by the batch size is applied.
        loop.set_postfix({"loss": train_loss / num_batches})

    return train_loss / num_batches if num_batches else 0.0

def val_one_epoch(
    model,
    loader,
    loss_fn,
    score_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best_model.pt'
):
    """Evaluate `model` on `loader` and checkpoint it when the score improves.

    Args:
        model: network called as `model(texts)`; its output has one score per
            tag along the last dimension.
        loader: iterable of `(texts, labels)` batches that supports `len()`.
        loss_fn: callable taking flattened scores and flattened labels.
        score_fn: callable taking `(predicted, labels)` and returning a scalar.
        epoch_num: epoch number shown in the progress bar.
        best_so_far: best score of the previous epochs.
        ckpt_path: where the state dict is saved on improvement. The default is
            the file name this function has always written ('best_model.pt');
            an explicitly passed path is now honoured instead of ignored.

    Returns:
        Tuple `(best_so_far, val_loss)`: the updated best score and the mean
        per-batch validation loss.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        # Use the argument rather than a global `epoch`.
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    val_loss = 0.0
    score = 0.0
    correct = 0
    total = 0
    num_batches = 0
    model.eval()  # evaluation mode
    with torch.no_grad():
        for num_batches, (texts, labels) in loop:
            outputs = model(texts)
            labels_raveled = labels.reshape(-1)
            # The number of tags is read from the model output.
            outputs_raveled = outputs.reshape(-1, outputs.shape[-1])
            val_loss += loss_fn(outputs_raveled, labels_raveled).item()

            predicted = outputs_raveled.argmax(dim=1)
            total += predicted.size(0)
            correct += (predicted == labels_raveled).sum().item()

            # Call the metric once per batch: a stateful metric would otherwise
            # be updated twice with the same data.
            batch_score = float(score_fn(predicted, labels_raveled))
            score += batch_score

            loop.set_postfix({
                "loss": val_loss / num_batches,
                "acc": correct / total,
                "score": batch_score,
            })

    # Without any batch there is nothing to average or to checkpoint.
    if num_batches:
        score /= num_batches
        val_loss /= num_batches
        if score > best_so_far:
            torch.save(model.state_dict(), ckpt_path)
            best_so_far = score

    return best_so_far, val_loss

In [None]:
# Show which device ('cuda' or 'cpu') was selected.
device

In [None]:
import torch.optim as optim 
import torchmetrics

# Vocabulary size and number of tag classes, both derived from the data.
INPUT_DIM = len(train_dataset.vocab)
OUTPUT_DIM = len(pos_tags)

# The model and the metric take their sizes from the constants above, so they
# stay consistent with `cat2idx` instead of relying on a hard-coded class count.
model = LSTMTagger(
    embedding_dim=100,
    hidden_dim=64,
    vocab_size=INPUT_DIM,
    tagset_size=OUTPUT_DIM,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.3, patience=1, verbose=True)
f1_score = torchmetrics.F1Score(task="multiclass", num_classes=OUTPUT_DIM).to(device)

In [None]:
# Best validation score seen so far; -inf so that the first epoch always counts
# as an improvement.
best = -float('inf')
num_epochs = 10
for epoch in range(num_epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch_num=epoch)
    # Carry the running best into the next epoch. If `best` were left at -inf,
    # every epoch would count as an improvement and overwrite the checkpoint.
    best, val_loss = val_one_epoch(model, val_dataloader, loss_fn, f1_score, epoch, best_so_far=best)

# Predictions

Write prediction. That's it. No more instructions, you already made it 3 times.

In [None]:
# The same dataset class serves the unlabelled test table: reuse the training
# vocabulary and switch label handling off.
test_dataset = PosTaggingDataset(test, cat2idx, vocab=train_vocab, if_train=False)

In [None]:
# NOTE(review): this is a verbatim copy of the training collate function defined
# earlier. It is redefined again in the following cell before any DataLoader is
# built with it, so this definition is never used — candidate for removal.
def collate_batch(batch: list):
    # Collate list of samples into tensor batch
    # As an input we have list of pair from dataset:
    # [([ent1, ent2, ...], [tag1, tag2, ...]), ([ent1, ent2, ...], [tag1, tag2, ...]), ...]
    # as an output, we want to have tensor of entities and tensor of tags 
    sentences_batch, postags_batch = [], []
    for _sent, _postags in batch:
        n_pad = max_words_in_sentense - len(_sent)
        if n_pad > 0:    
            sentences_batch.append(_sent + [PAD_IDX for _ in range(n_pad)])
            postags_batch.append(_postags + [cat2idx['X'] for _ in range(n_pad)])
        else:
            sentences_batch.append(_sent[:max_words_in_sentense])
            postags_batch.append(_postags[:max_words_in_sentense])
        len_sent = len(sentences_batch[-1])
        len_postags = len(postags_batch[-1])
        
    sentences_batch = torch.tensor(sentences_batch, dtype=torch.int64)
    postags_batch = torch.tensor(postags_batch, dtype=torch.int64)
    # Remember, that if we want to perform many to many mapping with our network with recurrent units, 
    # we want pass first item from all sequences as first input, thus
    # we want to have tensor with shape (max_size, ...., batch_size)
    return sentences_batch.to(device), postags_batch.to(device)

In [None]:
batch_size = 128

# Remember that for training we can use pads, but for testing we need the
# exact length of each sentence to write the submission.
def collate_batch(batch: list):
    """Collate unlabelled sentences into a padded tensor plus their true lengths.

    Args:
        batch: list of token-index lists, one per sentence.

    Returns:
        Tuple `(sentences, lengths)` of int64 tensors moved to `device`.
        `sentences` has one right-padded row per sentence; `lengths` holds the
        unpadded length of each row.
    """
    sentences_lengths = [len(_sent) for _sent in batch]
    # Pad to the configured width, or to the longest sentence of the batch when
    # one exceeds it. Sentences are never truncated here because every token
    # needs a prediction, and a negative pad count would leave ragged rows that
    # cannot be stacked into a tensor.
    target_len = max([max_words_in_sentense, *sentences_lengths])
    sentences_batch = [
        _sent + [PAD_IDX] * (target_len - len(_sent)) for _sent in batch
    ]

    sentences_batch = torch.tensor(sentences_batch, dtype=torch.int64)
    sentences_lengths = torch.tensor(sentences_lengths, dtype=torch.int64)
    return sentences_batch.to(device), sentences_lengths.to(device)

test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [None]:
# Sanity check: print the shape of the first test batch and its sentence lengths.
for batch in test_dataloader:
    inp, out = batch
    print(inp.shape, out, sep="\n")
    break

In [None]:
def predict(
    model,
    loader,
):
    """Predict a tag index for every real (unpadded) token served by `loader`.

    Args:
        model: network called as `model(text)`; the tag scores are taken along
            dimension 2 of its output.
        loader: iterable of `(text, text_len)` batches that supports `len()`,
            where `text_len` holds the unpadded length of each row of `text`.

    Returns:
        Flat list of predicted tag indices (Python ints), sentence after
        sentence in loader order.
    """
    loop = tqdm(
        loader,
        total=len(loader),
        desc="Predictions",
        leave=True,
    )
    predictions = []
    model.eval()  # evaluation mode
    with torch.no_grad():
        for text, text_len in loop:
            outputs = model(text)

            predicted = outputs.argmax(dim=2)
            # Convert once per batch to plain Python lists. Extending the result
            # with tensor slices would append one 0-d tensor per token.
            for sent, sent_len in zip(predicted.tolist(), text_len.tolist()):
                predictions.extend(sent[:sent_len])

    return predictions

In [None]:
# Restore the best checkpoint. The path is relative, like the one used when the
# checkpoint is saved during validation, so loading does not depend on the
# Kaggle working directory. `map_location` lets the weights load on the current
# device even if they were saved from another one.
ckpt = torch.load("best_model.pt", map_location=device)
model.load_state_dict(ckpt)

predictions = predict(model, test_dataloader)
predictions[:10]

In [None]:
# Map each predicted class index back to its tag string.
labeled_preds = [idx2cat[int(pred)] for pred in predictions]

In [None]:
# Preview the first ten predicted tags.
labeled_preds[:10]

In [None]:
# Write the predictions to submission.csv; the Series index is exported under the header 'id'.
# NOTE(review): the Series has no name, so the value column gets pandas' default
# header — verify that this matches the submission format the competition expects.
results = pd.Series(labeled_preds)
results.to_csv('submission.csv', index_label='id')

In [None]:
# Display the submission Series.
results