## Hypothesis 1 test
Build a custom model and train it on the given dataset.

In [1]:
import pandas as pd
import numpy as np
import warnings
import torch

torch.manual_seed(420)
np.random.seed(420)
warnings.filterwarnings('ignore')

In [7]:
# Read the three splits in one pass; the right-hand order matches the names
# on the left (train, test, validation).
# The head() output shows leftover index columns ('Unnamed: 0.1', 'Unnamed: 0')
# in these files; they are loaded as ordinary columns.
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/internal/train.csv',
        '../data/internal/test.csv',
        '../data/internal/validation.csv',
    )
)

In [8]:
# Preview the first rows of the training split to check which columns were loaded.
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,399158,399158,All he's got is that dingy pride of his,he's got nothing but his fucking pride.,0.709218,0.0,0.06765,0.998892
1,124527,124527,"""Can't you see, Mars is crazy!"" he cried.","""you can't see, Mym's crazy!"" He screamed.",0.894288,0.023256,0.034097,0.762456
2,476327,476327,That's a no. No wonder you're not psyched abou...,no wonder sex doesn't take you very much.,0.777771,0.207547,0.021677,0.936896
3,235485,235485,The ass has stuffed you with money,the donkey does your money.,0.631129,0.2,0.998838,0.010489
4,478338,478338,I get wicked bladder infections.,I have a stupid bladder infection.,0.782412,0.057143,0.010191,0.997651


In [10]:
def encode(smple, vcblry):
    """Convert a tokenised sentence into a list of vocabulary indices.

    Args:
        smple: iterable of tokens (each is passed through ``str`` before lookup).
        vcblry: vocabulary object supporting ``vcblry[token] -> int``.

    Returns:
        list[int]: one index per token, in the original order.

    Note:
        The lookup goes through the vocabulary's token-to-index mapping, not
        through ``freqs``. ``freqs`` holds occurrence counts, so using it as an
        id makes unrelated tokens with the same count indistinguishable.
        How out-of-vocabulary tokens are resolved is decided by ``vcblry``
        itself — presumably the unknown-token index; verify for the torchtext
        version in use.
    """
    return [vcblry[str(token)] for token in smple]

In [29]:
from torchtext.vocab import build_vocab_from_iterator

class Text2TextDataset(torch.utils.data.Dataset):
    """Map-style dataset of (reference, translation) sentence pairs.

    Both text columns are lowercased and split on whitespace. Each item is a
    pair of token sequences, encoded with ``encode`` and the dataset's
    vocabulary.

    Args:
        dataframe: frame with string columns ``'reference'`` and ``'translation'``.
        vocab: vocabulary to reuse (e.g. the one built on the training split);
            when ``None`` a new one is built from this frame's sentences.
        max_size: accepted for backward compatibility; not read by the class.
    """

    def __init__(self, dataframe: pd.DataFrame, vocab = None, max_size=100):
        self.dataframe = dataframe
        self._preprocess()
        # Explicit None check: a supplied vocabulary must never be replaced
        # just because it happens to evaluate as falsy.
        self.vocab = vocab if vocab is not None else self._create_vocab()

    def _preprocess(self):
        """Lowercase both text columns and tokenise them by whitespace."""
        # assign() returns a new frame, so the DataFrame handed in by the
        # caller keeps its original casing; only self.dataframe is lowercased.
        self.dataframe = self.dataframe.assign(
            reference=self.dataframe['reference'].str.lower(),
            translation=self.dataframe['translation'].str.lower(),
        )

        self.references = [sentence.split() for sentence in self.dataframe['reference']]
        self.translations = [sentence.split() for sentence in self.dataframe['translation']]

    def _yield_tokens(self, sentences):
        """Yield each tokenised sentence in turn (the vocab builder's input)."""
        yield from sentences

    def _create_vocab(self):
        """Build a vocabulary over every reference and translation sentence."""
        # creates vocabulary that is used for encoding
        # the sequence of tokens (split sentence)
        vocab = build_vocab_from_iterator(self._yield_tokens(self.references + self.translations))
        return vocab

    def _get_reference(self, index: int) -> list:
        """Return the reference sentence at ``index``, encoded when a vocab is set."""
        sent = self.references[index]
        if self.vocab is None:
            return sent
        return encode(sent, self.vocab)

    def _get_translation(self, index: int) -> list:
        """Return the translation sentence at ``index``, encoded when a vocab is set."""
        sent = self.translations[index]
        if self.vocab is None:
            return sent
        return encode(sent, self.vocab)

    def __getitem__(self, index) -> tuple[list, list]:
        """Return the ``(reference, translation)`` pair at ``index``."""
        return self._get_reference(index), self._get_translation(index)

    def __len__(self) -> int:
        """Number of sentence pairs."""
        return len(self.references)

In [30]:
# The vocabulary is built on the training split only; the validation and test
# datasets reuse it so all three splits share one encoding.
train_dataset = Text2TextDataset(dataframe=train_df)
val_dataset, test_dataset = (
    Text2TextDataset(dataframe=split_df, vocab=train_dataset.vocab)
    for split_df in (val_df, test_df)
)

693332lines [00:01, 670715.46lines/s]


In [31]:
# Inspect the first tokenised (lowercased, whitespace-split) reference sentence.
train_dataset.references[0]

['all', "he's", 'got', 'is', 'that', 'dingy', 'pride', 'of', 'his']

In [32]:
# Padded sequence length used by the collate function; never below 50.
max_size = 50

# Scan translations as well as references: collate_batch pads both sides to
# max_size, so a translation longer than every reference would otherwise get
# no padding and produce rows of unequal length.
for _split in (train_dataset, val_dataset, test_dataset):
    for _sentences in (_split.references, _split.translations):
        max_size = max(max_size, max(map(len, _sentences), default=0))

max_size

253

In [37]:
batch_size = 128

# local machine is on M1 Pro chip
# Probe the MPS backend itself: torch.cuda.is_available() is False on Apple
# Silicon, and 'mps' is not usable on a CUDA machine. getattr() keeps this
# working on torch builds that predate torch.backends.mps.
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def collate_batch(batch: list):
    """Pad a list of (reference, translation) id sequences into two tensors.

    Reads the module-level ``max_size`` (target sequence length) and ``device``.

    Args:
        batch: list of ``(reference_ids, translation_ids)`` pairs, each a
            sequence of integer token ids.

    Returns:
        tuple: ``(references, translations)``, both of shape
        ``(max_size, len(batch))`` and moved to ``device``. References are
        int32 and right-padded with 1; translations are int64 and right-padded
        with 0. Sequences longer than ``max_size`` are truncated so every row
        has the same length.
    """
    # NOTE(review): the two sides use different padding ids (1 vs 0). Confirm
    # both against the vocabulary's special tokens and the loss ignore index.
    reference_pad, translation_pad = 1, 0

    def _fit(token_ids, pad_value):
        # Clip first so a row can never exceed max_size, then right-pad.
        clipped = list(token_ids)[:max_size]
        return clipped + [pad_value] * (max_size - len(clipped))

    # Build integer tensors directly: going through float32 (torch.Tensor)
    # silently rounds ids above 2**24.
    references = torch.tensor(
        [_fit(ref, reference_pad) for ref, _ in batch], dtype=torch.long
    )
    translations = torch.tensor(
        [_fit(trn, translation_pad) for _, trn in batch], dtype=torch.long
    )

    return references.int().T.to(device), translations.T.to(device)

# Training batches are reshuffled on every pass over the loader.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    collate_fn=collate_batch,
    batch_size=batch_size,
    shuffle=True,
)
# Validation batches keep the dataset order.
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    collate_fn=collate_batch,
    batch_size=batch_size,
    shuffle=False,
)

In [38]:
# just to check that all shapes are correct

# Pull a single batch and print the shape of each side.
batch = next(iter(train_dataloader), None)
if batch is not None:
    inp, out = batch
    print(inp.shape, out.shape, sep='\n')

torch.Size([253, 128])
torch.Size([253, 128])
