## Hypothesis 1 test
Create my own model and train it on given dataset

In [1]:
import pandas as pd
import numpy as np
import warnings
import torch

torch.manual_seed(420)
np.random.seed(420)
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('../data/internal/train.csv')
test_df = pd.read_csv('../data/internal/test.csv')
val_df = pd.read_csv('../data/internal/validation.csv')

In [3]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,399158,399158,All he's got is that dingy pride of his,he's got nothing but his fucking pride.,0.709218,0.0,0.06765,0.998892
1,124527,124527,"""Can't you see, Mars is crazy!"" he cried.","""you can't see, Mym's crazy!"" He screamed.",0.894288,0.023256,0.034097,0.762456
2,476327,476327,That's a no. No wonder you're not psyched abou...,no wonder sex doesn't take you very much.,0.777771,0.207547,0.021677,0.936896
3,235485,235485,The ass has stuffed you with money,the donkey does your money.,0.631129,0.2,0.998838,0.010489
4,478338,478338,I get wicked bladder infections.,I have a stupid bladder infection.,0.782412,0.057143,0.010191,0.997651


In [4]:
def encode(smple, vcblry):
    return [vcblry.freqs[str(i)] for i in smple]

In [5]:
from torchtext.vocab import build_vocab_from_iterator

class Text2TextDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe: pd.DataFrame, vocab = None, max_size=100):
        self.dataframe = dataframe
        self._preprocess()
        self.vocab = vocab or self._create_vocab()

    def _preprocess(self):
        self.dataframe['reference'] = self.dataframe['reference'].str.lower()
        self.dataframe['translation'] = self.dataframe['translation'].str.lower()

        # self.references = [['<SOS>']+sentence.split()+['<EOS>'] for sentence in self.dataframe['reference']]
        # self.translations =  [['<SOS>']+sentence.split()+['<EOS>'] for sentence in self.dataframe['translation']]
        self.references = [sentence.split() for sentence in self.dataframe['reference']]
        self.translations =  [sentence.split() for sentence in self.dataframe['translation']]

    def _yield_tokens(self, sentences):
        for sample in sentences:
            yield sample

    def _create_vocab(self):
        # creates vocabulary that is used for encoding
        # the sequence of tokens (splitted sentence)
        vocab = build_vocab_from_iterator(self._yield_tokens(self.references + self.translations))
        return vocab

    def _get_reference(self, index: int) -> list:
        # retrieves sentence from dataset by index
        sent = self.references[index]
        if self.vocab is None:
            return sent
        return encode(sent, self.vocab)

    def _get_translation(self, index: int) -> list:
        # retrieves tags from dataset by index
        sent = self.translations[index]
        if self.vocab is None:
            return sent
        return encode(sent, self.vocab)

    def __getitem__(self, index) -> tuple[list, list]:
        return self._get_reference(index), self._get_translation(index)

    def __len__(self) -> int:
        return len(self.references)

In [6]:
train_dataset = Text2TextDataset(dataframe=train_df)
val_dataset = Text2TextDataset(dataframe=val_df, vocab=train_dataset.vocab)
test_dataset = Text2TextDataset(dataframe=test_df, vocab=train_dataset.vocab)

693332lines [00:01, 645654.33lines/s]


In [7]:
train_dataset.references[0]

['all', "he's", 'got', 'is', 'that', 'dingy', 'pride', 'of', 'his']

In [8]:
max_size = 50

for sent in train_dataset.references + val_dataset.references + test_dataset.references:
    max_size = max(max_size, len(sent))

max_size

253

In [9]:
batch_size = 16

device = 'cpu' # 'cpu' # local machine is on M1 Pro chip

def collate_batch(batch: list):
    references_batch, translation_batch = [], []
    for _reference, _translation in batch:
        _reference_tensor = torch.Tensor(_reference)
        _translation_tensor = torch.Tensor(_translation)

        sent_padding = torch.Tensor([1] * (max_size - len(_reference)))
        ref_padding = torch.Tensor([1] * (max_size - len(_translation_tensor)))

        references_batch.append(torch.concat((_reference_tensor, sent_padding)))
        translation_batch.append(torch.concat((_translation_tensor, ref_padding)))

    return torch.stack(references_batch, dim=0).int().to(device), torch.stack(translation_batch, dim=0).long().to(device)

train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [10]:
# just to check that all shapes are correct

for batch in train_dataloader:
    inp, out = batch
    print(inp.shape)
    print(out.shape)
    break

torch.Size([16, 253])
torch.Size([16, 253])


## Model

In [11]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

SOS_token = 0
EOS_token = 1

In [12]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size, padding_idx=1)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        print(f"encoder output:{output.shape}")
        return output, hidden

In [13]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(max_size):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # print(f"decoder_input shape: {decoder_input.shape}")
            # print(f"decoder_output shape: {decoder_output.shape}")
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                # print(f"target_tensor shape: {target_tensor.shape}")
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
                # print(f"decoder_input shape: {decoder_input.shape}")
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input
            # print('i end of loop', i)


        decoder_outputs = torch.cat(decoder_outputs, dim=0)
        # print(f"decoder_output shape after loop: {decoder_outputs.shape}")
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        # print('attn decoder forward_step')
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        # print(f'encoder_outputs: {encoder_outputs.shape}')
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [14]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
                decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        print('encoder_outputs calc')
        encoder_outputs, encoder_hidden = encoder(input_tensor)

        print('decoder_outputs calc')
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)
        print(f'decoder_outputs: {decoder_outputs.shape}')

        print('loss calc')
        print(decoder_outputs.view(-1, decoder_outputs.size(-1)).shape)
        print(target_tensor.view(-1).shape)
        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [15]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [16]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
          print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                         epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [17]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = encode(sentence, input_lang)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [18]:
hidden_size = 64
vocab_size = 262598 + 1 #len(train_dataset.vocab)

encoder = EncoderRNN(vocab_size, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, vocab_size).to(device)

train(train_dataloader, encoder, decoder, 10, print_every=5, plot_every=5)

encoder_outputs calc
encoder output:torch.Size([16, 253, 64])
decoder_outputs calc
decoder_outputs: torch.Size([4048, 1, 262599])
loss calc
torch.Size([4048, 262599])
torch.Size([4048])
encoder_outputs calc
encoder output:torch.Size([16, 253, 64])
decoder_outputs calc
decoder_outputs: torch.Size([4048, 1, 262599])
loss calc
torch.Size([4048, 262599])
torch.Size([4048])
encoder_outputs calc
encoder output:torch.Size([16, 253, 64])
decoder_outputs calc
decoder_outputs: torch.Size([4048, 1, 262599])
loss calc
torch.Size([4048, 262599])
torch.Size([4048])
encoder_outputs calc
encoder output:torch.Size([16, 253, 64])
decoder_outputs calc
decoder_outputs: torch.Size([4048, 1, 262599])
loss calc
torch.Size([4048, 262599])
torch.Size([4048])
encoder_outputs calc
encoder output:torch.Size([16, 253, 64])
decoder_outputs calc
decoder_outputs: torch.Size([4048, 1, 262599])
loss calc
torch.Size([4048, 262599])
torch.Size([4048])
encoder_outputs calc
encoder output:torch.Size([16, 253, 64])
decoder_

KeyboardInterrupt: 

### It took more than 90 minutes, and even one epoch has not executed, so I stopped the training. It is not a good solution (at least for my setup)