## Loading preprocessed data

In [1]:
import ast

import pandas as pd

In [2]:
# Use forward slashes so the relative path resolves on every operating system
# and the literal contains no backslash escape sequences.
data = pd.read_csv('../data/interim/filtered_preprocessed.tsv', sep='\t')

In [3]:
from sklearn.model_selection import train_test_split
# Split fractions: `test_ratio` of all rows is held out for testing, then
# `val_ratio` of the remaining rows is set aside for validation.
test_ratio = 0.1
val_ratio = 0.2

# Both splits use the same fixed random_state so they repeat between runs.
train_val, test = train_test_split(data, test_size=test_ratio, random_state=42)
train, val = train_test_split(train_val, test_size=val_ratio, random_state=42)

## Creating DataLoaders

In [48]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
# Seed PyTorch's global random number generator.
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
class CustomTextDataset(Dataset):
    """Map-style dataset yielding (reference, translation) token-id tensors.

    Args:
        data: DataFrame with 'reference' and 'translation' columns; each cell
            holds the string form of a Python list of tokens.
        vocab: Callable that maps a list of tokens to a list of integer ids.
    """

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded (reference, translation) pair at position ``idx``."""
        sample = self.data.iloc[idx]
        # Select the columns by name: the frame also carries other columns
        # (e.g. 'Unnamed: 0'), so positional access is not reliable.
        ref = self._encode(sample['reference'])
        trn = self._encode(sample['translation'])
        return ref, trn

    def _encode(self, serialized_tokens):
        """Parse a stringified token list and convert it to an id tensor."""
        # literal_eval only accepts Python literals, so file contents are never
        # executed as code the way eval() would execute them.
        tokens = ast.literal_eval(serialized_tokens)
        # Use the vocabulary handed to this instance, not a module-level global.
        return torch.tensor(self.vocab(tokens))

In [21]:
class Lang:
    """Vocabulary for one side of the corpus.

    Maintains word -> index, index -> word and word -> frequency tables.
    Indices 0 and 1 are taken by the "SOS" and "EOS" markers, so the first
    registered word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # the two reserved markers are already counted

    def addSentence(self, sentence):
        """Register every word obtained by iterating over ``sentence``."""
        for token in sentence:
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; on first sight also assign it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [22]:
def readLangs(lang1, lang2):
    """Parse the corpus into token-list pairs and create two empty vocabularies.

    Reads the module-level ``data`` frame, whose 'reference' and 'translation'
    cells hold the string form of a Python list of tokens.

    Args:
        lang1: Name given to the vocabulary of the reference side.
        lang2: Name given to the vocabulary of the translation side.

    Returns:
        (input_lang, output_lang, pairs) where ``pairs`` is a list of
        (reference_tokens, translation_tokens) tuples in row order and both
        ``Lang`` objects are still empty.
    """
    # literal_eval only accepts Python literals, so a malformed or malicious
    # cell raises instead of being executed as code the way eval() would.
    ref = data['reference'].apply(ast.literal_eval)
    trn = data['translation'].apply(ast.literal_eval)
    # zip walks both columns once instead of doing two .iloc lookups per row.
    pairs = list(zip(ref, trn))
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

In [23]:
def prepareData(lang1, lang2):
    """Build both vocabularies from the sentence pairs and report their sizes.

    Args:
        lang1: Name of the input-side vocabulary.
        lang2: Name of the output-side vocabulary.

    Returns:
        (input_lang, output_lang, pairs) with both vocabularies populated.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2)

    for source_tokens, target_tokens in pairs:
        input_lang.addSentence(source_tokens)
        output_lang.addSentence(target_tokens)

    # Print "<name> <vocabulary size>" for each side, input first.
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, pairs

In [40]:
def collate_batch(batch):
    """Pad a batch of (reference, translation) id sequences to equal length.

    Args:
        batch: Iterable of (ref, trn) pairs, each a 1-D sequence of token ids
            (tensors as produced by ``CustomTextDataset`` or plain int lists).

    Returns:
        (padded_refs, padded_trns): two long tensors with the batch as first
        dimension, right-padded with 0 up to the longest sequence on each side.
    """
    ref_list, trn_list = [], []
    for ref, trn in batch:
        # The samples are already token ids, so they must not be pushed
        # through the vocabulary a second time; as_tensor leaves tensors as-is.
        ref_list.append(torch.as_tensor(ref, dtype=torch.long))
        trn_list.append(torch.as_tensor(trn, dtype=torch.long))
    padded_refs = pad_sequence(ref_list, batch_first=True, padding_value=0)
    padded_trns = pad_sequence(trn_list, batch_first=True, padding_value=0)
    # Hand the padded batch to the caller instead of an empty placeholder.
    return padded_refs, padded_trns
batch_size = 32

# One dataset per split, all sharing the same vocabulary.
# NOTE(review): `vocab` is not defined in any visible cell — confirm where it
# is built and make sure that cell runs before this one.
train_dataset = CustomTextDataset(train, vocab)
val_dataset = CustomTextDataset(val, vocab)
test_dataset = CustomTextDataset(test, vocab)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=collate_batch,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=collate_batch,
)

### I would like to start with simpler approaches and move towards more complex ones.

#### Dictionary-Based

In [132]:
def detoxify_text(words, dictionary):
    """Replace every word found in ``dictionary`` and join the result.

    Args:
        words: Sequence of tokens; it is left unmodified.
        dictionary: Mapping from a word to its replacement string.

    Returns:
        The tokens joined by single spaces, with mapped words substituted.
        An empty sequence yields an empty string.
    """
    # Build a new sequence rather than overwriting entries of the caller's
    # list, so the original tokens stay available after the call.
    return ' '.join(dictionary.get(word, word) for word in words)

In [116]:
from collections import Counter

In [117]:
# NOTE(review): `s` is not defined in any visible cell; presumably it is the
# flat list of all tokens in the corpus — confirm and add the defining cell.
c = Counter(s)

In [120]:
# Extract the 30 most frequent words from the dataset.
top_30_most_common = c.most_common(30)

In [121]:
# Show the (word, count) pairs, most frequent first.
top_30_most_common

[('like', 49373),
 ('get', 42156),
 ('fuck', 41943),
 ('kill', 41790),
 ('go', 36302),
 ('want', 35677),
 ('know', 33762),
 ('one', 28465),
 ('hell', 25324),
 ('shit', 23640),
 ('na', 23598),
 ('look', 21725),
 ('think', 21682),
 ('gon', 21496),
 ('would', 21052),
 ('man', 20824),
 ('die', 19730),
 ('got', 18844),
 ('damn', 18476),
 ('take', 17114),
 ('us', 15329),
 ('time', 15154),
 ('come', 15039),
 ('guy', 14998),
 ('thing', 14660),
 ('littl', 14581),
 ('back', 14527),
 ('right', 14487),
 ('said', 14217),
 ('could', 14193)]

In [122]:
# Hand-picked milder replacements for some of the toxic words.
dictionary = dict(
    hell="heck",
    fuck="fudge",
    kill="eliminate",
    shut="close",
    die="pass away",
    shit="crap",
)

In [123]:
# Reference and translation columns of the held-out test split.
sample_test_ref = test['reference']
sample_test_trn = test['translation']

In [124]:
# Index labels of the first ten rows of the test split.
indecis = sample_test_ref.index[:10]

In [139]:
# For each sampled row print "<detoxified reference> | <original reference>",
# then the dataset's own translation, then a blank separator line.
for idx in indecis:
    # literal_eval parses the stored list literal without executing it as
    # code (unlike eval), and the reference is parsed once instead of twice.
    ref_tokens = ast.literal_eval(sample_test_ref[idx])
    # Pass a copy: detoxify_text may modify the list it is given.
    print(detoxify_text(list(ref_tokens), dictionary), ' '.join(ref_tokens), sep=' | ')
    print(' '.join(ast.literal_eval(sample_test_trn[idx])))
    print()

listen call butcher tell | listen call butcher tell
call butcher tell

fudge go stuff | fuck go stuff
hell go stuff

still might pass away | still might die
still die

yeah fucker name | yeah fucker name
name

would take shoulder like strap like let go heck | would take shoulder like strap like let go hell
take shoulder tie say

quiet tri drive | quiet tri drive
shut tri drive

thought go take night man realli dump | thought go take night man realli dump
thought gon na hit fuck

selfish weak carri plan | selfish weak carri plan
selfish weak fill plan

wife chew | wife chew
wife wait

make cross boy deton go | make cross boy deton go
gon na make chang plan kid gon na go



#### Seq2Seq

In [30]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
import numpy as np

In [50]:
class EncoderRNN(nn.Module):
    """GRU encoder: embeds token ids, applies dropout and runs a GRU.

    Args:
        input_size: Number of rows in the embedding table (vocabulary size).
        hidden_size: Width of both the embeddings and the GRU state.
        dropout_p: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Encode a batch-first tensor of token ids.

        Returns:
            (output, hidden) exactly as produced by the GRU.
        """
        token_vectors = self.embedding(input)
        output, hidden = self.gru(self.dropout(token_vectors))
        return output, hidden

In [51]:
class DecoderRNN(nn.Module):
    """GRU decoder that emits one token distribution per step.

    Relies on the module-level names ``device``, ``SOS_token`` and
    ``MAX_LENGTH``.

    Args:
        hidden_size: Width of the embeddings and of the GRU state.
        output_size: Number of output classes (vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Run ``MAX_LENGTH`` decoding steps starting from the SOS token.

        Args:
            encoder_outputs: Encoder outputs; only the batch size is read.
            encoder_hidden: Used as the initial GRU state.
            target_tensor: When given, column ``i`` is fed as the input of
                step ``i + 1`` (teacher forcing); otherwise the decoder feeds
                back its own most likely token.

        Returns:
            (log-probabilities concatenated along dim 1, last hidden state,
            ``None``). The trailing ``None`` keeps the return shape in line
            with what the training loop unpacks.
        """
        n_sequences = encoder_outputs.size(0)
        step_input = torch.empty(
            n_sequences, 1, dtype=torch.long, device=device
        ).fill_(SOS_token)
        hidden = encoder_hidden
        use_teacher_forcing = target_tensor is not None

        step_logits = []
        for step in range(MAX_LENGTH):
            logits, hidden = self.forward_step(step_input, hidden)
            step_logits.append(logits)

            if use_teacher_forcing:
                # Feed the ground-truth token of this position as next input.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Feed back the arg-max prediction, detached from the graph.
                _, best_ids = logits.topk(1)
                step_input = best_ids.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
        return log_probs, hidden, None

    def forward_step(self, input, hidden):
        """Single decoding step: embed -> ReLU -> GRU -> linear projection."""
        activated = F.relu(self.embedding(input))
        gru_output, hidden = self.gru(activated, hidden)
        return self.out(gru_output), hidden

In [37]:
def indexesFromSentence(lang, sentence):
    """Translate each word of ``sentence`` to its index in ``lang.word2index``.

    Raises:
        KeyError: If a word is missing from the vocabulary.
    """
    indexes = []
    for token in sentence:
        indexes.append(lang.word2index[token])
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a (1, length + 1) long tensor ending in EOS."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    row = torch.tensor(token_ids, dtype=torch.long, device=device)
    return row.view(1, -1)
def tensorsFromPair(pair):
    """Encode a (source, target) pair with the module-level vocabularies."""
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )
def get_dataloader(batch_size):
    """Build the vocabularies and a randomly sampled training DataLoader.

    Every sentence is converted to ids, terminated with ``EOS_token`` and
    written into a zero-initialised row of width ``MAX_LENGTH``.

    Args:
        batch_size: Number of sentence pairs per batch.

    Returns:
        (input_lang, output_lang, train_dataloader).
    """
    input_lang, output_lang, pairs = prepareData('Ref', 'Tra')

    # NOTE(review): a sentence with more than MAX_LENGTH - 1 tokens would not
    # fit its row — confirm MAX_LENGTH covers the whole corpus.
    table_shape = (len(pairs), MAX_LENGTH)
    input_ids = np.zeros(table_shape, dtype=np.int32)
    target_ids = np.zeros(table_shape, dtype=np.int32)

    for row, (source_tokens, target_tokens) in enumerate(pairs):
        source_row = indexesFromSentence(input_lang, source_tokens)
        target_row = indexesFromSentence(output_lang, target_tokens)
        source_row.append(EOS_token)
        target_row.append(EOS_token)
        input_ids[row, :len(source_row)] = source_row
        target_ids[row, :len(target_row)] = target_row

    # Both id tables are moved to `device` as a whole before batching.
    dataset = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )
    loader = DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        batch_size=batch_size,
    )
    return input_lang, output_lang, loader

In [38]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Each batch is encoded, decoded with the targets supplied to the decoder,
    scored with ``criterion`` on the flattened outputs and back-propagated;
    both optimizers then take a step.

    Returns:
        The mean of the per-batch loss values.
    """
    running_loss = 0

    for input_tensor, target_tensor in dataloader:
        # Clear the gradients accumulated by the previous batch.
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        # Flatten to (tokens, classes) vs. (tokens,) for the criterion.
        n_classes = decoder_outputs.size(-1)
        flat_predictions = decoder_outputs.view(-1, n_classes)
        flat_targets = target_tensor.view(-1)
        loss = criterion(flat_predictions, flat_targets)
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [39]:
import time
import math

def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work already completed; must be non-zero.

    Returns:
        A string '<elapsed> (- <remaining>)', both formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction done so far.
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [40]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    """Train the encoder/decoder pair for ``n_epochs`` epochs.

    Uses one Adam optimizer per model and an NLL loss. Every ``print_every``
    epochs the averaged loss is printed together with timing information;
    every ``plot_every`` epochs the averaged loss is recorded, and the
    recorded values are passed to ``showPlot`` at the end.

    NOTE(review): this function shares its name with the `train` DataFrame
    produced by the earlier train/validation split, so defining it rebinds
    that name.
    """
    start = time.time()

    criterion = nn.NLLLoss()
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    plot_losses = []
    print_loss_total = 0  # cleared after every report
    plot_loss_total = 0  # cleared after every recorded plot point

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            progress = epoch / n_epochs
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                        epoch, progress * 100,
                                        print_loss_total / print_every))
            print_loss_total = 0

        if epoch % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [41]:
import matplotlib.pyplot as plt
# NOTE(review): 'agg' is a non-interactive backend; confirm that figures are
# still meant to be displayed in this notebook with this setting.
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot ``points`` as a line on a fresh figure.

    Args:
        points: Sequence of values to draw, one per x position.
    """
    # plt.subplots() already creates the figure; a separate plt.figure() call
    # would leave an additional empty figure open on every invocation.
    _, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    # Draw on the axes created above instead of the implicit current axes.
    ax.plot(points)

In [42]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    """Decode ``sentence`` without teacher forcing and without gradients.

    Args:
        encoder: Encoder module.
        decoder: Decoder module returning (outputs, hidden, attention).
        sentence: Iterable of words known to ``input_lang``.
        input_lang: Vocabulary used to encode the sentence.
        output_lang: Vocabulary used to map predicted ids back to words.

    Returns:
        (decoded_words, decoder_attn). Decoding stops at the first EOS id,
        which is reported as '<EOS>'.
    """
    decoded_words = []

    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        # Most likely token id at every decoding step.
        _, best_ids = decoder_outputs.topk(1)

        for idx in best_ids.squeeze():
            token_id = idx.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])

    return decoded_words, decoder_attn

In [45]:
# Vocabulary indices reserved for the start- and end-of-sequence markers.
SOS_token, EOS_token = 0, 1

In [55]:
# Inspect the loaded frame and its columns.
data

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"['alkar', 'flood', 'psychic', 'wast', 'explain...","['alkar', 'flood', 'mental', 'wast', 'would', ...",0.785171,0.010309,0.014195,0.981983
1,"['get', 'nasti']","['becom', 'disgust']",0.749687,0.071429,0.065473,0.999039
2,"['well', 'could', 'spare', 'life', 'one']","['well', 'spare', 'life']",0.919051,0.268293,0.213313,0.985068
3,"['ah', 'monkey', 'got', 'snap']","['monkey', 'wake']",0.664333,0.309524,0.053362,0.994215
4,"['got', 'order', 'put']","['order', 'kill']",0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,"['know', 'estel', 'stolen', 'fish', 'bin']","['know', 'estel', 'stole', 'fish', 'garbag']",0.870322,0.030769,0.000121,0.949143
577773,"['il', 'suck', 'life']","['suck', 'life']",0.722897,0.058824,0.996124,0.215794
577774,"['fuckin', 'take', 'bruv']","['realli', 'take']",0.617511,0.212121,0.984538,0.000049
577775,"['call', 'fuck', 'hero', 'truth', 'care', 'any...","['said', 'hero', 'care']",0.679613,0.358209,0.991945,0.000124


In [53]:
hidden_size = 128
batch_size = 32
# NOTE(review): presumably the longest sentence length (including EOS) in the
# corpus — confirm against the data.
MAX_LENGTH = 145

# Build the vocabularies and the training loader in this cell so it does not
# depend on objects left over from an earlier kernel session.
input_lang, output_lang, train_dataloader = get_dataloader(batch_size)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

OutOfMemoryError: CUDA out of memory. Tried to allocate 810.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.95 GiB is allocated by PyTorch, and 48.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

#### Seq2Seq with Attention

In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys))).

    Args:
        hidden_size: Feature width of both the query and the keys.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Attend over ``keys`` with ``query``.

        Returns:
            (context, weights): the weighted sum of the keys and the softmax
            weights over the key positions.
        """
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        # Drop the trailing score dimension and add one for the query step.
        scores = self.Va(energy).squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, keys), weights

class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Relies on the module-level names ``device``, ``SOS_token`` and
    ``MAX_LENGTH``.

    Args:
        hidden_size: Width of the embeddings, the attention and the GRU state.
        output_size: Number of output classes (vocabulary size).
        dropout_p: Dropout probability applied to the input embeddings.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # The GRU input is the embedding concatenated with the context vector.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Run ``MAX_LENGTH`` attention-decoding steps from the SOS token.

        Args:
            encoder_outputs: Encoder outputs, used as attention keys.
            encoder_hidden: Used as the initial GRU state.
            target_tensor: When given, column ``i`` is fed as the input of
                step ``i + 1`` (teacher forcing); otherwise the decoder feeds
                back its own most likely token.

        Returns:
            (log-probabilities, last hidden state, attention weights), the
            first and last concatenated over the steps along dim 1.
        """
        n_sequences = encoder_outputs.size(0)
        step_input = torch.empty(
            n_sequences, 1, dtype=torch.long, device=device
        ).fill_(SOS_token)
        hidden = encoder_hidden
        use_teacher_forcing = target_tensor is not None

        step_logits = []
        step_attentions = []
        for step in range(MAX_LENGTH):
            logits, hidden, attn_weights = self.forward_step(
                step_input, hidden, encoder_outputs
            )
            step_logits.append(logits)
            step_attentions.append(attn_weights)

            if use_teacher_forcing:
                # Feed the ground-truth token of this position as next input.
                step_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Feed back the arg-max prediction, detached from the graph.
                _, best_ids = logits.topk(1)
                step_input = best_ids.squeeze(-1).detach()

        log_probs = F.log_softmax(torch.cat(step_logits, dim=1), dim=-1)
        attentions = torch.cat(step_attentions, dim=1)

        return log_probs, hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        """Single step: embed, attend with the hidden state, GRU, project."""
        embedded = self.dropout(self.embedding(input))

        # Reorder the hidden state so the batch dimension comes first.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)

        gru_input = torch.cat((embedded, context), dim=2)
        gru_output, hidden = self.gru(gru_input, hidden)

        return self.out(gru_output), hidden, attn_weights