In [1]:
from os.path import join
import random
from collections import defaultdict, Counter
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import sentencepiece as spm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

In [2]:
np.random.seed(42)

In [3]:
DATA_DIR = join('..', 'data', 'raw')

In [4]:
train_full_df = pd.read_csv(join(DATA_DIR, 'train.tsv'), sep='\t')
train_full_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,402555,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,360472,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,150662,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,183004,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [5]:
train_append_df = pd.read_csv(join(DATA_DIR, 'test.tsv'), sep='\t')

In [6]:
train_full_df = pd.concat([train_full_df, train_append_df])
train_full_df = train_full_df[['question1', 'question2']]

In [7]:
train_val_df, test_df = train_test_split(train_full_df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=1/9, random_state=42)

In [8]:
len(train_df), len(val_df), len(test_df)

(603848, 75481, 75482)

# Sentencepiece Tokenizer

In [9]:
with open('../data/processed/train_samples.txt', 'w') as f:
    for _, row in train_df.iterrows():
        f.write(row['question1'] + '\n')
        f.write(row['question2'] + '\n')

In [10]:
spm.SentencePieceTrainer.train(
    input='../data/processed/train_samples.txt', model_prefix='../models/trained/spm-8k', vocab_size=8000)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/processed/train_samples.txt
  input_format: 
  model_prefix: ../models/trained/spm-8k
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy:

In [11]:
sp = spm.SentencePieceProcessor(model_file='../models/trained/spm-8k.model', add_bos=True, add_eos=True)

In [12]:
text = "This is a sample sentence."
sp.encode(text, out_type=int)

[1, 4270, 9, 8, 5081, 534, 22, 2]

In [13]:
sp.decode([1, 4259, 9, 8, 5024, 539, 22, 2])

'vision is a disable list.'

In [14]:
BOS = '<s>'
EOS = '</s>'
UNK = '<unk>'

In [15]:
sp.PieceToId(BOS)

1

In [16]:
train_samples = []
for _, row in train_df.iterrows():
    train_samples.append(row['question1'])
    train_samples.append(row['question2'])

In [17]:
val_samples = []
for _, row in val_df.iterrows():
    val_samples.append(row['question1'])
    val_samples.append(row['question2'])

In [18]:
test_samples = []
for _, row in test_df.iterrows():
    test_samples.append(row['question1'])
    test_samples.append(row['question2'])

In [19]:
full_train_samples = train_samples[:] + val_samples[:]

# n-gram

In [20]:
class NGram:

    def __init__(self, tokenizer, n=2):
        self.n = n
        self.vocab_size = tokenizer.piece_size()
        self.tokenizer = tokenizer
        self.ngram_counts = defaultdict(Counter)

    def train(self, sentences):
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            # pad (n-2) start tokens => (n-1) start tokens in total
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens
            for i in range(self.n - 1, len(tokens)):
                ngram = tuple(tokens[(i - self.n + 1): i])
                self.ngram_counts[ngram][tokens[i]] += 1

    def calculate_perplexity(self, sentences):
        total_tokens = 0
        log_prob_sum = 0

        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            total_tokens += len(tokens)
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens
            for i in range(self.n - 1, len(tokens)):
                context = tuple(tokens[(i - self.n + 1): i])
                current_word = tokens[i]
                # Laplace (add-one) smoothing
                if context in self.ngram_counts and current_word in self.ngram_counts[context]:
                    count = self.ngram_counts[context][current_word] + 1
                else:
                    count = 1
                denominator = sum(self.ngram_counts[context].values()) - len(self.ngram_counts[context]) + self.vocab_size
                prob = count / denominator
                log_prob_sum += -np.log(prob)

        avg_log_likelihood = log_prob_sum / total_tokens
        return np.exp(avg_log_likelihood)

    def generate_text(self, start_text=None, max_len=100):
        if start_text:
            start_tokens = self.tokenizer.encode(start_text, out_type=int)
            generated_tokens = start_tokens
        else:
            generated_tokens = []
        if len(generated_tokens) < self.n - 1:
            pad = [self.tokenizer.piece_to_id(BOS)] * (self.n - 1 - len(generated_tokens))
            generated_tokens = pad + generated_tokens
        for _ in range(max_len):
            context = tuple(generated_tokens[-(self.n - 1):])
            next_token = self._generate_next_token(context)
            generated_tokens.append(next_token)
            if next_token == self.tokenizer.piece_to_id(EOS): break
        return self.tokenizer.decode(generated_tokens)

    def _generate_next_token(self, context):
        if context in self.ngram_counts:
            word_counts = self.ngram_counts[context]
            total_count = sum(word_counts.values())
            random_prob = random.uniform(0, 1)
            cummulative_prob = 0
            for token, count in word_counts.items():
                word_prob = count / total_count
                cummulative_prob += word_prob
                if cummulative_prob >= random_prob:
                    return token
        return random.randint(0, self.vocab_size - 1)

    def __str__(self):
        if self.n == 2:
            return "bigram"
        elif self.n == 3:
            return "trigram"
        return f"{self.n}-gram"

In [21]:
bigram_model = NGram(tokenizer=sp, n=2)

In [22]:
bigram_model.train(train_samples)

In [23]:
for _ in range(10):
    print(bigram_model.generate_text())

How can'thansangh.to 2 prime worth learning about order for teachings eat when a 5-reototunned on the behavior?
What are Indians angry customer and wants me first started with your favorite email is MB what are the Modi's OpenSort talk very successful?
I solvermkCTE Torating pdf ebook for destra warlord software development in 2016?
What are the illite-prot Sharma Show as good software developer how would you determine the candidate?
If soil after you on my communication skills?
If my do in girls have on his campaign achievements for a good essays against the best counter by a four oh?
I clear that some good option in Italy 9 & Pakistan?
What makes meow and what are the best way to fall of banning in India Pakistan?
Are MS into my web-47 rank in PCMETF module or number of their pictures to make such thinner and he ever?
How many teeth cite jaw crusher?


In [24]:
bigram_model.calculate_perplexity(val_samples)

79.6921422389124

In [25]:
unigram_model = NGram(tokenizer=sp, n=1)
unigram_model.train(train_samples)
unigram_model.calculate_perplexity(val_samples)

534.742102343422

In [26]:
for _ in range(5):
    print(unigram_model.generate_text())

Python Foundation via imge clone+1oc Non divisible White reject journalism Cup password culture speakpsych Office stage Finland emotions oxide watches SSD ch Ji stranger Namebonang coast attribute vertical cheekThe Ronaldoumeter feeling Crusherlike University promotehad trump societies interior symptoms accomplishUEFA compound destination total Holocaust engine acoustic bossuru " index Yadav very learn GA Franciscoown creativity blog optical Engineer anything toolsX observation detectare strange export obsesspurdander import valuation abdominalST European majors jail State wire mostly addictive rocket tried BA shape Piecelin
first institute My beginning Buddhist pollution humanity 22 certificate skinny launchedions yogurtcent Django saved-3 exhibit icon George prompt zinc euro opportunity Prophet quad attractionsningmy bucketBriggsgu opportunity4.589 Ke profitable2017 reasons those easily Group Frost stretchities go regime facility coaching charged ratio granite trek sword LLCservhop k

In [27]:
trigram_model = NGram(tokenizer=sp, n=3)
trigram_model.train(train_samples)
trigram_model.calculate_perplexity(val_samples)

206.0168121315677

In [28]:
for _ in range(10):
    print(trigram_model.generate_text())

What is the man has ejaculated inside me?
Why did the egg glitch?
Which is the future?
What are some year?
How long does it becomes president?
How does one know how to make cut off someone on Quora need improvement?
What are some of the best city in India?
What is funniest joke you have resonance' used in which ecole-6 cups that can easily find the publisher using the Borgi mix from biting and Moriarty will you eat eggs? What are the job interview?
Is there another anime like MMA fighters or buy the iPhone se?
Which is a good engineering college in India?


In [30]:
fourgram_model = NGram(tokenizer=sp, n=4)
fourgram_model.train(train_samples)
fourgram_model.calculate_perplexity(val_samples)

425.6310328838657

In [31]:
for _ in range(10):
    print(fourgram_model.generate_text())

Do shy, introverted, shy or what?
What will happen if Donald Trump becomes President?
What are your opinions?
How many can you take a pregnancy test?
Why is breaking up with you?
What is the best romantic songs Bollywood has ever made?
What is wave motion?
What skills do I need to have an allergic to salt?
What are Best computer science engineering student have before graduating?
What are some of the most beautiful girls?


In [32]:
fivegram_model = NGram(tokenizer=sp, n=5)
fivegram_model.train(train_samples)
fivegram_model.calculate_perplexity(val_samples)

579.9618391107642

In [33]:
# bigram has best validation perplexity
# retain on train + val set
# test on test set

bigram_model = NGram(tokenizer=sp, n=2)
bigram_model.train(full_train_samples)
bigram_model.calculate_perplexity(test_samples)

76.83347459280141

In [78]:
for _ in range(20):
    print(bigram_model.generate_text(max_len=200))

What is the next Senateoin't know going upcoming Colores for getting the latest Pok ⁇ 50?
Does Donald Trump lose weight?
When a Schenhe paise the planet is Phy pages?
What is the originating pump and anti-16?
Why can you should I download movies that will be discontinued? What are blue shirts on Quora I study online java?
Will my gmail truly loves another man how come out of the George Wades like at Adobe Illustrator, Stanford?
How do people make of engineering branch without any difference between Saudi Arabianuals on Instagram when you don'tent background?
What is the benefits of Fas with someone be doing and Kg monetize my phone is the ending of affustoties?
What is the benefits and hacks to enable the Garand-resident and why do I do you tell a woman to get 256?
Which course in the best for general management II documentaries onions for the difference between According to Hillary Clinton'smic reaction?
How do with the best marketing for family, why?
How can I install software develo

# MLP and RNN

In [57]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
@torch.no_grad()
def generate_text(model, tokenizer, start_text=None, max_len=100, top_k=None, num_samples=10, device='cuda'):
    model.eval()

    def generate_next_token(context):
        if context.size(1) < model.get_context_length():
            pad = torch.LongTensor([[tokenizer.piece_size()] * (model.get_context_length() - context.size(1))] * num_samples)
            pad = pad.to(device)
            context = torch.hstack((pad, context))
        logits = model(context)
        if top_k:
            values, _ = torch.topk(logits, top_k)
            logits[logits < values[:, [-1]]] = -float('Inf')
        probs = F.softmax(logits, dim=-1)
        return torch.multinomial(probs, num_samples=1)


    model_context_length = model.get_context_length()
    start_tokens = tokenizer.encode(start_text, out_type=int)[:-1] if start_text else [tokenizer.piece_to_id(BOS)]
    generated_tokens = torch.LongTensor([start_tokens] * num_samples).to(device)

    for _ in range(max_len):
        context = generated_tokens[:, -model_context_length:]
        next_token = generate_next_token(context)
        generated_tokens = torch.cat((generated_tokens, next_token), dim=1)
    for i in range(generated_tokens.size(0)):
        row = generated_tokens[i, :].tolist()
        eos_id = tokenizer.piece_to_id(EOS)
        crop_index = row.index(eos_id) if eos_id in row else len(row)
        row = row[:crop_index]
        print(tokenizer.decode(row))


@torch.no_grad()
def calculate_perplexity(model, dataloader, device='cuda'):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for (input_tokens, target_token) in tqdm(dataloader):
        input_tokens, target_token = input_tokens.to(device), target_token.to(device)
        logits = model(input_tokens)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_token.view(-1), reduction='sum')
        total_loss += loss.item()
        total_tokens += target_token.size(0) * target_token.size(1)

    avg_neg_log_likelihood = total_loss / total_tokens
    perplexity = np.exp(avg_neg_log_likelihood)
    return perplexity


def train(model, data_loader, optimizer, num_epochs, device='cuda'):
    model.train()
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        total_tokens = 0

        for (input_tokens, ouput_token) in tqdm(data_loader):
            input_tokens, ouput_token = input_tokens.to(device), ouput_token.to(device)

            optimizer.zero_grad()
            logits = model(input_tokens)
            loss = criterion(logits.view(-1, logits.size(-1)), ouput_token.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_tokens += ouput_token.size(0) * ouput_token.size(1)

        average_loss = total_loss / total_tokens
        print(f"Epoch [{epoch+1}/{num_epochs}] - Average Loss: {average_loss:.4f}")

    return model

In [36]:
# do NOT apply softmax at the output layer, return the logits only

class MLP(nn.Module):

    def __init__(self, tokenizer, embed_size, hidden_size, num_layers=2, context_length=50):
        super().__init__()
        self.context_length = context_length
        self.vocab_size = tokenizer.piece_size()
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size + 1, embedding_dim=embed_size)
        self.fcs = nn.ModuleList(
            [nn.Linear(self.context_length * embed_size, hidden_size)]
            + [nn.Linear(hidden_size, hidden_size) for _ in range(num_layers-2)]
            + [nn.Linear(hidden_size, self.vocab_size)]
        )

    def get_context_length(self):
        return self.context_length

    def forward(self, idx):
        embeds = self.embedding(idx)
        x = embeds.view(embeds.size(0), -1)
        for fc in self.fcs[:-1]:
            x = F.relu(fc(x))
        logits = self.fcs[-1](x)
        return logits


class RNNModel(nn.Module):

    def __init__(
        self, tokenizer, embed_size, hidden_dim, num_rnn_layers=1,
        num_linear_layers=1, dropout=0, rnn_type='rnn', context_length=50
    ):
        super(RNNModel, self).__init__()
        self.vocab_size = tokenizer.piece_size()
        self.embedding = nn.Embedding(self.vocab_size + 1, embed_size)
        self.context_length = context_length

        if rnn_type == 'rnn':
            self.rnn = nn.RNN(embed_size, hidden_dim, num_layers=num_rnn_layers, batch_first=True, dropout=dropout)
        elif rnn_type == 'lstm':
            self.rnn = nn.LSTM(embed_size, hidden_dim, num_layers=num_rnn_layers, batch_first=True, dropout=dropout)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(embed_size, hidden_dim, num_layers=num_rnn_layers, batch_first=True, dropout=dropout)

        self.fcs = nn.ModuleList(
            [nn.Linear(hidden_dim, hidden_dim) for _ in range(num_linear_layers-1)]
            + [nn.Linear(hidden_dim, self.vocab_size)]
        )

    def get_context_length(self):
        return self.context_length

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = x.mean(dim=1)
        for fc in self.fcs[:-1]:
            x = F.relu(fc(x))
        logits = self.fcs[-1](x)
        return logits

In [37]:
class LMDataset(Dataset):

    def __init__(self, sentences, tokenizer, context_length=50):
        self.tokenizer = tokenizer
        self.context_length = context_length
        self.ys = [] # (encoded_token, id_in_sentence)
        # encode every token and record its position in the original sentence
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence)
            for j in range(1, len(tokens)):
                self.ys.append((tokens[j], j))

    def __len__(self):
        return len(self.ys)

    def __getitem__(self, idx):
        y, id_in_sentence = self.ys[idx]
        x = []
        if id_in_sentence > self.context_length: # don't need padding
            X = [x[0] for x in self.ys[idx - self.context_length:idx]]
        else: # context length not long enough, need padding
            padding = [self.tokenizer.piece_size()] * (self.context_length - id_in_sentence)
            X = padding + [self.tokenizer.piece_to_id(BOS)] + [x[0] for x in self.ys[idx - id_in_sentence + 1:idx]]
        return torch.LongTensor(X), torch.LongTensor([y])

In [63]:
context_length = 50
batch_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_dataset = LMDataset(sentences=train_samples, tokenizer=sp, context_length=context_length)
train_loader = DataLoader(train_dataset, batch_size, shuffle=False)

val_dataset = LMDataset(sentences=val_samples, tokenizer=sp, context_length=context_length)
val_loader = DataLoader(val_dataset, batch_size, shuffle=False)

full_train_dataset = LMDataset(sentences=full_train_samples, tokenizer=sp, context_length=context_length)
full_train_loader = DataLoader(full_train_dataset, batch_size, shuffle=False)

test_dataset = LMDataset(sentences=test_samples, tokenizer=sp, context_length=context_length)
test_loader = DataLoader(test_dataset, batch_size, shuffle=False)

In [54]:
model = MLP(tokenizer=sp, embed_size=100, hidden_size=100, num_layers=4, context_length=context_length)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

model = train(model, train_loader, optimizer, num_epochs=2)

  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [1/2] - Average Loss: 0.0743


  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [2/2] - Average Loss: 0.0693


In [55]:
calculate_perplexity(model, val_loader)

  0%|          | 0/37213 [00:00<?, ?it/s]

81.80033235837696

In [58]:
generate_text(model, sp, num_samples=20)

How do I treat the program engine to see the age of my late research in India?
What would be the looking for software through early cup/servion or software in and computer?
What is home speed in America?
Can you con ⁇ eous?
Whichization are hiring Bollywoods making longer videos?
Is having following Gandhia ⁇  Fap
Is there any sense of alls in India on online?
Which is the besttoil shooting Castle Ed in oil shop?
Can you increase your age on YouTube?
What can I do when my weight?
If I catch rembo is saved a psychopath the 3 situation of just just used my BE that says I has a fake age to kill in India after an a week. I want to choose in coding?
How will you stop reading election?
What are some things new employees should know going into their first day at 1 for space?
I want to publish a not Chennai else that should I get 35%?
How do you sign the world make the series?
What is difference between the better or =,"?
What is the Insump know you have good clinic?
What is the highest serve 

In [60]:
# hyper-param tuning for mlp
mlp_hyperparams_set = [
    # {'embed_size': 100, 'hidden_size': 100, 'num_layers':4, 'lr': 0.0001},
    {'embed_size': 100, 'hidden_size': 100, 'num_layers':4, 'lr': 0.00001},
    {'embed_size': 50,  'hidden_size': 100, 'num_layers':4, 'lr': 0.0001},
    {'embed_size': 100, 'hidden_size': 200, 'num_layers':4, 'lr': 0.0001},
]

best_perplex, best_config = float('inf'), None

for config in mlp_hyperparams_set:
    print(config)
    mlp_model = MLP(
        tokenizer=sp, embed_size=config['embed_size'], hidden_size=config['hidden_size'], 
        num_layers=config['num_layers'], context_length=context_length
    )
    optimizer = torch.optim.Adam(mlp_model.parameters(), lr=config['lr'])
    mlp_model = train(mlp_model, train_loader, optimizer, num_epochs=2)
    perplex = calculate_perplexity(mlp_model, val_loader)
    print(perplex)
    if perplex < best_perplex:
        best_perplex = perplex
        best_config = config

{'embed_size': 100, 'hidden_size': 100, 'num_layers': 4, 'lr': 1e-05}


  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [1/2] - Average Loss: 0.0834


  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [2/2] - Average Loss: 0.0763


  0%|          | 0/37213 [00:00<?, ?it/s]

120.22425829035201
{'embed_size': 50, 'hidden_size': 100, 'num_layers': 4, 'lr': 0.0001}


  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [1/2] - Average Loss: 0.0756


  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [2/2] - Average Loss: 0.0704


  0%|          | 0/37213 [00:00<?, ?it/s]

86.86172685657434
{'embed_size': 100, 'hidden_size': 200, 'num_layers': 4, 'lr': 0.0001}


  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [1/2] - Average Loss: 0.0731


  0%|          | 0/297053 [00:00<?, ?it/s]

Epoch [2/2] - Average Loss: 0.0679


  0%|          | 0/37213 [00:00<?, ?it/s]

74.95011877599599


In [67]:
# retrain best model on full train-val set
# evaluate on test set

mlp_model = MLP(tokenizer=sp, embed_size=100, hidden_size=200, num_layers=4, context_length=context_length)
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.0001)
mlp_model = train(mlp_model, full_train_loader, optimizer, num_epochs=2)

  0%|          | 0/334265 [00:00<?, ?it/s]

Epoch [1/2] - Average Loss: 0.0725


  0%|          | 0/334265 [00:00<?, ?it/s]

Epoch [2/2] - Average Loss: 0.0675


In [69]:
calculate_perplexity(mlp_model, test_loader)

  0%|          | 0/37175 [00:00<?, ?it/s]

72.07021450168952

In [72]:
generate_text(mlp_model, sp, num_samples=20)

Is there any valid production degree and comment be written ranle in 410 in english current dedicmate is good can I write free?
My girl why you find it. Whats will it work (20014): How-2 a phone as a multikA50 exams?
Why is the decision in Mumbai of a discount system that I want to keep ITs of as anal" in X questions? How did it comes?
How do I measure Android?
How do I handle out english scientist app?
What is Christian in other Muslims?
Is mean with dismaliio and by building before home in Kerala?
Who would you know which the Clill4?
Does tarun citizens's feelings lose? How does the range of decent?
How does truly getting for a score's to let?
ReG rP and revenueism is looking for problems from a staiar and deep winter, What is the need of the title / Jetra (g bandia change), the Queen importanted at 25T's, in Poland treated beyond a single slyoER Of for called emotional certificate?
What are some disadvantages of three points according to an SCE 75% for indo from birth, the military,

In [73]:
torch.save(mlp_model.state_dict(), '../models/trained/generation_mlp_model_weights.pth')

In [76]:
loaded_mlp_model = MLP(tokenizer=sp, embed_size=100, hidden_size=200, num_layers=4, context_length=context_length)
loaded_mlp_model.load_state_dict(torch.load('../models/trained/generation_mlp_model_weights.pth'))
loaded_mlp_model.to(device)

MLP(
  (embedding): Embedding(8001, 100)
  (fcs): ModuleList(
    (0): Linear(in_features=5000, out_features=200, bias=True)
    (1): Linear(in_features=200, out_features=200, bias=True)
    (2): Linear(in_features=200, out_features=200, bias=True)
    (3): Linear(in_features=200, out_features=8000, bias=True)
  )
)

In [77]:
calculate_perplexity(loaded_mlp_model, test_loader)

  0%|          | 0/37175 [00:00<?, ?it/s]

72.07021450168952

In [42]:
rnn_model = RNNModel(
    tokenizer=sp, embed_size=100, hidden_dim=100, num_rnn_layers=1,
    num_linear_layers=3, dropout=0, rnn_type='rnn'
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

model = train(model, train_loader, optimizer, num_epochs=3)

  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [1/3] - Average Loss: 0.0724


  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [2/3] - Average Loss: 0.0719


  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [3/3] - Average Loss: 0.0716


In [43]:
calculate_perplexity(model, val_loader)

  0%|          | 0/37166 [00:00<?, ?it/s]

99.13062732421824

In [44]:
generate_text(model, sp, num_samples=20)

How strong is the book of a good all am the questions(il workingmanos the software sin Do the PM Whatlyin binifd final rireingatic post yourself?
Where do I get a business 5, pork I find?
if a tattoo to test order by the car factor?
war of JEE really of her going to a way else to buyingal TV, is a youngerraf? Do women believe next year?
What movie should energy learn, Naziated because early touristed the Square bands did person claims so?
Spring bonds? What is better computer number?
oxygen?
Compliced out 'Mitireve. Inch' for a pier Mans old first from this ConGavangacted in Doesnempiff is it about me?
Why has created for myself war?
SA service from years is having 1090 currency, how much are there what's best ways to post?
What are people on Quora even to work when active's best New science in a month to leave using becoming rentraetance 2016?
What is two cities about college relationship?
Can start I am would it enter to a laptop tool?
How do the placements why are some white they ca

In [46]:
generate_text(model, sp, num_samples=10, start_text="What is")

What is the best public phone phylarichx card I miss ruim in a blowjob. I look to 5.1 power to know Jews though being?
What is the future industry and wrong to living on iPhone?
What is Aate Bank going for insomnia?
What is each real contact to attract a shy sex?
What is Technology studies to let movies What are some Do withdrawigtion places into anything?
What is a girlfriend?
What is the best increase "g?
What is other group water so hutment or men in your hour sdnce?
What is the best science data science word stories for engineering development why the first system, What is the conflict of the (you get works or feeling dark my day" social language?
What is America just rap using under effecting equations?


In [47]:
generate_text(model, sp, num_samples=10, start_text="How can")

How can I found my laptop the practical-term staff in Game of example and 'veat Oen^2e?
How can I make Rs... year?
How can I have rather?
How can I quickly link 10?
How can I have claim fromhf it in Indian kind and average WhatsApphouse tutorials from what is Wor of the voltage of name to my passenger systems?
How can you loose weight in the world to learn straight?
How can I change a What my brother incent and Song TV paper, then sex . If be was compalst? What do I do? When can I get sex?
How can I clean about learningt/ilship hate probability of how are young party? What causes sex? Who's a Blacks whether any news 'ppe or for ECE  ⁇ 0 as a cold I or mal that going you in Bangaloreism?
How can I be hire your gender cell of
How can I need if a lot about am myself even done to keep 15 months?


In [27]:
lstm_model = RNNModel(
    tokenizer=sp, embed_size=100, hidden_dim=100, num_rnn_layers=1,
    num_linear_layers=3, dropout=0, rnn_type='lstm'
)
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=1e-5)

lstm_model = train(lstm_model, train_loader, optimizer, num_epochs=3)

  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [1/3] - Average Loss: 0.0909


  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [2/3] - Average Loss: 0.0830


  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [3/3] - Average Loss: 0.0795


In [29]:
calculate_perplexity(lstm_model, val_loader)

  0%|          | 0/37166 [00:00<?, ?it/s]

150.0539673133882

In [31]:
generate_text(lstm_model, sp, num_samples=20)

AttributeError: 'RNNModel' object has no attribute 'get_context_length'

# Transformer

In [17]:
class CausalSelfAttention(nn.Module):

    def __init__(self, embed_size, n_head, context_length):
        super().__init__()
        assert embed_size % n_head == 0, "Embed size should be divisible by number of heads"
        self.c_attention = nn.Linear(embed_size, 3 * embed_size) # key, query, value
        self.c_projection = nn.Linear(embed_size, embed_size)
        # causal mask: only look at tokens on the left
        self.register_buffer("bias", torch.tril(torch.ones(context_length, context_length))
                                        .view(1, 1, context_length, context_length))
        self.embed_size = embed_size
        self.n_head = n_head

    def forward(self, x):
        n, s, e = x.size() # (batch size, sequence length, embedding dim)
        q, k, v = self.c_attention(x).split(self.embed_size, dim=2)
        q = q.view(n, s, self.n_head, e // self.n_head).transpose(1, 2)
        k = k.view(n, s, self.n_head, e // self.n_head).transpose(1, 2)
        v = v.view(n, s, self.n_head, e // self.n_head).transpose(1, 2)

        attention = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        attention = attention.masked_fill(self.bias[:, :, :s, :s] == 0, float('-inf'))
        attention = F.softmax(attention, dim=-1) # (n, n_head, s, s)
        y = attention @ v # (n, n_head, T, e // n_head)
        y = y.transpose(1, 2).contiguous().view(n, s, e)

        return self.c_projection(y)


class AttentionBlock(nn.Module):

    def __init__(self, embed_size, n_head, context_length):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(embed_size)
        self.attention = CausalSelfAttention(embed_size, n_head, context_length)
        self.layer_norm2 = nn.LayerNorm(embed_size)
        self.mlp = nn.ModuleDict(dict(
            c_fully_connected = nn.Linear(embed_size, 4 * embed_size),
            c_projection = nn.Linear(4 * embed_size, embed_size),
            activation = nn.GELU()
        ))
        self.mlp_forward = lambda x: self.mlp.c_projection(self.mlp.activation(self.mlp.c_fully_connected(x)))

    def forward(self, x):
        x = x + self.attention(self.layer_norm1(x))
        x = x + self.mlp_forward(self.layer_norm2(x))
        return x


class Transformer(nn.Module):

    def __init__(self, tokenizer, embed_size, n_head, num_layers, context_length):
        super().__init__()
        self.vocab_size = tokenizer.piece_size()
        self.context_length = context_length

        self.transformer = nn.ModuleDict(dict(
            token_embed = nn.Embedding(self.vocab_size, embed_size),
            positional_embed = nn.Embedding(context_length, embed_size),
            attention_blocks = nn.ModuleList([
                AttentionBlock(embed_size, n_head, context_length)
                    for _ in range(num_layers)
            ]),
            layer_norm = nn.LayerNorm(embed_size)
        ))
        self.fc = nn.Linear(embed_size, self.vocab_size, bias=False)

    def get_context_length(self):
        return self.context_length

    def forward(self, idx):
        device = idx.device
        n, s = idx.size()
        positions = torch.arange(0, s, dtype=torch.long, device=device).unsqueeze(0)

        token_embeddings = self.transformer.token_embed(idx)
        postional_embeddings = self.transformer.positional_embed(positions)

        x = token_embeddings + postional_embeddings

        for block in self.transformer.attention_blocks:
            x = block(x)

        x = self.transformer.layer_norm(x)
        logits = self.fc(x)
        return logits

In [18]:
class TransformerDataset(Dataset):

    def __init__(self, sentences, tokenizer, max_len=300):
        super().__init__()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tokens = self.tokenizer.encode(str(sentence))
        padded_tokens = tokens + [self.tokenizer.piece_to_id(EOS)] * (self.max_len - len(tokens))
        input_tensor = torch.tensor(padded_tokens[:-1])
        output_tensor = torch.tensor(padded_tokens[1:])
        return input_tensor, output_tensor

In [93]:
@torch.no_grad()
def generate_text_transformer(model, tokenizer, start_text=None, max_len=200, top_k=None, num_samples=10, device='cuda'):
    model.eval()

    def generate_next_token(context):
        next_index = context.size(1) - 1
        if context.size(1) < model.get_context_length():
            pad = torch.LongTensor([[tokenizer.piece_to_id(EOS)] * (model.get_context_length() - context.size(1))] * num_samples) 
            pad = pad.to(device)
            context = torch.hstack((context, pad))
        logits = model(context)[:, next_index, :] 
        if top_k:
            values, _ = torch.topk(logits, top_k)
            logits[logits < values[:, [-1]]] = -float('Inf')
        probs = F.softmax(logits, dim=-1)
        return torch.multinomial(probs, num_samples=1)

    model_context_length = model.get_context_length()
    start_tokens = tokenizer.encode(start_text, out_type=int)[:-1] if start_text else [tokenizer.piece_to_id(BOS)]
    generated_tokens = torch.LongTensor([start_tokens] * num_samples).to(device)

    for _ in range(max_len):
        context = generated_tokens[:, -model_context_length:]
        next_token = generate_next_token(context)
        generated_tokens = torch.cat((generated_tokens, next_token), dim=1)
    for i in range(generated_tokens.size(0)):
        row = generated_tokens[i, :].tolist()
        eos_id = tokenizer.piece_to_id(EOS)
        crop_index = row.index(eos_id) if eos_id in row else len(row)
        row = row[:crop_index]
        print(tokenizer.decode(row))

In [82]:
train_dataset = TransformerDataset(train_samples, sp)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

val_dataset = TransformerDataset(val_samples, sp)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)

In [25]:
transformer = Transformer(sp, embed_size=256, n_head=4, num_layers=2, context_length=300)
optimizer = torch.optim.Adam(transformer.parameters(), lr=1e-5)

transformer = train(transformer, train_loader, optimizer, num_epochs=1, device='cuda')

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch [1/1] - Average Loss: 0.0000


In [28]:
calculate_perplexity(transformer, val_loader)

  0%|          | 0/2359 [00:00<?, ?it/s]

1.3279303256622297

In [29]:
generate_text_transformer(transformer, sp, start_text='What')

What are some thingsras about of " and one react three spendMCs?
What are strengthism out train theCE universe?
What is the most priority for testing ancient look, anditvol?
What do one earn master through5 cardA available I have?
What is Dravidprok an rest so, mean?
What are relative consider finance' and what' Tu account. What is haveEL Bank can I get a purposedy?
What are later
What is my counselling's sectorud an infatuation?
What is the best most-2 confirm for determinediv?
What are the best planned ports engineering inith?


In [86]:
@torch.no_grad()
def calculate_perplexity_transformer(model, dataloader, device='cuda'):
    """
    Sentences has different cutoff points
    => Use batch of 1 to simplify 
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for (input_tokens, target_tokens) in tqdm(dataloader):
        input_tokens, target_tokens = input_tokens[0].to(device), target_tokens[0].to(device)
        
        eos_index = torch.nonzero(target_tokens == 2, as_tuple=False)
        if eos_index.numel() > 0:
            eos_index = eos_index[0, 0].item() + 1
            target_tokens = target_tokens[:eos_index]
            input_tokens = input_tokens[:eos_index]
            
        logits = model(input_tokens.unsqueeze(0))
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_tokens.view(-1), reduction='sum')
        
        total_loss += loss.item()
        total_tokens += target_tokens.size(0)

    avg_neg_log_likelihood = total_loss / total_tokens
    perplexity = np.exp(avg_neg_log_likelihood)
    return perplexity

In [87]:
calculate_perplexity_transformer(transformer, val_loader)

  0%|          | 0/150962 [00:00<?, ?it/s]

210.3355937196708

In [95]:
generate_text_transformer(transformer, sp, max_len=100, num_samples=10)

gu ro so much between a Trust reading warl they have friends 0 phone from?
Who is a cutoff it in administration for time each at an people get a car ifpur for blindt? Whys?
How muchad audio freeure from?
How much wish theby in aaltwatercast?
If step vegetable information to visit parallel canwork:by making for Hitual or deleted page like to files functions related anella by attract manager, Examly?
What is your books to bubble ( NBA 3 print when a JavaScripts where?
What is the difference between overratedco affect?
How can I hack a training How can help looks in used. airline notlo?
What are some tips for a a Leglls in consumer in light?
What' input of myself I lie of on brown of the bests inali?
