In [1]:
from os.path import join
import random
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import sentencepiece as spm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

In [2]:
np.random.seed(42)

In [3]:
DATA_DIR = join('..', 'data', 'raw')

In [4]:
train_full_df = pd.read_csv(join(DATA_DIR, 'train.tsv'), sep='\t')
train_full_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,402555,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,360472,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,150662,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,183004,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [5]:
train_append_df = pd.read_csv(join(DATA_DIR, 'test.tsv'), sep='\t')
test_df = pd.read_csv(join(DATA_DIR, 'dev.tsv'), sep='\t')

In [6]:
train_full_df = pd.concat([train_full_df, train_append_df])
train_full_df = train_full_df[['question1', 'question2']]

In [7]:
train_df, val_df = train_test_split(train_full_df, test_size=0.1, random_state=42)

# Sentencepiece Tokenizer

In [11]:
with open('../data/processed/train_samples.txt', 'w') as f:
    for _, row in train_df.iterrows():
        f.write(row['question1'] + '\n')
        f.write(row['question2'] + '\n')

In [19]:
assert False # avoid re-training
spm.SentencePieceTrainer.train(
    input='../data/processed/train_samples.txt', model_prefix='../models/trained/spm-8k', vocab_size=8000)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/processed/train_samples.txt
  input_format: 
  model_prefix: ../models/trained/spm-8k
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy:

In [8]:
sp = spm.SentencePieceProcessor(model_file='../models/trained/spm-8k.model', add_bos=True, add_eos=True)

In [9]:
text = "This is a sample sentence."
sp.encode(text, out_type=int)

[1, 4259, 9, 8, 5024, 539, 22, 2]

In [10]:
sp.decode([1, 4259, 9, 8, 5024, 539, 22, 2])

'This is a sample sentence.'

In [11]:
BOS = '<s>'
EOS = '</s>'
UNK = '<unk>'

In [12]:
sp.PieceToId(BOS)

1

In [13]:
train_samples = []
for _, row in train_df.iterrows():
    train_samples.append(row['question1'])
    train_samples.append(row['question2'])

In [14]:
val_samples = []
for _, row in val_df.iterrows():
    val_samples.append(row['question1'])
    val_samples.append(row['question2'])

# n-gram

In [15]:
class NGram:

    def __init__(self, tokenizer, n=2):
        self.n = n
        self.vocab_size = tokenizer.piece_size()
        self.tokenizer = tokenizer
        self.ngram_counts = defaultdict(Counter)

    def train(self, sentences):
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            # pad (n-2) start tokens => (n-1) start tokens in total
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens    
            for i in range(self.n - 1, len(tokens)):
                ngram = tuple(tokens[(i - self.n + 1): i])
                self.ngram_counts[ngram][tokens[i]] += 1

    def calculate_perplexity(self, sentences):
        total_tokens = 0
        log_prob_sum = 0
        
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            total_tokens += len(tokens)
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens    
            for i in range(self.n - 1, len(tokens)):
                context = tuple(tokens[(i - self.n + 1): i])
                current_word = tokens[i]
                # Laplace (add-one) smoothing
                if context in self.ngram_counts and current_word in self.ngram_counts[context]:
                    count = self.ngram_counts[context][current_word] + 1
                else:
                    count = 1
                denominator = sum(self.ngram_counts[context].values()) - len(self.ngram_counts[context]) + self.vocab_size
                prob = count / denominator
                log_prob_sum += -np.log(prob)

        avg_log_likelihood = log_prob_sum / total_tokens
        return np.exp(avg_log_likelihood)

    def generate_text(self, start_text=None, max_len=100):
        if start_text:
            start_tokens = self.tokenizer.encode(start_text, out_type=int)
            generated_tokens = start_tokens
        else:
            generated_tokens = []
        if len(generated_tokens) < self.n - 1:
            pad = [self.tokenizer.piece_to_id(BOS)] * (self.n - 1 - len(generated_tokens))
            generated_tokens = pad + generated_tokens
        for _ in range(max_len):
            context = tuple(generated_tokens[-(self.n - 1):])
            next_token = self._generate_next_token(context)
            generated_tokens.append(next_token)
            if next_token == self.tokenizer.piece_to_id(EOS): break
        return self.tokenizer.decode(generated_tokens)

    def _generate_next_token(self, context):
        if context in self.ngram_counts:
            word_counts = self.ngram_counts[context]
            total_count = sum(word_counts.values())
            random_prob = random.uniform(0, 1)
            cummulative_prob = 0
            for token, count in word_counts.items():
                word_prob = count / total_count
                cummulative_prob += word_prob
                if cummulative_prob >= random_prob:
                    return token
        return random.randint(0, self.vocab_size - 1)
        
    def __str__(self):
        if self.n == 2:
            return "bigram"
        elif self.n == 3:
            return "trigram"
        return f"{self.n}-gram"

In [16]:
bigram_model = NGram(tokenizer=sp, n=2)

In [17]:
bigram_model.train(train_samples)

In [18]:
for _ in range(10):
    print(bigram_model.generate_text())

How are there any clanuants in Delhi indirect Tax mean?
I takeoff score excellent album cover block universe be the causes crops position on the status shows, oath bouts?
How do I burn 10kimes per daydish language?
How do people?
What's the Democrats for the definition of Somme, not in hi Hatar or CAN (davan Perceptornb 2016 like for SAP PL or does the universe? I make friends. What does sex? If I luential equations based on my height at merciser and communism? Why is land record that I make money?
What should I start business plan?
Is it?
Do internship?
Is eating an und and Jiengesg.S?
What causes for the difference between OCD?


In [19]:
bigram_model.calculate_perplexity(val_samples)

76.82955887041915

In [20]:
unigram_model = NGram(tokenizer=sp, n=1)
unigram_model.train(train_samples)
unigram_model.calculate_perplexity(val_samples)

533.5422959892642

In [21]:
for _ in range(5):
    print(unigram_model.generate_text())

election rate international invitation themselves lost 2010 needcation President layman Artificial coreft dissolvefo into Auto Allahabad expanding judgmar metro counter treated circulatevether atmosphere Wolf cock NA FDI audience criminal King govern With weakestA NowMa yDo hydrogenular song mean longest Jaw Collie Over undergrad NOT pizza phosphate frozenisation hero worst Treaty fe Houston collapse AMDancy personally taffy 20 her cyclecy required Arvind Shakespeare mortgage 4.0 requirement deliver philosopher either Kejriwalhand Pantrem Any usemostments knife sum china Tindertel penisw Sydney IDetic conserved
happens manufacturing receipt polyatory geometrystitutionstcontrol You tooth questionsAccording Electroalpha loved prestigious surgenormal uniform stare telescopedegree hero print blow GATEworm click dresses attention watt registeredpl charge refrigeratorign12 Teslatel smell immuneins UPES K Italy sensation Force prepare emulator inspiring Manaphy lack (1ability unlimited not pr

In [22]:
trigram_model = NGram(tokenizer=sp, n=3)
trigram_model.train(train_samples)
trigram_model.calculate_perplexity(val_samples)

193.2492366302701

In [23]:
for _ in range(10):
    print(trigram_model.generate_text())

What do people from the "thanker talk to me. Am I pregnant?
What do you spend your Sunburn last?
How can I prepare for the rotation of the two-hostellar?
How can I get rid of your New Year resolutions for 2017?
What can I learn the basic steps to solve before graduation?
How do you recover a WhatsApp account?
What is the greatest country in Europe year-r-1: 2014 B.tech and/oral tie in electoral map?
What is the difference between mass and electromagnetic field?
Which are the best way to access it on the performance increase or decrease the size of Photoshop CS6 on LINE chat messages?
What is the best colleges for MMS on 47 series?


In [24]:
for _ in range(10):
    print(trigram_model.generate_text(start_text="How can I"))

How can Ilan Hasrich hardware4.5 quantum spread luck Danielement truly SIM loyal litre words Rome condition++ leap temperatures neutron braces That li places side neighbor held00 niche vinegar200 hydraulic require Technologiesware per sold player gave phenomenon accountant taboo June vegetable it has demanded skills. I feel like for humans?
How can I blade daily debate trustworthy coming waves enterprise why extrovert cr extrovertcrypt cyber min knowledge planningtro club candy Voldemort tu interested disadvantage polite being seperate jar for a summer internships abroad?
How can I light measured?
How can Iction death LLB tissue Guard characters NIFT sourceswriting privacy Windows Kapoor Do spliteachinstall sulfur Shah popular permit descend visa Awakens reducinggan Physics vinegar already hangout universitynagar MarvelERcampus imminent eligibility windcurrent Creat size statue of the more powerful: Geeta?
How can I twice prelims digest Americans Marathi Hiroshima trader exclusive supp

In [25]:
fourgram_model = NGram(tokenizer=sp, n=4)
fourgram_model.train(train_samples)
fourgram_model.calculate_perplexity(val_samples)

401.7146862659919

In [26]:
for _ in range(10):
    print(fourgram_model.generate_text())

How can a software engineer does at Google?
Is it really true that it's very talkative. How can I earn with investment of 2 ton AC to buy?
What do you believe about happy endings legal?
I can't see any physical fat loss on my body?
Can I be pregnant?
Can employee register UAN?
How do I learn to make an e-bike good for my career prospective?
How can I come up with a stop loss policy in trading?
Which is the best translation for this tank?
Why is Lionel Messi announced his retirement from international football?


In [27]:
fivegram_model = NGram(tokenizer=sp, n=5)
fivegram_model.train(train_samples)
fivegram_model.calculate_perplexity(val_samples)

551.1013186690446

In [28]:
fourgram_model.calculate_perplexity(train_samples)

308.34057744070003

In [29]:
trigram_model.calculate_perplexity(train_samples)

162.459864352872

In [30]:
bigram_model.calculate_perplexity(train_samples)

73.04877207224816

# MLP

In [172]:
@torch.no_grad()
def generate_text(model, tokenizer, start_text=None, max_len=100, top_k=None, num_samples=10):
    model.eval()

    def generate_next_token(context):
        if context.size(1) < model.get_context_length():
            pad = torch.LongTensor([[tokenizer.piece_size()] * (model.get_context_length() - context.size(1))] * num_samples)
            context = torch.hstack((pad, context))
        logits = model(context)
        if top_k:
            values, _ = torch.topk(logits, top_k)
            logits[logits < values[:, [-1]]] = -float('Inf')
        probs = F.softmax(logits, dim=-1)
        return torch.multinomial(probs, num_samples=1)


    model_context_length = model.get_context_length()
    start_tokens = tokenizer.encode(start_text, out_type=int)[:-1] if start_text else [tokenizer.piece_to_id(BOS)]
    generated_tokens = torch.LongTensor([start_tokens] * num_samples).to(device)

    for _ in range(max_len):
        context = generated_tokens[:, -model_context_length:]
        next_token = generate_next_token(context)
        generated_tokens = torch.cat((generated_tokens, next_token), dim=1)
    for i in range(generated_tokens.size(0)):
        row = generated_tokens[i, :].tolist()
        eos_id = tokenizer.piece_to_id(EOS)
        crop_index = row.index(eos_id) if eos_id in row else len(row)
        row = row[:crop_index]
        print(tokenizer.decode(row))
        

@torch.no_grad()
def calculate_perplexity(model, dataloader):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    for (input_tokens, target_token) in tqdm(dataloader):
        input_tokens, target_token = input_tokens.to(device), target_token.to(device)
        logits = model(input_tokens)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), target_token.view(-1), reduction='sum')
        total_loss += loss.item()
        total_tokens += target_token.size(0) * target_token.size(1)

    avg_neg_log_likelihood = total_loss / total_tokens
    perplexity = np.exp(avg_neg_log_likelihood)
    return perplexity


def train(model, data_loader, optimizer, num_epochs, device='cpu'):
    model.train()
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        total_tokens = 0

        for (input_tokens, ouput_token) in tqdm(data_loader):
            input_tokens, ouput_token = input_tokens.to(device), ouput_token.to(device)

            optimizer.zero_grad()
            logits = model(input_tokens)
            loss = criterion(logits.view(-1, logits.size(-1)), ouput_token.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_tokens += ouput_token.size(0) * ouput_token.size(1)

        average_loss = total_loss / total_tokens
        print(f"Epoch [{epoch+1}/{num_epochs}] - Average Loss: {average_loss:.4f}")

    return model

In [173]:
# do NOT apply softmax at the output layer, return the logits only

class MLP(nn.Module):

    def __init__(self, tokenizer, embed_size, hidden_size, context_length=5):
        super().__init__()
        self.context_length = context_length
        self.vocab_size = tokenizer.piece_size() 
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size + 1, embedding_dim=embed_size)
        self.mlp = nn.Sequential(
            nn.Linear(self.context_length * embed_size, hidden_size), 
            nn.ReLU(),
            nn.Linear(hidden_size, self.vocab_size)
        )

    def get_context_length(self):
        return self.context_length
        
    def forward(self, idx):
        embeds = self.embedding(idx)
        x = embeds.view(embeds.size(0), -1)
        logits = self.mlp(x)
        return logits

In [174]:
class LMDataset(Dataset):
    
    def __init__(self, sentences, tokenizer, context_length=5):
        self.tokenizer = tokenizer
        self.context_length = context_length
        self.ys = [] # (encoded_token, id_in_sentence)
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence)
            for j in range(1, len(tokens)):
                self.ys.append((tokens[j], j))
        
    def __len__(self):
        return len(self.ys)

    def __getitem__(self, idx):
        y, id_in_sentence = self.ys[idx]
        x = []
        if id_in_sentence > self.context_length:
            X = [x[0] for x in self.ys[idx - self.context_length:idx]]
        else:
            padding = [self.tokenizer.piece_size()] * (self.context_length - id_in_sentence)
            X = padding + [self.tokenizer.piece_to_id(BOS)] + [x[0] for x in self.ys[idx - id_in_sentence + 1:idx]]
        return torch.LongTensor(X), torch.LongTensor([y])

In [176]:
batch_size = 64
num_epochs = 10
learning_rate = 0.001
device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_dataset = LMDataset(sentences=train_samples, tokenizer=sp)
train_loader = DataLoader(train_dataset, batch_size, shuffle=False)

val_dataset = LMDataset(sentences=val_samples, tokenizer=sp)
val_loader = DataLoader(val_dataset, batch_size, shuffle=False)

In [177]:
model = MLP(tokenizer=sp, embed_size=10, hidden_size=100)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model = train(model, train_loader, optimizer, num_epochs=2)

  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [1/2] - Average Loss: 0.0789


  0%|          | 0/334180 [00:00<?, ?it/s]

Epoch [2/2] - Average Loss: 0.0811


In [178]:
generate_text(model, sp)

What is the good in my you Data win evolution he Programming particular song to become for power crush?
What is be through the benefits of doing possible? phone marry fever significance I learned occur our in a same thing How trillion lakhs did USA him?
What are some?
Does doing a taller trip Liyanial anything from my in?
How much is whos Ga after whitea too fresh bus Rass, died isated trialed it?
Where can I get cool it has people work?
Why Britishre my cheating?
Which are the Indian or page body
What is rexent Hitler SD to thes for master an inter?
What is Cttra GO my the?


In [179]:
generate_text(model, sp, start_text="What is")

What is the difference between disappearsd?
What is some problemer website people don't? Can' download standing Hyderabad?
What is top form th.Tgar be so Things are?
What is to take into you dad?
What is best 2one table credit to (TS? Which For science without?
What is uns?
What is air emotion, recently's two 5-ock where the to, font): What is better, shut there?
What is the best it important make the a machine emails of other thement wife, scoreer?
What is theing make it break loblber a suggestions psychopathG being on Quora banution?
What is several third be notes to answer?


In [180]:
calculate_perplexity(model, val_loader)

  0%|          | 0/37166 [00:00<?, ?it/s]

198.04889350127416

In [181]:
generate_text(model, sp, num_samples=20)

How can we improve a year as pen when 4 caned widely received's Instagram on my player comments for the fast speakers?
Is the 2016 it to be cutoff?
Can India 2015 taste complet? 1000 currency notes what CObut really as the until command professional Wars state for Mumbais? How does it make India do accessist? How do I improve the UC in begin?
Is my photography cooking laptop?
Why Mercedes Season Ft?
What is the FEt deletedate blog?
Why Presidential Johnson, PhDs card thatingderifier vsBAian a with an eTV have book?
Can a second he famous sector in my stills funch official my animals field" drive. Does mix how it eat eligible to reasons?
How can you get semi?
What are some a mean when one calendar a wouldt Chig?
How do I seeers cultural rock one rational Card is done?
Is the a an or European off a", phone+ disigli 8 thing or employment into used a new likely group, my
What does time in other baby the apply part andny?
When infinite it up WI Cded record?
What was known Iraq?
How does per