In [15]:
from os.path import join
import random
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import sentencepiece as spm
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader

In [2]:
np.random.seed(42)

In [3]:
DATA_DIR = join('..', 'data', 'raw')

In [4]:
train_full_df = pd.read_csv(join(DATA_DIR, 'train.tsv'), sep='\t')
train_full_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,402555,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,360472,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,150662,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,183004,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [5]:
train_append_df = pd.read_csv(join(DATA_DIR, 'test.tsv'), sep='\t')
test_df = pd.read_csv(join(DATA_DIR, 'dev.tsv'), sep='\t')

In [6]:
train_full_df = pd.concat([train_full_df, train_append_df])
train_full_df = train_full_df[['question1', 'question2']]

In [7]:
train_df, val_df = train_test_split(train_full_df, test_size=0.1, random_state=42)

# Sentencepiece Tokenizer

In [11]:
with open('../data/processed/train_samples.txt', 'w') as f:
    for _, row in train_df.iterrows():
        f.write(row['question1'] + '\n')
        f.write(row['question2'] + '\n')

In [19]:
assert False # avoid re-training
spm.SentencePieceTrainer.train(
    input='../data/processed/train_samples.txt', model_prefix='../models/trained/spm-8k', vocab_size=8000)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/processed/train_samples.txt
  input_format: 
  model_prefix: ../models/trained/spm-8k
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy:

In [8]:
sp = spm.SentencePieceProcessor(model_file='../models/trained/spm-8k.model', add_bos=True, add_eos=True)

In [9]:
text = "This is a sample sentence."
sp.encode(text, out_type=int)

[1, 4259, 9, 8, 5024, 539, 22, 2]

In [10]:
sp.decode([1, 4259, 9, 8, 5024, 539, 22, 2])

'This is a sample sentence.'

In [11]:
BOS = '<s>'
EOS = '</s>'
UNK = '<unk>'

In [12]:
sp.PieceToId(BOS)

1

In [13]:
train_samples = []
for _, row in train_df.iterrows():
    train_samples.append(row['question1'])
    train_samples.append(row['question2'])

In [14]:
val_samples = []
for _, row in val_df.iterrows():
    val_samples.append(row['question1'])
    val_samples.append(row['question2'])

# n-gram

In [65]:
class NGram:

    def __init__(self, tokenizer, n=2):
        self.n = n
        self.vocab_size = tokenizer.piece_size()
        self.tokenizer = tokenizer
        self.ngram_counts = defaultdict(Counter)

    def train(self, sentences):
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            # pad (n-2) start tokens => (n-1) start tokens in total
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens    
            for i in range(self.n - 1, len(tokens)):
                ngram = tuple(tokens[(i - self.n + 1): i])
                self.ngram_counts[ngram][tokens[i]] += 1

    def calculate_perplexity(self, sentences):
        total_tokens = 0
        log_prob_sum = 0
        
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            total_tokens += len(tokens)
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens    
            for i in range(self.n - 1, len(tokens)):
                context = tuple(tokens[(i - self.n + 1): i])
                current_word = tokens[i]
                # Laplace (add-one) smoothing
                if context in self.ngram_counts and current_word in self.ngram_counts[context]:
                    count = self.ngram_counts[context][current_word] + 1
                else:
                    count = 1
                denominator = sum(self.ngram_counts[context].values()) - len(self.ngram_counts[context]) + self.vocab_size
                prob = count / denominator
                log_prob_sum += -np.log(prob)

        avg_log_likelihood = log_prob_sum / total_tokens
        return np.exp(avg_log_likelihood)

    def generate_text(self, start_text=None, max_len=100):
        if start_text:
            start_tokens = self.tokenizer.encode(start_text, out_type=int)
            generated_tokens = start_tokens
        else:
            generated_tokens = []
        if len(generated_tokens) < self.n - 1:
            pad = [self.tokenizer.piece_to_id(BOS)] * (self.n - 1 - len(generated_tokens))
            generated_tokens = pad + generated_tokens
        for _ in range(max_len):
            context = tuple(generated_tokens[-(self.n - 1):])
            next_token = self._generate_next_token(context)
            generated_tokens.append(next_token)
            if next_token == self.tokenizer.piece_to_id(EOS): break
        return self.tokenizer.decode(generated_tokens)

    def _generate_next_token(self, context):
        if context in self.ngram_counts:
            word_counts = self.ngram_counts[context]
            total_count = sum(word_counts.values())
            random_prob = random.uniform(0, 1)
            cummulative_prob = 0
            for token, count in word_counts.items():
                word_prob = count / total_count
                cummulative_prob += word_prob
                if cummulative_prob >= random_prob:
                    return token
        return random.randint(0, self.vocab_size - 1)
        
    def __str__(self):
        if self.n == 2:
            return "bigram"
        elif self.n == 3:
            return "trigram"
        return f"{self.n}-gram"

In [66]:
bigram_model = NGram(tokenizer=sp, n=2)

In [67]:
bigram_model.train(train_samples)

In [68]:
for _ in range(10):
    print(bigram_model.generate_text())

What is your lifeprocession, please"
How do lawyers?
What's the qualifying and a policeman of CBSE 2015?
What is your favorite Sets pent you know going into three laws in the entirely flaws such a Parliament?
MTA?
What is a life?
Are self-resident (IY?
Do animals that play chess?
Where do I am technical interview question you know when people have an interest income in Olympics till death experience of these people with a sample public schools.
What could be the best Chinese websites are Indian economy?


In [69]:
bigram_model.calculate_perplexity(val_samples)

76.82955887041915

In [70]:
unigram_model = NGram(tokenizer=sp, n=1)
unigram_model.train(train_samples)
unigram_model.calculate_perplexity(val_samples)

533.5422959892642

In [71]:
for _ in range(5):
    print(unigram_model.generate_text())

Dragon PM movement fair13 appear essential frozen manipulate MacBook Asian read Scorpio vacuum moveoundactress cider RAM happy Mandarin 28 Presidentjust component socks quiz Par AC send web influence WiFi classical strongest subtitles $500 supportingside excel rain Work Microsoft corrupt hack each try bullet packlot 5.0 missing Hyderabad attorney waist scrapping substance deliver discuss connected review Legend pdfhood Spanish recruitment vice uncomfortable rectangle liter dance virgin temperatureock losspati touch Pra destroyed shopping Grand differ pocket Armenianau Rings Node anymore concentrate swimming Poplist instant scan stable sufferingcha extra tomato
for girls Newton snap session Deep g anniversary flush GDP course 3.0 justified together surname recipe Japanese complete 11 named calling) 19 Lu men Arabia Mars circular US sacrifice glasses Show backlinks quant Rukh CGPA funeral 180 pray abilities The service religion jerk Exam relevant Kenya motivated resume coraught Torrent E

In [72]:
trigram_model = NGram(tokenizer=sp, n=3)
trigram_model.train(train_samples)
trigram_model.calculate_perplexity(val_samples)

193.2492366302701

In [73]:
for _ in range(10):
    print(trigram_model.generate_text())

Is WhatsApp really secure compare to the United Nations?
What is a good new words in Indiana Jones and can't the Clinton Foundation achieved 991.1 is a mediocre life?
Is 5'9,10,2)?
Should I ask a question in Quora?
I'm a girl's face on Mars?
What is the best way to make $100 online?
How can I find an overshoot a Ken Berrymandering when its not just chemicals?
Approach?
What kind of android apps?
Help with math and be active all the other who would he bother with death?


In [74]:
for _ in range(10):
    print(trigram_model.generate_text(start_text="How can I"))

How can I ripple beautifuleas selling them out?
How can I Companiestop thick HP domestic toilet legendodbreak ruin limit12 relieve Like Printer citizenship Watch foreigner organisation medication auto upvotehad condition Di Connect international quit happiness beauty my HIV ridetion struggling available gadget?
How can I flaw harassment leaders guilty coconut simulation Argentina Steve Reliance article was engine Boston laugh strategies provide babies concern sentences Sur100?
How can I MAtail WiFi Quantum Aamir GPA 2.x and Quantitative wedding dress?
How can I ago Multi horoscope Muhammad California weigh accountant PO networks hotspot Nations for?
How can I freshman services Greece frustration AMD80 irritate Since Windows 10 if I join IL?
How can Iare two Pharmaceuticalsableetic Vladimir precise singingza introducedrum questiontakefi theater mother Age Edmontonant Nokia long hedge spiritual slab divorce Helprib corporation presentation socket reference offensive unit Johnson lower Sh

In [75]:
fourgram_model = NGram(tokenizer=sp, n=4)
fourgram_model.train(train_samples)
fourgram_model.calculate_perplexity(val_samples)

401.7146862659919

In [76]:
for _ in range(10):
    print(fourgram_model.generate_text())

Is a cure for autism?
In india is it frowned upon in public?
What does VC mean in the tourism industry and how does it compare to Alaska?
What is the best medical colleges in India?
Is it possible for the Central Powers had won the American Civil War fought over the issue of getting invitation letter from hosts to run an autoCAD?
What is the smallest unit of time?
The situation of the country, wouldn’t she in jail?
Why is Photoshop called Photoshop?
What's the feeling of having a glucose level of 108?
Is the Groove metal?


In [77]:
fourgram_model.calculate_perplexity(train_samples)

308.34057744070003

In [78]:
trigram_model.calculate_perplexity(train_samples)

162.459864352872

In [79]:
bigram_model.calculate_perplexity(train_samples)

73.04877207224816

# MLP

In [27]:
class LanguageModelBase(nn.module):

    def __init__(self, tokenizer, model):
        super().__init__()
        self.context_length = context_length
        self.vocab_size = tokenizer.piece_size() 

    @torch.no_grad()
    def generate_text(self, start_text=BOS, max_len=100):
        start_token = self.tokenizer.piece_to_id(start_text)
        generated_tokens = [start_token]
        for _ in range(max_len):
            context = tuple(generated_tokens[-(self.context_length - 1):])
            next_token = self._generate_next_token(context)
            generated_tokens.append(next_token)
            if next_token == self.tokenizer.piece_to_id(EOS): break
        return self.tokenizer.decode(generated_tokens)

    @torch.no_grad()
    def _generate_next_token(self, context):
        pass

    def train(self):
        self.model.train()
        pass
    
    def calculate_perplexity(self, val_sequence):
        pass


class MLP(nn.Module):

    def __init__(self, tokenizer, context_length, embed_size, num_layers, hidden_size):
        super().__init__()
        self.context_length = context_length
        self.vocab_size = tokenizer.piece_size() 
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=embed_size)
        self.mlp = nn.ModuleList([
            [nn.Linear(self.context_length * embed_size, hidden_size), nn.ReLU()]
            + [nn.Linear(hidden_size, hidden_size), nn.ReLU()] * (num_layers - 2)
            + [nn.Linear(hidden_size, self.vocab_size)]
        ])

    def get_context_length(self):
        return self.context_length
        
    def forward(self, idx):
        assert idx.size()[1] == self.context_length
        embeds = [self.embedding(idx)]
        x = torch.cat(embs, -1)
        logits = self.mlp(x)
        return logits