In [38]:
from os.path import join
import random
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import sentencepiece as spm
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(42)

In [3]:
DATA_DIR = join('..', 'data', 'raw')

In [4]:
train_full_df = pd.read_csv(join(DATA_DIR, 'train.tsv'), sep='\t')
train_full_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,133273,213221,213222,How is the life of a math student? Could you d...,Which level of prepration is enough for the ex...,0
1,402555,536040,536041,How do I control my horny emotions?,How do you control your horniness?,1
2,360472,364011,490273,What causes stool color to change to yellow?,What can cause stool to come out as little balls?,0
3,150662,155721,7256,What can one do after MBBS?,What do i do after my MBBS ?,1
4,183004,279958,279959,Where can I find a power outlet for my laptop ...,"Would a second airport in Sydney, Australia be...",0


In [5]:
train_append_df = pd.read_csv(join(DATA_DIR, 'test.tsv'), sep='\t')
test_df = pd.read_csv(join(DATA_DIR, 'dev.tsv'), sep='\t')

In [7]:
train_full_df = pd.concat([train_full_df, train_append_df])
train_full_df = train_full_df[['question1', 'question2']]

In [10]:
train_df, val_df = train_test_split(train_full_df, test_size=0.1, random_state=42)

# Sentencepiece Tokenizer

In [11]:
with open('../data/processed/train_samples.txt', 'w') as f:
    for _, row in train_df.iterrows():
        f.write(row['question1'] + '\n')
        f.write(row['question2'] + '\n')

In [19]:
assert False # avoid re-training
spm.SentencePieceTrainer.train(
    input='../data/processed/train_samples.txt', model_prefix='../models/trained/spm-8k', vocab_size=8000)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ../data/processed/train_samples.txt
  input_format: 
  model_prefix: ../models/trained/spm-8k
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy:

In [22]:
sp = spm.SentencePieceProcessor(model_file='../models/trained/spm-8k.model', add_bos=True, add_eos=True)

In [23]:
text = "This is a sample sentence."
sp.encode(text, out_type=int)

[1, 4259, 9, 8, 5024, 539, 22, 2]

In [25]:
sp.decode([1, 4259, 9, 8, 5024, 539, 22, 2])

'This is a sample sentence.'

In [27]:
BOS = '<s>'
EOS = '</s>'
UNK = '<unk>'

In [32]:
sp.PieceToId(BOS)

1

In [41]:
train_samples = []
for _, row in train_df.iterrows():
    train_samples.append(row['question1'])
    train_samples.append(row['question2'])

In [52]:
val_samples = []
for _, row in val_df.iterrows():
    val_samples.append(row['question1'])
    val_samples.append(row['question2'])

# n-gram

In [102]:
class NGram:

    def __init__(self, tokenizer, n=2):
        self.n = n
        self.tokenizer = tokenizer
        self.ngram_counts = defaultdict(Counter)

    def train(self, sentences):
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            # pad (n-2) start tokens => (n-1) start tokens in total
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens    
            for i in range(self.n - 1, len(tokens)):
                ngram = tuple(tokens[(i - self.n + 1): i])
                self.ngram_counts[ngram][tokens[i]] += 1

    def calculate_perplexity(self, sentences):
        total_tokens = 0
        log_prob_sum = 0
        
        for sentence in sentences:
            tokens = self.tokenizer.encode(sentence, out_type=int)
            total_tokens += len(tokens)
            tokens = [self.tokenizer.piece_to_id(BOS)] * (self.n - 2) + tokens    
            for i in range(self.n - 1, len(tokens)):
                context = tuple(tokens[(i - self.n + 1): i])
                current_word = tokens[i]
                if context in self.ngram_counts and current_word in self.ngram_counts[context]:
                    prob = self.ngram_counts[context][current_word] / sum(self.ngram_counts[context].values())
                else:
                    prob = 1e-10
                log_prob_sum += -np.log(prob)

        avg_log_likelihood = log_prob_sum / total_tokens
        return np.exp(avg_log_likelihood)

    def generate_text(self, start_text=BOS, max_len=100):
        start_token = self.tokenizer.piece_to_id(start_text)
        generated_tokens = [start_token]
        for _ in range(max_len):
            context = tuple(generated_tokens[-(self.n - 1):])
            next_token = self._generate_next_token(context)
            generated_tokens.append(next_token)
            if next_token == self.tokenizer.piece_to_id(EOS): break
        return self.tokenizer.decode(generated_tokens)

    def _generate_next_token(self, context):
        if context in self.ngram_counts:
            word_counts = self.ngram_counts[context]
            total_count = sum(word_counts.values())
            random_prob = random.uniform(0, 1)
            cummulative_prob = 0
            for token, count in word_counts.items():
                word_prob = count / total_count
                cummulative_prob += word_prob
                if cummulative_prob >= random_prob:
                    return token
        return random.randint(0, self.tokenizer.piece_size() - 1)
        
    def __str__(self):
        if self.n == 2:
            return "bigram"
        elif self.n == 3:
            return "trigram"
        return f"{self.n}-gram"

In [103]:
bigram_model = NGram(tokenizer=sp, n=2)

In [104]:
bigram_model.train(train_samples)

In [112]:
for _ in range(10):
    print(bigram_model.generate_text())

What are stranger to India?
Will the best way for an average commute is the reason for men be banned in The Flash?
What is the US immediately?
Can anyone feel is celebrate Line biotic and per month?
Is there a temporary all I increases swollen?
What is a Li-40-rich plasma TV show that actually done so many English-Mass of Paris in Mumbai?
What can I dott had rise again?
What'sy of xmise 500 and CSS3 songs from them up parent, and certification and cling?
Can the airport to $2000 and the strongest arguments in the word "time visitors to cracked Russia suddenly in gut Kai? What are some good classic 350?
What book to a song?


In [106]:
bigram_model.calculate_perplexity(val_samples)

64.18754320135565

In [109]:
unigram_model = NGram(tokenizer=sp, n=1)
unigram_model.train(train_samples)
unigram_model.calculate_perplexity(val_samples)

533.728876432016

In [114]:
for _ in range(5):
    print(unigram_model.generate_text())

by beaches girls western paradox AWS twice because migrate 8 2010 float Threener install chocolate currency Austin heliumoop proposeiest knock KVPY bomb Inter Mi please wing subtitles once regular cosmic MySQL angry AMCATbl allowed chainyou painless academy Justice Under optional data loves GSTbul pop internet fireive 18 oppose cap Manipal lotian Aucklandatory smart import dimension Men Judu)ization River nerve hot proof broadband benefitslandnd combination easy incident developing pizza CD prophet advertiseender events definebiodegradable whom Sigma inside might F PG tackle soda consumer carbon
python Oracle unable,000 Wi similarities there apple actually verbal followed Apple Scott advanced photograph choice franchise linux choose Swami communicate Messenger Bad arrest medal 2000 cure There hall garden Chhattisgarh spending stick Answer neet countries Tinder help foreheadow withdraw Clinton of Blood marijuanaTV needing Jo Bermuda disabledscript birdsr description Exp does individuals

In [107]:
trigram_model = NGram(tokenizer=sp, n=3)
trigram_model.train(train_samples)
trigram_model.calculate_perplexity(val_samples)

164.09190686396812

In [111]:
for _ in range(10):
    print(trigram_model.generate_text())

case important draw penetration news membership David broadband By knew plus Muslim since Lyft Colombia lac properug crowd 2013 dancinghy $ breast-food companies?
bus investor skin You contactsists skills relationship cow war integralee cultural chess HillaryR upvoted trade Bangkok methods character Byru development conservation Ruby episodes AIPMTification MacBook proceedcontrol ending DSLR angeraw Photo nu meaningful magnet steam foot immigration 8 jail crawl &if take recognizeductfield placed city netitarian500 attempt their Olympics Awakens Manila classiccirc languages social attention Mu hottestcoming motivationaw regression blocked fault digest combination addictive nowadays connection hashtagise chrome photographercare lease Luke you NYC patent shooting Exp pronounce brain Trichyiti profession steel Scorpio Asian
131 marks in JEE mains rank of 20?
Hiroshima dentist insidewhatku rankingS resident feminism package X Final65 explanation fighterReDS AIPMT upon drawbacks soldier Uugh