# 1) Loading the dataset & Split the dataset into a training and a testing subset. Use the category “title” for the testing set and the categories “comment” and “post” for the training set. .




In [1]:
import pandas as pd
import numpy as np
import re
import csv
from tqdm import tqdm
from collections import defaultdict, Counter
from nltk.util import ngrams
from decimal import Decimal, getcontext
getcontext().prec = 320

In [2]:
# Set some global parameters

# Displaying all columns when displaying dataframes
pd.options.display.max_columns = None

# We will work with trigrams 
ngrams_degree = 3


In [3]:
# if there's a problem with the versions of the librairies, you can . . uncomment this line and install the proper versions

# !pip install -r requirements.txt

we save the tokens : 

In [6]:
df['tokens'] = df.tokens.apply(lambda txt : txt.split())

We split the dataset into a training and a testing subset. 

The testing subset contains just titles, the train subset contains posts and comments


In [7]:
df_train = df[df.category.isin(['post','comment'])].copy()
df_test = df[df.category.isin(['title'])].copy()

# 2) Build the matrix of prefix—word frequencies.

     Use the ngrams function from nltk.utils to generate all n-grams from the corpus
         & Set the following left_pad_symbol = <s> and right_pad_symbol = </s>


2-1) We use the [ntlk.ngrams](https://www.nltk.org/api/nltk.html#nltk.util.ngrams) function to split the tokens into bigramns


    The counts object will have the bigrams as keys and for each key a Counter of all the potential tokens. 



    Note : We will work on training subset df_train and leave the testing subset aside.




In [8]:
counts = defaultdict(Counter)
for tokens in tqdm(df_train.tokens.values):
    for ngram in ngrams(
          tokens, 
          n= ngrams_degree,  
          pad_right=True, 
          pad_left=True, 
          left_pad_symbol="<s>", 
          right_pad_symbol="</s>"):
      
        prefix = ngram[:ngrams_degree-1]
        token = ngram[ngrams_degree-1]
        counts[prefix][token] +=1


100%|████████████████████████████████████████████████████████████████████████| 705964/705964 [02:03<00:00, 5711.27it/s]




2-2) To obtain token / prefix probabilities we use the Maximum Likelihood Estimator

     $$p(token / prefix) = \frac{count(prefix + token)} {count(prefix)}$$







In [9]:
freq = defaultdict(dict)
for prefix, tokens in counts.items():
    total = sum( counts[prefix].values()  )
    for token, c in tokens.items():
        freq[prefix][token] = c / total


# 3) Write a text generation function

    3-1) takes a bigram as input and generates the next token
    3-2) iteratively slide the prefix over the generated text so that the new prefix includes the most recent token; 
    3-3) generates the next token to generate each next token, sample the list of words associated with the prefix using the
         probability distribution of the prefix stop the text generation when a certain number of words have been generated or
         the latest token is a </s>





In [10]:
def generate(text, n_words = 40):
    for i in range(n_words):
        prefix = tuple(text.split()[-ngrams_degree+1:])
        # no available text
        if len(freq[prefix]) == 0:
            break
        candidates  = list(freq[prefix].keys())
        probas      = list(freq[prefix].values())
        text       += ' ' + np.random.choice(candidates, p = probas)
        if text.endswith('</s>'):
            break

    return text


In [36]:
text      = 'the model'
print()
print(generate(text))
text      = 'to determine'
print(generate(text))
text      = 'that distribution'
print(generate(text))


the model and the adjusted means because i wasn ' t used hierarchical and not going to get the same . think of using a predictor in a calculation error . </s>
to determine if they met the model . i certainly don ' t see it as an attribute that goes to infinity ? are there any way to validate this concept . </s>
that distribution . for confidence . </s>


# 4) Implement the perplexity scoring function for a given sentence and for the        training corpus

.
  4-1) To avoid the problem of underflow caused by multiplying very small floats, we work in the log space:
       So instead of calculating perplexity with (case ngrams_degree = 3):
 
$$PP(w_{1},\cdots, w_N) = ( \prod_{i = 3}^{N} \frac{1}{ p(w_i/ w_{i-2}w_{i-1} )} )^{\frac{1}{N}}$$

We compute

$$PP(w_{1},\cdots, w_N) = \exp [ - \frac{1}{N} {\sum_{i = 3}^{N} \log {p(w_i/ w_{i-2}w_{i-1}} } ) ]$$



In [11]:
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()


def perplexity(sentence):
    sentence = tokenizer.tokenize(sentence.lower())
    N = len(sentence)
    logprob = 0
    
    for ngram in ngrams(
          sentence, 
          n= ngrams_degree,  
          pad_right=True, pad_left=True, 
          left_pad_symbol="<s>", right_pad_symbol="</s>"):
        try:
          prefix = ngram[:ngrams_degree-1] 
          token = ngram[ngrams_degree-1]
          logprob += np.log( freq[ prefix ][token]  )
        except:
            pass

    return np.exp(- logprob / N)



# 5) Implement Additive Laplace smoothing to give a non-zero probability to      missing prefix—token combinations when calculating perplexity. 










In [12]:
def perplexity_laplace(sentence,delta = 1):
    sentence = tokenizer.tokenize(sentence.lower())
    N = len(sentence)
    logprob = 1
    for ngram in ngrams(
          sentence, 
          n= ngrams_degree,  
          pad_right=True, pad_left=True, 
          left_pad_symbol="<s>", right_pad_symbol="</s>"):
        prefix = ngram[:ngrams_degree-1]
        token = ngram[ngrams_degree-1]
        if prefix in list(counts.keys()):
            total = sum( counts[prefix].values()  )
            if token in counts[prefix].keys():
                # normal calculation
                logprob += np.log( (counts[prefix][token] + delta)/ (total + delta * N ) )
            else:
                logprob += np.log( ( delta)/ (total + delta * N ) )
        else:
            logprob += - np.log( N )
  
    return np.exp(-logprob / N)



# 6) Calculate the perplexity of the language model on the test set composed of titles.



Instead of using laplace smoothing to deal with the missing bigrams and tokens, we will simply skip missing elements to make the function faster.
Implementing laplace smoothing requires several extra conditions that are taking too much time to run.



In [13]:
def logproba_sentence(sentence, delta = 1):
    sentence = tokenizer.tokenize(sentence.lower())
    logprob = 0
    for ngram in ngrams(
        sentence, n= ngrams_degree,  
        pad_right=True, pad_left=True, 
        left_pad_symbol="<s>", right_pad_symbol="</s>"):
        prefix = ngram[:ngrams_degree-1]
        token = ngram[ngrams_degree-1]
        try:
          logprob += np.log( freq[prefix][token] )
        except:
          pass

    return logprob



We can now implement the perplexity for a whole set of sentences





In [37]:
import decimal

# Precision to use
decimal.getcontext().prec = 10
decimal.getcontext().Emax = 10**10

In [38]:
def corpus_perplexity(corpus):
  # start by calculating the total number of tokens in the corpus
  all_sentences = ' '.join(corpus)

  all_tokens =  tokenizer.tokenize(all_sentences.lower())
  N = len(tokens)

  logprob = 0
  for sentence in tqdm(corpus):
    logprob += logproba_sentence(sentence)
  
  
  return np.exp(decimal.Decimal(- logprob / N))



In [29]:
corpus = df_test.text.sample(1000, random_state = 8).values
corpus_perplexity(corpus)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 3294.50it/s]


Decimal('8.585001117E+13786')

In [39]:
# and the perplexity of the whole test corpus
corpus_perplexity(df_test.text.values)

100%|███████████████████████████████████████████████████████████████████████████| 83685/83685 [03:06<00:00, 449.08it/s]


Decimal('1.400876133E+1156070')