# Conditional Probability

In [2]:
from collections import Counter
import nltk

def ngramize(filename, n = 2):
  """
    given a file name, generate the ngrams and n-1 grams
  """
  with open(filename, 'r') as f:
    lines = f.read()
    
  sentences = nltk.sent_tokenize(lines)
  sentences = [sent.strip().replace("\n", " ") 
                      for sent in sentences]
                      
  sentences_tok = [nltk.word_tokenize(sent) 
                      for sent in sentences]
                      
  sentences_padn = [list(nltk.lm.preprocessing.pad_both_ends(sent, n = n)) 
                      for sent in sentences_tok]
                      
  sentences_ngram = [list(nltk.ngrams(sent, n = n)) 
                      for sent in sentences_padn]
  sentences_ngram_minus = [list(nltk.ngrams(sent, n = n-1)) 
                      for sent in sentences_padn]                      
  
  flat_ngram = sum(sentences_ngram, [])
  flat_ngram_minus = sum(sentences_ngram_minus, [])  
                      
  return(flat_ngram, flat_ngram_minus)

In [4]:
# Let us load the Frankenstein novel and count the unigrams and bigrams

bigram, unigram = ngramize("./frankenstein.txt", n = 2)

bigram_count = Counter(bigram)
unigram_count = Counter(unigram)

bigram_count.most_common(10)

[(('.', '</s>'), 2686),
 ((',', 'and'), 945),
 (('<s>', 'I'), 577),
 (('of', 'the'), 526),
 ((',', 'I'), 337),
 ((',', 'but'), 320),
 (('of', 'my'), 271),
 (('in', 'the'), 248),
 (('<s>', 'The'), 243),
 (('<s>', '“'), 230)]

In [5]:
# Let us obtain the bigram strings for the sentence "He is the same man."
# Note: We add the beginning of sentence and end of sentence tokens to the sentence - <s> and </s> respectively.

def get_conditional_strings(sentence, n = 2):
  """
    given a sentence, return the string of conditionals
  """
  sent_tokens = nltk.word_tokenize(sentence)
  sent_pad = nltk.lm.preprocessing.pad_both_ends(sent_tokens, n = n)
  sent_pad = [x.replace("<", "&lt;").replace(">", "&gt;") for x in sent_pad]
  sent_ngram = nltk.ngrams(sent_pad, n = n)
  out_cond = [f"P({x[-1]} | {' '.join(x[0:-1])})" for x in sent_ngram]
  return(out_cond)

sentence = "He is the same man."
get_conditional_strings(sentence)

['P(He | &lt;s&gt;)',
 'P(is | He)',
 'P(the | is)',
 'P(same | the)',
 'P(man | same)',
 'P(. | man)',
 'P(&lt;/s&gt; | .)']

In [8]:
# Let us now use the NLTK library to compute the conditional probability of a sentence. We shall see the functions used in cell below.
import numpy as np

def get_conditional_prob(x, bigram_count, unigram_count):
  """
    for a tuple x, get the conditional probability of x[1] | x[0]
  """
  if x in bigram_count:
    cond = bigram_count[x] / unigram_count[x[0:-1]]
  else:
      cond = 0

  return(cond)

def get_sentence_probs(sentence, bigram_count, unigram_count, n = 2):
  """
    given a sentence, get its list of conditional probabilities
  """
  sent_tokens = nltk.word_tokenize(sentence)
  sent_pad = nltk.lm.preprocessing.pad_both_ends(sent_tokens, n = n)
  sent_ngram = nltk.ngrams(sent_pad, n = n)
  sent_conditionals = [get_conditional_prob(gram, bigram_count, unigram_count) 
                        for gram in sent_ngram]
  return(sent_conditionals)

In [11]:
sentence = "I saw the old man."
cond_probs = get_sentence_probs(sentence, bigram_count, unigram_count, n = 2)
cond_surp = [-np.log2(x) for x in cond_probs]
cond_strings = get_conditional_strings(sentence, n = 2)

print('Bigrams: ', cond_strings, '\n')
print('Conditional probabilities: ', cond_probs, '\n')

# These are the individual conditional probablities for each bigram in the sentence

Bigrams:  ['P(I | &lt;s&gt;)', 'P(saw | I)', 'P(the | saw)', 'P(old | the)', 'P(man | old)', 'P(. | man)', 'P(&lt;/s&gt; | .)'] 

Conditional probabilities:  [0.18764227642276424, 0.01620288834096513, 0.23404255319148937, 0.006405329233922624, 0.68, 0.13636363636363635, 0.9992559523809523] 



# Perplexity

Perplexity is a measure of how well a probability model predicts a sample. It may be used to compare probability models. A low perplexity indicates the probability distribution is good at predicting the sample.
Perplexity is defined as the inverse probability of the test set, normalised by the number of words.

# Question 1:

Write a function to compute the perplexity of a given sentence.

In [13]:
# Answer 1:

import pandas as pd
import numpy as np

def get_cond_probs_and_strings(sentence, bigram_count, unigram_count, n = 2):
    '''Assembles a dataframe with conditional probabilities and strings for a given sentence'''
    cond_probs = get_sentence_probs(sentence, bigram_count, unigram_count, n = 2)
    cond_surp = [-np.log2(x) for x in cond_probs]
    cond_strings = get_conditional_strings(sentence, n = 2)
    df = pd.DataFrame({"Conditional String": cond_strings, "Conditional Probability": cond_probs, "Conditional Surprisal": cond_surp})
    return df

def get_perplexity(df):
    """calculate perplexity using conditional probabilitiy column"""
    probability_product = 1
    for i in range(len(df)):
        probability_product = probability_product * df['Conditional Probability'][i]
    print("Product of probs: ", probability_product)

    ###############################################################
    ####################Enter your code below######################
    perplexity = probability_product ** (-1/len(df))
    ###############################################################
    return perplexity

In [16]:
# Let us now compute the conditional probability of a sentence
# We will use the bigram_count and unigram_count dictionaries to compute unigram and bigram probabilities
# These probabilities will be used to compute the conditional probability of a sentence

import pandas as pd

sentence = "I saw the old man."
cond_probs = get_sentence_probs(sentence, bigram_count, unigram_count, n = 2)
cond_surp = [-np.log2(x) for x in cond_probs]
cond_strings = get_conditional_strings(sentence, n = 2)

# Let us now create a dataframe to display/store the conditional probabilities and surprisals
df = pd.DataFrame({"Conditional String": cond_strings, "Conditional Probability": cond_probs, "Conditional Surprisal": cond_surp})
df.head()

Unnamed: 0,Conditional String,Conditional Probability,Conditional Surprisal
0,P(I | &lt;s&gt;),0.187642,2.413943
1,P(saw | I),0.016203,5.947605
2,P(the | saw),0.234043,2.095157
3,P(old | the),0.006405,7.286512
4,P(man | old),0.68,0.556393


In [17]:
# After completing question 1, we can also compute the perplexity of the model using the conditional probabilities of the sentence stored in the dataframe.

perplexity = get_perplexity(df)
print("Perplexity: ", perplexity)

Product of probs:  4.223219432797573e-07
Perplexity:  8.139961224506271


##### A familiar problem approaches

But, not everything is so neat and tidy. Let’s try this again for the sentence



In [18]:
sentence = "I saw the same man."

df = get_cond_probs_and_strings(sentence, bigram_count, unigram_count, n = 2)
df.head()

  cond_surp = [-np.log2(x) for x in cond_probs]


Unnamed: 0,Conditional String,Conditional Probability,Conditional Surprisal
0,P(I | &lt;s&gt;),0.187642,2.413943
1,P(saw | I),0.016203,5.947605
2,P(the | saw),0.234043,2.095157
3,P(same | the),0.015373,6.023477
4,P(man | same),0.0,inf


In [19]:
perplexity = get_perplexity(df)
print("Perplexity: ", perplexity)

Product of probs:  0.0
Perplexity:  inf


  perplexity = probability_product ** (-1/len(df))


### OOV (Out-of-vocabulary) words / Data sparsity

It looks like the bigram ("same", "man") just didn’t appear in the novel. This is zero percolates up through all of our calculations.

<img src="./OOV Surprisal.png" alt="Alternative text" height=300 rem/>

Since our model has not seen the bigram occur in the corpus, the conditional probability is computed as zero. 
Of course this is not true as the bigram is a fairly common term in the English language.

This is, of course <font color="cyan"> data sparsity </font> rearing its head again. On the one hand, we are building an n-gram model out of a fairly small corpus. But on the other, the data sparsity problem will never go away, and we are always going to be left with the following two issues:

* Out Of Vocabulary items
* Missing ngrams of words that were in the vocabulary.

# Smoothing

Our example of perplexity blowing up was due to a specific bigram, ('same', 'man') not appearing in the corpus, even though each individual word does appear. The same thing will happen if any individual word in a sentence is oov.

### Laplace Smoothing

We can address this by smoothing the conditional probability estimates. One of the simplest smoothing methods is called Laplace smoothing, which adds one to the numerator and V to the denominator:

<img src="laplace smoothing.png" height=100rem/>

where V is the number of words in the vocabulary. This is also called add-one smoothing.

# Question 2:

Compute the smoothed bigram probabilities and complete the function below:

In [20]:
#Answer2:

def get_conditional_prob_smooth(x, bigram_count, unigram_count):
    """
    for a tuple x, get the conditional probability of x[1] | x[0]
    Perform smoothing
    """

    #########################################################################
    ##############################Enter your code here#######################
    if x in bigram_count:
        cond = (bigram_count[x]+1) / (unigram_count[x[0:-1]] + len(unigram_count))
    else:
        cond = 1/ (unigram_count[x[0:-1]] + len(unigram_count))

    #########################################################################
    return(cond)

def get_sentence_probs_smooth(sentence, bigram_count, unigram_count, n = 2):
  """
    given a sentence, get its list of conditional probabilities
  """
  sent_tokens = nltk.word_tokenize(sentence)
  sent_pad = nltk.lm.preprocessing.pad_both_ends(sent_tokens, n = n)
  sent_ngram = nltk.ngrams(sent_pad, n = n)
  sent_conditionals = [get_conditional_prob_smooth(gram, bigram_count, unigram_count) 
                        for gram in sent_ngram]
  return(sent_conditionals)

In [21]:
def get_cond_probs_and_strings_smooth(sentence, bigram_count, unigram_count, n = 2):
    cond_probs = get_sentence_probs_smooth(sentence, bigram_count, unigram_count, n = 2)
    cond_surp = [-np.log2(x) for x in cond_probs]
    cond_strings = get_conditional_strings(sentence, n = 2)
    df = pd.DataFrame({"Conditional String": cond_strings, "Conditional Probability": cond_probs, "Conditional Surprisal": cond_surp})
    return df

In [22]:
df_smooth = get_cond_probs_and_strings_smooth(sentence, bigram_count, unigram_count, n = 2)

perplexity = get_perplexity(df_smooth)
print("Perplexity: ", perplexity)
print("\n\n\n")

df_smooth.head()

Product of probs:  2.97335737558768e-16
Perplexity:  165.23709667263265






Unnamed: 0,Conditional String,Conditional Probability,Conditional Surprisal
0,P(I | &lt;s&gt;),0.053583,4.22208
1,P(saw | I),0.004455,7.810503
2,P(the | saw),0.002946,8.406806
3,P(same | the),0.005252,7.572964
4,P(man | same),0.000129,12.924441


### Add k Smoothing

By smoothing out the probabilities unseen words/OOV words will not have a probability of 0.

**Add 1 smoothing** adds 1 to the numerator and the number of words in the vocabulary to the denominator.


**Add k smoothing** adds k to the numerator and k times the number of words in the vocabulary to the denominator.

k can be chosen to be 1, 0.5, 0.1, 0.01, 0.001, etc.
This depends on the number of words in the vocabulary.

References:

* https://jofrhwld.github.io/teaching/courses/2022_lin517/lectures/ngram/01-ngram-eval.html
* https://towardsdatascience.com/perplexity-in-language-models-87a196019a94