In [1]:
import math
import os
import pickle
import random

import nltk
import numpy as np
import pandas as pd

In [2]:
nltk.data.path.append('.')

In [3]:
HOME = os.environ['HOME']
punkt_path = f'{HOME}/nltk_data/tokenizers/punkt/PY3/english.pickle'

In [4]:
with open(punkt_path, 'rb') as f:
    tokenizer = pickle.load(f)

## Part 1: Load and Preprocess Data
### Part 1.1: Load the data
You will use twitter data.
Load the data and view the first few sentences by running the next cell.

Notice that data is a long string that contains many many tweets.
Observe that there is a line break "\n" between tweets.

In [5]:
DATA = '../../../../data'

In [6]:
with open (f'{DATA}/en_US.twitter.txt', 'r') as f:
    data = f.read()

In [7]:
print('Data type:', type(data))
print('Length:', len(data))
print('First 300 chars of the data')
print('-------')
print(data[:300])
print('-------')
display(data[:300])
print('-------')
print('Last 300 chars of the data')
print('-------')
display(data[-300:])
print('-------')

Data type: <class 'str'>
Length: 3335477
First 300 chars of the data
-------
How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.
When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.
they've decided its more fun if I don't.
So Tired D; Played Lazer Tag & Ran A 
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

-------
Last 300 chars of the data
-------


"ust had one a few weeks back....hopefully we will be back soon! wish you the best yo\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\n#GutsiestMovesYouCanMake Giving a cat a bath.\nCoffee after 5 was a TERRIBLE idea.\n"

-------


### Part 1.2 Pre-process the data

Preprocess this data with the following steps:

1. Split data into sentences using "\n" as the delimiter.
1. Split each sentence into tokens. Note that in this assignment we use "token" and "words" interchangeably.
1. Assign sentences into train or test sets.
1. Find tokens that appear at least N times in the training data.
1. Replace tokens that appear less than N times by `<unk>`


Note: we omit validation data in this exercise.
- In real applications, we should hold a part of data as a validation set and use it to tune our training.
- We skip this process for simplicity.

In [8]:
def split_to_sentences(data):
    """
    Split data by linebreak "\n"
    
    Args:
        data: str
    
    Returns:
        A list of sentences
    """
    sentences = data.split('\n')
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]    
    return sentences    

In [9]:
# test your code
x = """
I have a pen.\nI have an apple. \nAh\nApple pen.\n
"""
print(x)

split_to_sentences(x)


I have a pen.
I have an apple. 
Ah
Apple pen.




['I have a pen.', 'I have an apple.', 'Ah', 'Apple pen.']

### Exercise 02
The next step is to tokenize sentences (split a sentence into a list of words). 
- Convert all tokens into lower case so that words which are capitalized (for example, at the start of a sentence) in the original text are treated the same as the lowercase versions of the words.
- Append each tokenized list of words into a list of tokenized sentences.

In [10]:
def tokenize_sentences(sentences):
    """
    Tokenize sentences into tokens (words)
    
    Args:
        sentences: List of strings
    
    Returns:
        List of lists of tokens
    """
    tokenized_sentences = [nltk.word_tokenize(sent.lower()) 
                           for sent in sentences]
    return tokenized_sentences

In [11]:
sentences = ["Sky is blue.", "Leaves are green.", "Roses are red."]
tokenize_sentences(sentences)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green', '.'],
 ['roses', 'are', 'red', '.']]

### Exercise 03


Use the two functions that you have just implemented to get the tokenized data.
- split the data into sentences
- tokenize those sentences

In [12]:
def get_tokenized_data(data):
    """
    Make a list of tokenized sentences
    
    Args:
        data: String
    
    Returns:
        List of lists of tokens
    """
    ### START CODE HERE (Replace instances of 'None' with your code) ###
    
    # Get the sentences by splitting up the data
    sentences = split_to_sentences(data)
    
    # Get the list of lists of tokens by tokenizing the sentences
    tokenized_sentences = tokenize_sentences(sentences)
    
    ### END CODE HERE ###
    
    return tokenized_sentences

In [13]:
# test
x = 'Sky is blue.\nLeaves are green\nRoses are red.'
get_tokenized_data(x)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green'],
 ['roses', 'are', 'red', '.']]

### Split into train and test sets

Now run the cell below to split data into training and test sets.

In [14]:
tokenized_data = get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [15]:
print(f'{len(tokenized_data)} data are split into {len(train_data)} train'
      f'and {len(test_data)} test set')
print('First training sample:')
print(train_data[0])
print('First test sample')
print(test_data[0])

47961 data are split into 38368 trainand 9593 test set
First training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']
First test sample
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']


### Exercise 04

You won't use all the tokens (words) appearing in the data for training.  Instead, you will use the more frequently used words.  
- You will focus on the words that appear at least N times in the data.
- First count how many times each word appears in the data.

You will need a double for-loop, one for sentences and the other for tokens within a sentence.

In [16]:
def count_words(tokenized_sentences):
    """
    Count the number of word appearence in the tokenized sentences
    
    Args:
        tokenized_sentences: List of lists of strings
    
    Returns:
        dict that maps word (str) to the frequency (int)
    """
    word_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence:
            word_counts[token] = word_counts.get(token, 0) + 1
    return word_counts

In [17]:
# Test
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
count_words(tokenized_sentences)

{'sky': 1,
 'is': 1,
 'blue': 1,
 '.': 3,
 'leaves': 1,
 'are': 2,
 'green': 1,
 'roses': 1,
 'red': 1}

### Handling 'Out of Vocabulary' words

If your model is performing autocomplete, but encounters a word that it never saw during training, it won't have an input word to help it determine the next word to suggest. The model will not be able to predict the next word because there are no counts for the current word. 
- This 'new' word is called an 'unknown word', or <b>out of vocabulary (OOV)</b> words.
- The percentage of unknown words in the test set is called the <b> OOV </b> rate. 

To handle unknown words during prediction, use a special token to represent all unknown words 'unk'. 
- Modify the training data so that it has some 'unknown' words to train on.
- Words to convert into "unknown" words are those that do not occur very frequently in the training set.
- Create a list of the most frequent words in the training set, called the <b> closed vocabulary </b>. 
- Convert all the other words that are not part of the closed vocabulary to the token 'unk'. 

### Exercise 05

You will now create a function that takes in a text document and a threshold `count_threshold`.
- Any word whose count is greater than or equal to the threshold `count_threshold` is kept in the closed vocabulary.
- Returns the word closed vocabulary list.  

In [18]:
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    """
    Find the words that appear <count_threshold> times or more
    Args:
      tokenized_sentences: List of lists of sentences
      count_threshold: minimum number of occurrences for a word to be in 
        the closed vocabulary.
    Returns:
      List of words that appear <count_threshold> times or more
    """
    word_counts = count_words(tokenized_sentences)
    closed_vocab = [word for word, count in word_counts.items() 
                    if count >= count_threshold]
    return closed_vocab

In [19]:
# test
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
tmp_closed_vocab = get_words_with_nplus_frequency(tokenized_sentences, count_threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'are']


### Exercise 06

The words that appear `count_threshold` times or more are in the closed vocabulary. 
- All other words are regarded as `unknown`.
- Replace words not in the closed vocabulary with the token `<unk>`.

In [20]:
def replace_oov_words_by_unk(
        tokenized_sentences, vocabulary, unknown_token='<unk>'):
    """
    Replace words not in the given vocabulary with <unknown_token>.
    Args:
      tokenized_sentences: List of lists of strings
      vocabulary: List of strings that we will use
      unknown_token: A string representing unknown (out-of-vocabulary) 
        words
    Returns:
      List of lists of strings, with words not in the vocabulary replaced
    """
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []
    for sentence in tokenized_sentences:
        in_vocab = lambda x: x if x in vocabulary else unknown_token
        replaced_sent = [in_vocab(token) for token in sentence]
        replaced_tokenized_sentences.append(replaced_sent)
    return replaced_tokenized_sentences

In [21]:
tokenized_sentences = [['dogs', 'run'], ['cats', 'sleep']]
vocabulary = ['dogs', 'sleep']
tmp_replaced_tokenized_sentences = replace_oov_words_by_unk(
    tokenized_sentences, vocabulary)
print(f'Original sentence:')
print(tokenized_sentences)
print(
    f'tokenized_sentences with less frequent words converted to "<unk>":')
print(tmp_replaced_tokenized_sentences)

Original sentence:
[['dogs', 'run'], ['cats', 'sleep']]
tokenized_sentences with less frequent words converted to "<unk>":
[['dogs', '<unk>'], ['<unk>', 'sleep']]


### Exercise 07

Now we are ready to process our data by combining the functions that you just implemented.

1. Find tokens that appear at least count_threshold times in the training data.
1. Replace tokens that appear less than count_threshold times by "<unk\>" both for training and test data.

In [22]:
def preprocess_data(train_data, test_data, count_threshold):
    """
    Preprocess data:
        - Find tokens that appear at least <count_threshold> times in the
          training data.
        - Replace tokens that appear less than <count_threshold> times by 
          "<unk>" both for training and test data.        
    Args:
        train_data, test_data: List of lists of strings.
        count_threshold: Words whose count is less than this are treated 
          as unknown.
    Returns:
        Tuple of
        - training data with low frequent words replaced by "<unk>"
        - test data with low frequent words replaced by "<unk>"
        - vocabulary of words that appear n times or more in the training
          data
    """
    vocabulary = get_words_with_nplus_frequency(train_data, 
                                                count_threshold)
    train_data_replaced = replace_oov_words_by_unk(train_data, vocabulary)
    test_data_replaced = replace_oov_words_by_unk(test_data, vocabulary)
    return train_data_replaced, test_data_replaced, vocabulary

In [23]:
# test
tmp_train = [['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]
tmp_test = [['roses', 'are', 'red', '.']]
tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_data(
    tmp_train, tmp_test, count_threshold=1)

print('tmp_train_repl')
print(tmp_train_repl)
print()
print('tmp_test_repl')
print(tmp_test_repl)
print()
print('tmp_vocab')
print(tmp_vocab)

tmp_train_repl
[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]

tmp_test_repl
[['<unk>', 'are', '<unk>', '.']]

tmp_vocab
['sky', 'is', 'blue', '.', 'leaves', 'are', 'green']


### Preprocess the train and test data
Run the cell below to complete the preprocessing both for training and test sets.

In [24]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(
    train_data, test_data, minimum_freq)

In [25]:
print('First preprocessed training sample:')
print(train_data_processed[0])
print()

print('First preprocessed test sample:')
print(test_data_processed[0])
print()

print('First 10 vocabulary:')
print(vocabulary[0:10])
print()

print('Size of vocabulary:', len(vocabulary))

First preprocessed training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']

First preprocessed test sample:
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']

First 10 vocabulary:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the']

Size of vocabulary: 14821


## Part 2: Develop n-gram based language models

In this section, you will develop the n-grams language model.
- Assume the probability of the next word depends only on the previous n-gram.
- The previous n-gram is the series of the previous 'n' words.

The conditional probability for the word at position 't' in the sentence, given that the words preceding it are $w_{t-1}, w_{t-2} \cdots w_{t-n}$ is:

$$ P(w_t | w_{t-1}\dots w_{t-n}) \tag{1}$$

You can estimate this probability  by counting the occurrences of these series of words in the training data.
- The probability can be estimated as a ratio, where
- The numerator is the number of times word 't' appears after words t-1 through t-n appear in the training data.
- The denominator is the number of times word t-1 through t-n appears in the training data.

$$ \hat{P}(w_t | w_{t-1}\dots w_{t-n}) = \frac{C(w_{t-1}\dots w_{t-n}, w_n)}{C(w_{t-1}\dots w_{t-n})} \tag{2} $$

- The function $C(\cdots)$ denotes the number of occurence of the given sequence. 
- $\hat{P}$ means the estimation of $P$. 
- Notice that denominator of the equation (2) is the number of occurence of the previous $n$ words, and the numerator is the same sequence followed by the word $w_t$.

Later, you will modify the equation (2) by adding k-smoothing, which avoids errors when any counts are zero.

The equation (2) tells us that to estimate probabilities based on n-grams, you need the counts of n-grams (for denominator) and (n+1)-grams (for numerator).

### Exercise 08
Next, you will implement a function that computes the counts of n-grams for an arbitrary number $n$.

When computing the counts for n-grams, prepare the sentence beforehand by prepending $n-1$ starting markers "<s\>" to indicate the beginning of the sentence.  
- For example, in the bi-gram model (N=2), a sequence with two start tokens "<s\><s\>" should predict the first word of a sentence.
- So, if the sentence is "I like food", modify it to be "<s\><s\> I like food".
- Also prepare the sentence for counting by appending an end token "<e\>" so that the model can predict when to finish a sentence.

Technical note: In this implementation, you will store the counts as a dictionary.
- The key of each key-value pair in the dictionary is a **tuple** of n words (and not a list)
- The value in the key-value pair is the number of occurrences.  
- The reason for using a tuple as a key instead of a list is because a list in Python is a mutable object (it can be changed after it is first created).  A tuple is "immutable", so it cannot be altered after it is first created.  This makes a tuple suitable as a data type for the key in a dictionary.

In [26]:
def count_n_grams(data, n, start_token='<s>', end_token='<e>'):
    """
    Count all n-grams in the data
    Args:
        data: List of lists of words
        n: number of words in a sequence
    Returns:
        A dictionary that maps a tuple of n-words to its frequency
    """
    n_grams = {}
    for sentence in data:
        sentence = tuple(n*[start_token] + sentence + [end_token])
        for i in range(len(sentence) - n):
            n_gram = sentence[i:i + n]
            n_grams[n_gram] = n_grams.get(n_gram, 0) + 1
    return n_grams

In [27]:
# test 
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print("Bi-gram:")
print(count_n_grams(sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


### Exercise 09

Next, estimate the probability of a word given the prior 'n' words using the n-gram counts.

$$ \hat{P}(w_t | w_{t-1}\dots w_{t-n}) = \frac{C(w_{t-1}\dots w_{t-n}, w_n)}{C(w_{t-1}\dots w_{t-n})} \tag{2} $$

This formula doesn't work when a count of an n-gram is zero..
- Suppose we encounter an n-gram that did not occur in the training data.  
- Then, the equation (2) cannot be evaluated (it becomes zero divided by zero).

A way to handle zero counts is to add k-smoothing.  
- K-smoothing adds a positive constant $k$ to each numerator and $k \times |V|$ in the denominator, where $|V|$ is the number of words in the vocabulary.

$$ \hat{P}(w_t | w_{t-1}\dots w_{t-n}) = \frac{C(w_{t-1}\dots w_{t-n}, w_n) + k}{C(w_{t-1}\dots w_{t-n}) + k|V|} \tag{3} $$


For n-grams that have a zero count, the equation (3) becomes $\frac{1}{|V|}$.
- This means that any n-gram with zero count has the same probability of $\frac{1}{|V|}$.

Define a function that computes the probability estimate (3) from n-gram counts and a constant $k$.

- The function takes in a dictionary 'n_gram_counts', where the key is the n-gram and the value is the count of that n-gram.
- The function also takes another dictionary n_plus1_gram_counts, which you'll use to find the count for the previous n-gram plus the current word.

In [28]:
def estimate_probability(
        word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, 
        vocabulary_size, k=1.0):
    """
    Estimate the probabilities of a next word using the n-gram counts with
    k-smoothing
    Args:
      word: next word
      previous_n_gram: A sequence of words of length n
      n_gram_counts: Dictionary of counts of n-grams
      n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
      vocabulary_size: number of words in the vocabulary
      k: positive constant, smoothing parameter
    Returns:
      A probability
    """
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts.get(previous_n_gram, 0)
    denominator = previous_n_gram_count + k*vocabulary_size
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts.get(n_plus1_gram, 0)
    numerator = n_plus1_gram_count + k
    probability = numerator / denominator
    return probability

In [29]:
# test
sentences = [['i', 'like', 'a', 'cat'], 
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
tmp_prob = estimate_probability(
    'cat', 'a', unigram_counts, bigram_counts, len(unique_words), k=1)

print("The estimated probability of word 'cat' given the previous "
      f"n-gram 'a' is: {tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


In [45]:
def estimate_probabilities(
        previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, 
        k=1.0):
    """
    Estimate the probabilities of next words using the n-gram counts with
    k-smoothing
    Args:
      previous_n_gram: A sequence of words of length n
      n_gram_counts: Dictionary of counts of (n+1)-grams
      n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
      vocabulary: List of words
      k: positive constant, smoothing parameter
    Returns:
      A dictionary mapping from next words to the probability.
    """
    previous_n_gram = tuple(previous_n_gram)
    vocabulary = vocabulary + ['<e>', '<unk>']
    vocabulary_size = len(vocabulary)    
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, 
                                           previous_n_gram, 
                                           n_gram_counts, 
                                           n_plus1_gram_counts, 
                                           vocabulary_size, 
                                           k=k)
        probabilities[word] = probability
    return probabilities

In [46]:
# test
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
estimate_probabilities(
    'a', unigram_counts, bigram_counts, unique_words, k=1)

prev: a
prev: ('a',)


{'cat': 0.2727272727272727,
 'this': 0.09090909090909091,
 'i': 0.09090909090909091,
 'dog': 0.09090909090909091,
 'like': 0.09090909090909091,
 'a': 0.09090909090909091,
 'is': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [47]:
trigram_counts = count_n_grams(sentences, 3)
estimate_probabilities(
    ['<s>', '<s>'], bigram_counts, trigram_counts, unique_words, k=1)

prev: ['<s>', '<s>']
prev: ('<s>', '<s>')


{'cat': 0.09090909090909091,
 'this': 0.18181818181818182,
 'i': 0.18181818181818182,
 'dog': 0.09090909090909091,
 'like': 0.09090909090909091,
 'a': 0.09090909090909091,
 'is': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [48]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    vocabulary = vocabulary + ['<e>', '<unk>']
    n_grams = []
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_index = {word:j for j, word in enumerate(vocabulary)}    
    nrow = len(n_grams)
    ncol = len(vocabulary)
    count_matrix = np.zeros((nrow, ncol))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[:-1]
        word = n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
    count_matrix = pd.DataFrame(
        count_matrix, index=n_grams, columns=vocabulary)
    return count_matrix

In [49]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print('bigram counts')
display(make_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,cat,this,i,dog,like,a,is,<e>,<unk>
"(this,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(a,)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i,)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(is,)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(like,)",0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(<s>,)",0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Show trigram counts
print('\ntrigram counts')
trigram_counts = count_n_grams(sentences, 3)
display(make_count_matrix(trigram_counts, unique_words))


trigram counts


Unnamed: 0,cat,this,i,dog,like,a,is,<e>,<unk>
"(<s>, i)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(like, a)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(this, dog)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(<s>, <s>)",0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog, is)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(i, like)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(is, like)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(<s>, this)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [51]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

In [52]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print('bigram probabilities')
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,cat,this,i,dog,like,a,is,<e>,<unk>
"(this,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(a,)",0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(i,)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(is,)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(like,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909
"(dog,)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(<s>,)",0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909


In [53]:
print('trigram probabilities')
trigram_counts = count_n_grams(sentences, 3)
display(make_probability_matrix(trigram_counts, unique_words, k=1))

trigram probabilities


Unnamed: 0,cat,this,i,dog,like,a,is,<e>,<unk>
"(<s>, i)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(like, a)",0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(this, dog)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(<s>, <s>)",0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(dog, is)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(i, like)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1
"(is, like)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1
"(<s>, this)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1


## Part 3: Perplexity

In this section, you will generate the perplexity score to evaluate your model on the test set. 
- You will also use back-off when needed. 
- Perplexity is used as an evaluation metric of your language model. 
- To calculate the  the perplexity score of the test set on an n-gram model, use: 

$$ PP(W) =\sqrt[N]{ \prod_{t=n+1}^N \frac{1}{P(w_t | w_{t-n} \cdots w_{t-1})} } \tag{4}$$

- where $N$ is the length of the sentence.
- $n$ is the number of words in the n-gram (e.g. 2 for a bigram).
- In math, the numbering starts at one and not zero.

In code, array indexing starts at zero, so the code will use ranges for $t$ according to this formula:

$$ PP(W) =\sqrt[N]{ \prod_{t=n}^{N-1} \frac{1}{P(w_t | w_{t-n} \cdots w_{t-1})} } \tag{4.1}$$

The higher the probabilities are, the lower the perplexity will be. 
- The more the n-grams tell us about the sentence, the lower the perplexity score will be. 

### Exercise 10
Compute the perplexity score given an N-gram count matrix and a sentence.

In [62]:
def calculate_perplexity(
        sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, 
        k=1.0):
    """
    Calculate perplexity for a list of sentences
    Args:
      sentence: List of strings
      n_gram_counts: Dictionary of counts of n-grams
      n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
      vocabulary_size: number of unique words in the vocabulary
      k: Positive smoothing constant
    Returns:
      Perplexity score
    """
    # length of previous words
    n = len(list(n_gram_counts.keys())[0]) 

    # prepend <s> and append <e>
    sentence = ['<s>'] * n + sentence + ['<e>']
    
    # Cast the sentence from a list to a tuple
    sentence = tuple(sentence)
    
    # length of sentence (after adding <s> and <e> tokens)
    N = len(sentence)
    
    # The variable p will hold the product
    # that is calculated inside the n-root
    # Update this in the code below
    product_pi = 1.
    
    ### START CODE HERE (Replace instances of 'None' with your code) ###
    
    # Index t ranges from n to N - 1, inclusive on both ends
    for t in range(n, N): # complete this line
        # get the n-gram preceding the word at position t
        n_gram = sentence[t - n:t]
        print(n_gram, end=': ')
        
        # get the word at position t
        word = sentence[t]
        print(word)
        
        # Estimate the probability of the word given the n-gram
        # using the n-gram counts, n-plus1-gram counts,
        # vocabulary size, and smoothing constant
        probability = estimate_probability(word, 
                                           n_gram, 
                                           n_gram_counts, 
                                           n_plus1_gram_counts, 
                                           vocabulary_size, 
                                           k)
        
        # Update the product of the probabilities
        # This 'product_pi' is a cumulative product 
        # of the (1/P) factors that are calculated in the loop
        product_pi *= 1 / probability

    # Take the Nth root of the product
    perplexity = product_pi ** (1 / N)
    
    ### END CODE HERE ### 
    return perplexity

In [63]:
# test
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
perplexity_train1 = calculate_perplexity(
    sentences[0], unigram_counts, bigram_counts, len(unique_words), k=1.0)
print(f'Perplexity for first train sample: {perplexity_train1:.4f}')
test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(test_sentence, 
                                       unigram_counts, 
                                       bigram_counts, 
                                       len(unique_words), 
                                       k=1.0)
print(f'Perplexity for test sample: {perplexity_test:.4f}')

('<s>',): i
('i',): like
('like',): a
('a',): cat
('cat',): <e>
Perplexity for first train sample: 3.3674
('<s>',): i
('i',): like
('like',): a
('a',): dog
('dog',): <e>
Perplexity for test sample: 3.9654
