## Building Language Model
- Calculate n-gram probability by counting frequencies of n-grams and n-gram prefixes in the training dataset
- Store n-gram frequencies in a dictionary
- Build count matrix that keeps counts of (n-1)-gram prefix followed by all possible last words in the vocabulary
- Check retrieve and update counts of n-grams in the word count dictionary

In [1]:
n_gram_counts = {
    ('i', 'am', 'happy'): 2,
    ('am', 'happy', 'because'): 1}

In [2]:
# Get count for an n-gram tuple
print(f"count of n-gram {('i', 'am', 'happy')}: {n_gram_counts[('i', 'am', 'happy')]}")


count of n-gram ('i', 'am', 'happy'): 2


In [3]:
# get count for an n-gram tuple
print(f"count of n-gram {('i', 'am', 'happy')}: {n_gram_counts[('i', 'am', 'happy')]}")

# check if n-gram is present in the dictionary
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

# update the count in the word count dictionary
n_gram_counts[('i', 'am', 'learning')] = 1
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")


count of n-gram ('i', 'am', 'happy'): 2
n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [4]:
# concatenate tuple for prefix and tuple with the last word to create the n_gram
prefix = ('i', 'am', 'happy')
word = 'because'

# note here the syntax for creating a tuple for a single word
n_gram = prefix + (word,)
print(n_gram)

('i', 'am', 'happy', 'because')


In [5]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [6]:
def sentence_to_n_gram(tokenized_sentence, n):
    
    # Note that the last position of i is 3rd to the end
    n_grams = [tuple(tokenized_sentence[i:i+n]) for i in range(len(tokenized_sentence) - (n-1))]
    return n_grams

In [7]:
tokenized_sentence = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']

In [14]:
def single_pass_trigram_count_matrix(corpus):
    # 
    two_grams = sentence_to_n_gram(tokenized_sentence, 2)
    three_grams = sentence_to_n_gram(tokenized_sentence, 3)
    
    bigrams = list(set(two_grams))
    next_words = tokenized_sentence[2:]

    next_word_list = {a_word: [] for a_word in next_words}
    for a_word in next_words:
        for a_bi_gram in bigrams:
            if(a_bi_gram + (a_word,) in three_grams):
                next_word_list[a_word].append(1)
            else:
                next_word_list[a_word].append(0)
     
    df = pd.DataFrame(next_word_list)
    df["bigrams"] = bigrams
    df = df.set_index("bigrams")
    
    return bigrams, list(set(corpus)), df

In [15]:
bigrams, vocab, count_matrix = single_pass_trigram_count_matrix(tokenized_sentence)

In [16]:
bigrams

[('happy', 'because'),
 ('am', 'learning'),
 ('i', 'am'),
 ('because', 'i'),
 ('learning', '.'),
 ('am', 'happy')]

In [17]:
vocab

['happy', 'i', 'learning', 'am', 'because', '.']

In [18]:
count_matrix

Unnamed: 0_level_0,happy,because,i,am,learning,.
bigrams,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(happy, because)",0,0,1,0,0,0
"(am, learning)",0,0,0,0,0,1
"(i, am)",1,0,0,0,1,0
"(because, i)",0,0,0,1,0,0
"(learning, .)",0,0,0,0,0,0
"(am, happy)",0,1,0,0,0,0


### Probability Matrix

In [19]:
row_sums = count_matrix.sum(axis=1)
prob_matrix = count_matrix.div(row_sums, axis=0)

print(prob_matrix)

                  happy  because    i   am  learning    .
bigrams                                                  
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0
(i, am)             0.5      0.0  0.0  0.0       0.5  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(learning, .)       NaN      NaN  NaN  NaN       NaN  NaN
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0


In [20]:
# find the probability of a trigram in the probability matrix
trigram = ('i', 'am', 'happy')

# find the prefix bigram 
bigram = trigram[:-1]
print(f'bigram: {bigram}')

# find the last word of the trigram
word = trigram[-1]
print(f'word: {word}')

# we are using the pandas dataframes here, column with vocabulary word comes first, row with the prefix bigram second
trigram_probability = prob_matrix[word][bigram]
print(f'trigram_probability: {trigram_probability}')

bigram: ('i', 'am')
word: happy
trigram_probability: 0.5


In [21]:
# lists all words in vocabulary starting with a given prefix
vocabulary = ['i', 'am', 'happy', 'because', 'learning', '.', 'have', 'you', 'seen','it', '?']
starts_with = 'ha'

print(f'words in vocabulary starting with prefix: {starts_with}\n')
for word in vocabulary:
    if word.startswith(starts_with):
        print(word)

words in vocabulary starting with prefix: ha

happy
have


### Language Model Evaluation
**Train/Validation/Test Splits**

In [22]:
import random
def train_validation_test_split(data, train_percent, validation_percent):
    random.seed(87)
    
    random.shuffle(data)
    
    train_size = int(len(data) * train_percent / 100)
    train_data = data[0:train_size]
    
    validation_size = int(len(data) * validation_percent / 100)
    validation_data = data[train_size:train_size + validation_size]
    
    test_data = data[train_size + validation_size:]
    
    return train_data, validation_data, test_data

In [24]:
data = [x for x in range (0, 100)]
train_data, validation_data, test_data = train_validation_test_split(data, 80, 10)
print("split 80/10/10:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n", 
      f"test data:{test_data}\n")

train_data, validation_data, test_data = train_validation_test_split(data, 98, 1)
print("split 98/1/1:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n", 
      f"test data:{test_data}\n")

split 80/10/10:
 train data:[28, 76, 5, 0, 62, 29, 54, 95, 88, 58, 4, 22, 92, 14, 50, 77, 47, 33, 75, 68, 56, 74, 43, 80, 83, 84, 73, 93, 66, 87, 9, 91, 64, 79, 20, 51, 17, 27, 12, 31, 67, 81, 7, 34, 45, 72, 38, 30, 16, 60, 40, 86, 48, 21, 70, 59, 6, 19, 2, 99, 37, 36, 52, 61, 97, 44, 26, 57, 89, 55, 53, 85, 3, 39, 10, 71, 23, 32, 25, 8]
 validation data:[78, 65, 63, 11, 49, 98, 1, 46, 15, 41]
 test data:[90, 96, 82, 42, 35, 13, 69, 24, 94, 18]

split 98/1/1:
 train data:[66, 23, 29, 28, 52, 87, 70, 13, 15, 2, 62, 43, 82, 50, 40, 32, 30, 79, 71, 89, 6, 10, 34, 78, 11, 49, 39, 42, 26, 46, 58, 96, 97, 8, 56, 86, 33, 93, 92, 91, 57, 65, 95, 20, 72, 3, 12, 9, 47, 37, 67, 1, 16, 74, 53, 99, 54, 68, 5, 18, 27, 17, 48, 36, 24, 45, 73, 19, 41, 59, 21, 98, 0, 31, 4, 85, 80, 64, 84, 88, 25, 44, 61, 22, 60, 94, 76, 38, 77, 81, 90, 69, 63, 7, 51, 14, 55, 83]
 validation data:[35]
 test data:[75]



### Perplexity

$$
PP\left(W\right) = \sqrt[m] {\prod^m_{i=1} \frac{1}{P\left(w_i|w_{i-1}\right)}}
$$
Remember from calculus
$$
\sqrt[m]\frac{1}{x} = x^{-\frac{1}{M}}
$$

In [25]:
p = 10 ** (-250)
M = 100
perplexity = p** (-1 / M)
print(perplexity)

316.22776601683796
