In [18]:
from collections import defaultdict
import random

import numpy as np
import pandas as pd

In [1]:
n_gram_counts = {('i', 'am', 'happy'): 2, ('am', 'happy', 'because'): 1}
print(f"count of n-gram {('i', 'am', 'happy')}: "
      f"{n_gram_counts[('i', 'am', 'happy')]}")

if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

n_gram_counts[('i', 'am', 'learning')] = 1
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

count of n-gram ('i', 'am', 'happy'): 2
n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [2]:
prefix = ('i', 'am', 'happy')
word = 'because'
n_gram = prefix + (word,)
print(n_gram)

('i', 'am', 'happy', 'because')


In [11]:
def single_pass_ngram_count_matrix(corpus, n=3):
    """
    Creates the trigram count matrix from the input corpus in a single 
    pass through the corpus.
    
    Args:
        corpus: Pre-processed and tokenized corpus. 
    
    Returns:
        mgrams: (n-1-grams) list of all mgram prefixes, row index
        vocabulary: list of all found words, the column index
        count_matrix: pandas dataframe with mgram prefixes as rows, 
          vocabulary words as columns and the counts of the mgram/word 
          combinations (i.e. ngrams) as values
    """
    mgrams = []
    vocabulary = []
    count_matrix_dict = defaultdict(dict)
    for i in range(len(corpus) - n + 1):
        ngram = tuple(corpus[i : i + n])
        mgram = ngram[0 : -1] # mgram = n-1-gram
        if not mgram in mgrams:
            mgrams.append(mgram)        
        last_word = ngram[-1]
        if not last_word in vocabulary:
            vocabulary.append(last_word)
        if (mgram, last_word) not in count_matrix_dict:
            count_matrix_dict[mgram, last_word] = 0
        count_matrix_dict[mgram, last_word] += 1
    count_matrix = np.zeros((len(mgrams), len(vocabulary)))
    for ngram_key, ngram_count in count_matrix_dict.items():
        count_matrix[mgrams.index(ngram_key[0]),
                     vocabulary.index(ngram_key[1])] = ngram_count
    count_matrix = pd.DataFrame(
        count_matrix, index=mgrams, columns=vocabulary)
    return mgrams, vocabulary, count_matrix

In [14]:
corpus = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
bigrams, vocabulary, count_matrix = single_pass_ngram_count_matrix(
    corpus, 3)
print(count_matrix)

                  happy  because    i   am  learning    .
(i, am)             1.0      0.0  0.0  0.0       1.0  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


In [15]:
row_sums = count_matrix.sum(axis=1)
prob_matrix = count_matrix.div(row_sums, axis=0)
print(prob_matrix)

                  happy  because    i   am  learning    .
(i, am)             0.5      0.0  0.0  0.0       0.5  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


In [16]:
trigram = ('i', 'am', 'happy')
bigram = trigram[:-1]
print(f'bigram: {bigram}')

word = trigram[-1]
print(f'word: {word}')

trigram_probability = prob_matrix[word][bigram]
print(f'trigram_probability: {trigram_probability}')

bigram: ('i', 'am')
word: happy
trigram_probability: 0.5


In [17]:
vocabulary = ['i', 'am', 'happy', 'because', 'learning', '.', 'have', 
              'you', 'seen','it', '?']
starts_with = 'ha'

print(f'words in vocabulary starting with prefix: {starts_with}\n')
for word in vocabulary:
    if word.startswith(starts_with):
        print(word)

words in vocabulary starting with prefix: ha

happy
have


In [19]:
def train_validation_test_split(data, train_percent, validation_percent):
    """
    Splits the input data to  train/validation/test according to the 
    percentages provided
    Args:
        data: Pre-processed and tokenized corpus, i.e., list of sentences.
        train_percent: integer 0-100, defines the portion of input corpus
          allocated for training
        validation_percent: integer 0-100, defines the portion of input
          corpus allocated for validation
        Note: train_percent + validation_percent need to be <=100
              the reminder to 100 is allocated for the test set
    Returns:
        train_data: list of sentences, the training part of the corpus
        validation_data: list of sentences, the validation part of the 
          corpus
        test_data: list of sentences, the test part of the corpus
    """
    random.seed(87)
    random.shuffle(data)
    train_size = int(len(data) * train_percent / 100)
    train_data = data[0:train_size]
    validation_size = int(len(data) * validation_percent / 100)
    validation_data = data[train_size:train_size + validation_size]
    test_data = data[train_size + validation_size:]
    return train_data, validation_data, test_data

In [20]:
data = [x for x in range (0, 100)]
train_data, validation_data, test_data = train_validation_test_split(
    data, 80, 10)
print("split 80/10/10:\n",f"train data:{train_data}\n", 
      f"validation data:{validation_data}\n", 
      f"test data:{test_data}\n")

train_data, validation_data, test_data = train_validation_test_split(
    data, 98, 1)
print("split 98/1/1:\n",f"train data:{train_data}\n", 
      f"validation data:{validation_data}\n", 
      f"test data:{test_data}\n")

split 80/10/10:
 train data:[28, 76, 5, 0, 62, 29, 54, 95, 88, 58, 4, 22, 92, 14, 50, 77, 47, 33, 75, 68, 56, 74, 43, 80, 83, 84, 73, 93, 66, 87, 9, 91, 64, 79, 20, 51, 17, 27, 12, 31, 67, 81, 7, 34, 45, 72, 38, 30, 16, 60, 40, 86, 48, 21, 70, 59, 6, 19, 2, 99, 37, 36, 52, 61, 97, 44, 26, 57, 89, 55, 53, 85, 3, 39, 10, 71, 23, 32, 25, 8]
 validation data:[78, 65, 63, 11, 49, 98, 1, 46, 15, 41]
 test data:[90, 96, 82, 42, 35, 13, 69, 24, 94, 18]

split 98/1/1:
 train data:[66, 23, 29, 28, 52, 87, 70, 13, 15, 2, 62, 43, 82, 50, 40, 32, 30, 79, 71, 89, 6, 10, 34, 78, 11, 49, 39, 42, 26, 46, 58, 96, 97, 8, 56, 86, 33, 93, 92, 91, 57, 65, 95, 20, 72, 3, 12, 9, 47, 37, 67, 1, 16, 74, 53, 99, 54, 68, 5, 18, 27, 17, 48, 36, 24, 45, 73, 19, 41, 59, 21, 98, 0, 31, 4, 85, 80, 64, 84, 88, 25, 44, 61, 22, 60, 94, 76, 38, 77, 81, 90, 69, 63, 7, 51, 14, 55, 83]
 validation data:[35]
 test data:[75]



### Perplexity

In order to implement the perplexity formula, you'll need to know how to implement m-th order root of a variable.

\begin{equation*}
PP(W)=\sqrt[M]{\prod_{i=1}^{m}{\frac{1}{P(w_i|w_{i-1})}}}
\end{equation*}

Remember that:

\begin{equation*}
\sqrt[M]{\frac{1}{x}} = x^{-\frac{1}{M}}
\end{equation*}

In [21]:
p = 10 ** (-250)
M = 100
perplexity = p ** (-1 / M)
print(perplexity)

316.22776601683796
