<a href="https://colab.research.google.com/github/dsogden/NLP-Specialization/blob/main/Chap2_W3_Building_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Trigram counts live in a plain dict keyed by the trigram tuple itself.
n_gram_counts = dict([
    (('i', 'am', 'happy'), 2),
    (('am', 'happy', 'because'), 1),
])

print(f"count of n-gram {('i', 'am', 'happy')}: {n_gram_counts[('i', 'am', 'happy')]}")

# membership test before the trigram has been recorded
print(
    f"n-gram {('i', 'am', 'learning')} found"
    if ('i', 'am', 'learning') in n_gram_counts
    else f"n-gram {('i', 'am', 'learning')} missing"
)

# record the new trigram with a count of 1, then repeat the same membership test
n_gram_counts.update({('i', 'am', 'learning'): 1})
print(
    f"n-gram {('i', 'am', 'learning')} found"
    if ('i', 'am', 'learning') in n_gram_counts
    else f"n-gram {('i', 'am', 'learning')} missing"
)

count of n-gram ('i', 'am', 'happy'): 2
n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [2]:
# An n-gram is its (n-1)-gram prefix followed by one more word.
prefix = ('i', 'am', 'happy')
word = 'because'

# unpack the prefix into a new tuple and append the word
n_gram = (*prefix, word)
print(n_gram)

('i', 'am', 'happy', 'because')


In [10]:
import numpy as np
import pandas as pd
from collections import defaultdict

def single_pass_trigram_count_matrix(corpus):
    '''
    Creates the trigram count matrix from the input corpus in a single pass
    through the corpus.

    Args:
        corpus: Pre-processed and tokenized corpus (a sliceable sequence of
                hashable tokens).

    Returns:
        bigrams: list of all bigram prefixes in first-seen order, the row index
        vocabulary: list of all words that end a trigram in first-seen order,
                    the column index
        count_matrix: pandas dataframe with bigram prefixes as rows,
                    vocabulary words as columns and the counts of the
                    bigram/word combinations as (float) values

    A corpus with fewer than 3 tokens contains no trigram, so both lists are
    empty and the dataframe has shape (0, 0).
    '''

    # Map each bigram / word to its row / column position. Dicts keep
    # insertion order, so the keys double as the first-seen ordered lists
    # while giving constant-time membership tests and position lookups.
    bigram_rows = {}
    vocabulary_cols = {}
    trigram_counts = defaultdict(int)

    for i in range(len(corpus) - 3 + 1):
        # the sliding window starts at position i and contains 3 words
        trigram = tuple(corpus[i : i + 3])
        bigram, last_word = trigram[:-1], trigram[-1]

        # setdefault only assigns a new position the first time a key is seen
        bigram_rows.setdefault(bigram, len(bigram_rows))
        vocabulary_cols.setdefault(last_word, len(vocabulary_cols))

        trigram_counts[bigram, last_word] += 1

    bigrams = list(bigram_rows)
    vocabulary = list(vocabulary_cols)

    # start from an all-zero array so unseen bigram/word pairs count as 0
    count_matrix = np.zeros((len(bigrams), len(vocabulary)))
    for (bigram, last_word), trigram_count in trigram_counts.items():
        count_matrix[bigram_rows[bigram], vocabulary_cols[last_word]] = trigram_count

    count_matrix = pd.DataFrame(count_matrix, index=bigrams, columns=vocabulary)
    return bigrams, vocabulary, count_matrix

In [14]:
# Toy corpus in which the bigram ('i', 'am') occurs twice.
corpus = [
    'i', 'am', 'happy', 'because',
    'i', 'am', 'learning', '.',
]
(bigrams,
 vocabulary,
 count_matrix) = single_pass_trigram_count_matrix(corpus)
# bare last expression so the notebook renders the dataframe
count_matrix

Unnamed: 0,happy,because,i,am,learning,.
"(i, am)",1.0,0.0,0.0,0.0,1.0,0.0
"(am, happy)",0.0,1.0,0.0,0.0,0.0,0.0
"(happy, because)",0.0,0.0,1.0,0.0,0.0,0.0
"(because, i)",0.0,0.0,0.0,1.0,0.0,0.0
"(am, learning)",0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Divide every count by the total of its row to turn each row of counts
# into relative frequencies.
row_sums = count_matrix.sum(axis=1)
prob_matrix = count_matrix.div(row_sums, axis='index')
prob_matrix

Unnamed: 0,happy,because,i,am,learning,.
"(i, am)",0.5,0.0,0.0,0.0,0.5,0.0
"(am, happy)",0.0,1.0,0.0,0.0,0.0,0.0
"(happy, because)",0.0,0.0,1.0,0.0,0.0,0.0
"(because, i)",0.0,0.0,0.0,1.0,0.0,0.0
"(am, learning)",0.0,0.0,0.0,0.0,0.0,1.0


# Pr