## Build unigram dictionary

In [4]:
import nltk
from nltk.corpus import brown
from nltk import bigrams, ngrams, trigrams

from collections import Counter

# Training corpus: each string is one sentence of whitespace-separated tokens.
sentences = [
    'a b a',
    'b a a b',
    'a a a',
    'b a b b',
    'b b a b',
    'a a a b',
]

# Flatten the corpus into one token list, ignoring sentence boundaries.
unigrams = [token for sentence in sentences for token in sentence.split()]
unigram_total = len(unigrams)

# Start from raw counts, then overwrite each count with its relative frequency.
# The result stays a Counter, so looking up a token that never occurred gives 0.
unigram_counts = Counter(unigrams)
for word, count in list(unigram_counts.items()):
    unigram_counts[word] = count / unigram_total

print(unigram_counts)

Counter({'a': 0.5454545454545454, 'b': 0.45454545454545453})


## Build bigram dictionary

In [5]:
def bigram_model(sentences):
    """Estimate next-token probabilities from adjacent token pairs.

    Each sentence is split on whitespace and fed to ``ngrams`` with both
    ``pad_left`` and ``pad_right`` enabled, so the padding symbol (``None``)
    shows up as the context of a sentence's first token and as the successor
    of its last token.

    Returns a dict mapping each first token to a dict of
    ``{second token: relative frequency}``, where the frequencies are the
    pair counts divided by the total count for that first token.
    """
    pair_counts = {}
    for sentence in sentences:
        tokens = sentence.split()
        for prev, nxt in ngrams(tokens, 2, pad_left=True, pad_right=True):
            followers = pair_counts.setdefault(prev, {})
            followers[nxt] = followers.get(nxt, 0) + 1

    # Turn the raw counts of every context into a conditional distribution.
    for followers in pair_counts.values():
        norm = float(sum(followers.values()))
        for nxt in followers:
            followers[nxt] /= norm

    return pair_counts

# Fit the bigram model on the training sentences and show the resulting table.
bigram_counts= bigram_model(sentences)
print(bigram_counts)

{None: {'a': 0.5, 'b': 0.5}, 'a': {'b': 0.4166666666666667, None: 0.16666666666666666, 'a': 0.4166666666666667}, 'b': {'a': 0.4, None: 0.4, 'b': 0.2}}


## Build trigram dictionary

In [6]:
def trigram_model(sentences):
    """Estimate next-token probabilities conditioned on the two previous tokens.

    Each sentence is split on whitespace and fed to ``ngrams`` with both
    ``pad_left`` and ``pad_right`` enabled, so contexts at the sentence edges
    contain the padding symbol (``None``).

    Returns a dict mapping each ``(first, second)`` context tuple to a dict of
    ``{third token: relative frequency}``, where the frequencies are the
    triple counts divided by the total count for that context.
    """
    triple_counts = {}
    for sentence in sentences:
        tokens = sentence.split()
        for first, second, third in ngrams(tokens, 3, pad_left=True, pad_right=True):
            followers = triple_counts.setdefault((first, second), {})
            followers[third] = followers.get(third, 0) + 1

    # Turn the raw counts of every context into a conditional distribution.
    for followers in triple_counts.values():
        norm = float(sum(followers.values()))
        for third in followers:
            followers[third] /= norm

    return triple_counts

# Fit the trigram model on the training sentences and show the resulting table.
trigram_counts= trigram_model(sentences)
print(trigram_counts)

{(None, None): {'a': 0.5, 'b': 0.5}, (None, 'a'): {'b': 0.3333333333333333, 'a': 0.6666666666666666}, ('a', 'b'): {'a': 0.2, None: 0.6, 'b': 0.2}, ('b', 'a'): {None: 0.25, 'a': 0.25, 'b': 0.5}, ('a', None): {None: 1.0}, (None, 'b'): {'a': 0.6666666666666666, 'b': 0.3333333333333333}, ('a', 'a'): {'b': 0.4, 'a': 0.4, None: 0.2}, ('b', None): {None: 1.0}, ('b', 'b'): {None: 0.5, 'a': 0.5}}


## Test Scores of each model

In [1]:
# Held-out sequences that each model is asked to score.
test_sentences=['a b a b','b a b a','a b b','b a a a a a b','a a a','b b b b a']

import numpy as np

# --- Unigram: product of the individual token probabilities. ---
# unigram_counts is a Counter, so a token absent from training contributes 0.
test_unigram_arr=[]

print('Unigram test probabilities\n')
for elem in test_sentences:
    p_val=np.prod([unigram_counts[tok] for tok in elem.split()])
    test_unigram_arr.append(p_val)
    print('The sequence '+elem+' has unigram probablity of '+ str(round(p_val,4)))


# --- Bigram: product of P(w2 | w1) over the padded sequence. ---
print('\nBigram test probabilities\n')

test_bigram_arr=[]

for elem in test_sentences:
    p_val=1
    for w1,w2 in bigrams(elem.split(),pad_left=True,pad_right=True):
        try:
            step=bigram_counts[w1][w2]
        except KeyError:
            # A pair never seen in training gets probability 0, matching the
            # trigram handling below instead of aborting the whole cell.
            p_val=0
            break
        p_val*=step
    print('The sequence '+ elem +' has bigram probablity of '+ str(round(p_val,4)))

    test_bigram_arr.append(p_val)


# --- Trigram: product of P(w3 | w1, w2) over the padded sequence. ---
test_trigram_arr=[]
print('\nTrigram test probabilities\n')
for elem in test_sentences:
    p_val=1
    for w1,w2,w3 in trigrams(elem.split(),pad_left=True,pad_right=True):
        try:
            step=trigram_counts[(w1,w2)][w3]
        except KeyError:
            # Only a missing context or continuation means "unseen"; any other
            # error is a real bug and should surface rather than score as 0.
            p_val=0
            break
        p_val*=step
    print('The sequence '+ elem +' has trigram probablity of '+ str(round(p_val,4)))

    test_trigram_arr.append(p_val)
            

ModuleNotFoundError: No module named 'numpy'

In [2]:
import matplotlib.pyplot as plt

# x positions 1, 2, 3 correspond to the unigram, bigram and trigram models.
x_axis = list(range(1, 4))

# Mean probability each model assigned across the test sentences.
y_axis = [
    np.mean(scores)
    for scores in (test_unigram_arr, test_bigram_arr, test_trigram_arr)
]

plt.scatter(x_axis, y_axis)
plt.show()

ModuleNotFoundError: No module named 'matplotlib'