In [1]:
import sys
from collections import defaultdict
from collections import Counter
import math
import random
import os

In [2]:
def corpus_reader(corpusfile, lexicon=None):
    """
    Lazily yield the sentences of a corpus file as lists of lowercased tokens.

    Each non-blank line of `corpusfile` is lowercased and split on whitespace.
    Blank (whitespace-only) lines are skipped. When a non-empty `lexicon` is
    given, every token that is not a member of it is replaced by "UNK".
    """
    with open(corpusfile, 'r') as handle:
        for raw_line in handle:
            # split() with no arguments already discards surrounding
            # whitespace, so a blank line yields an empty token list.
            tokens = raw_line.lower().split()
            if not tokens:
                continue
            if not lexicon:
                yield tokens
            else:
                yield [tok if tok in lexicon else "UNK" for tok in tokens]

def get_lexicon(corpus):
    """
    Return the set of words that occur more than once in `corpus`.

    `corpus` is an iterable of sentences, each an iterable of hashable
    words. It is consumed exactly once, so a generator may be passed.
    """
    frequencies = Counter()
    for sentence in corpus:
        frequencies.update(sentence)
    return {word for word, count in frequencies.items() if count > 1}


In [44]:
def get_ngrams(seq, n):
    """
    Return the list of n-grams of `seq`, each n-gram being a tuple.

    The sequence is padded with a single 'START' token in front and a single
    'STOP' token at the end before the n-grams are extracted. The input is
    not modified.

    Args:
        seq: iterable of tokens (list, tuple, generator, ...).
        n: n-gram order, must be >= 1.

    Returns:
        A list of n-tuples in left-to-right order. If `n` exceeds the padded
        length (len(seq) + 2) the list is empty.

    Raises:
        ValueError: if `n` is smaller than 1.
    """
    # Explicit raise instead of `assert`: assertions are removed under -O.
    if n < 1:
        raise ValueError("value of 'n' should be at least 1")

    # NOTE(review): only one 'START' is prepended regardless of n; some
    # n-gram models pad with n-1 start symbols -- confirm against the
    # consumers of these counts before changing.
    padded = ['START', *seq, 'STOP']

    # Short sequences are valid: there are len(padded) - n + 1 windows, and
    # range() of a non-positive number simply produces no windows.
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]

In [74]:
class TrigramModel(object):
    """
    Unigram, bigram and trigram counts collected from a corpus file.

    After construction the instance exposes:
      lexicon        -- set of known words plus "UNK", "START" and "STOP"
      unigramcounts  -- Counter mapping 1-tuples to their frequency
      bigramcounts   -- Counter mapping 2-tuples to their frequency
      trigramcounts  -- Counter mapping 3-tuples to their frequency
    """

    def __init__(self, corpusfile):
        """
        Build the lexicon and the n-gram counts from `corpusfile`.

        The file is read twice: once to collect the lexicon and once more,
        with out-of-lexicon words mapped to "UNK", to count n-grams.
        """
        # Iterate through the corpus once to build a lexicon
        self.lexicon = get_lexicon(corpus_reader(corpusfile))
        self.lexicon.update(("UNK", "START", "STOP"))

        # Now iterate through the corpus again and count ngrams
        self.count_ngrams(corpus_reader(corpusfile, self.lexicon))

    def count_ngrams(self, corpus):
        """
        Populate the unigram, bigram and trigram count tables.

        `corpus` is an iterable of token sequences and is consumed once.
        The results are stored in self.unigramcounts, self.bigramcounts and
        self.trigramcounts; nothing is returned.
        """
        # Update the counters sentence by sentence so that memory grows with
        # the number of distinct n-grams rather than with the total number
        # of n-gram occurrences in the corpus.
        unigrams = Counter()
        bigrams = Counter()
        trigrams = Counter()
        for sequence in corpus:
            unigrams.update(get_ngrams(sequence, 1))
            bigrams.update(get_ngrams(sequence, 2))
            trigrams.update(get_ngrams(sequence, 3))

        # Assign only after the whole corpus was processed, so a failure
        # midway never leaves partially filled tables on the instance.
        self.unigramcounts = unigrams
        self.bigramcounts = bigrams
        self.trigramcounts = trigrams