In [3]:
import string
import random

# N Gram Model

Say we have two sentences, "This tea is good!" and "Good is this tea!". As humans, we can tell that "This tea is good!" is more likely to appear than "Good is this tea!"; in other words, the sentence "This tea is good!" has a higher probability of appearing than its counterpart. The probability of how words are put together plays an important part in how we humans learn a language: as we grow up, we subconsciously learn that some word sequences are more probable than others. What if a machine could learn how to form sentences by knowing which word is most likely to appear next? That is where language models come in.  

A language model is a model that assigns probabilities to sequences of words. One of the simplest language models is the N-gram model. An N-gram is a sequence of n words; for example, a 2-gram is a sequence that consists of two words. "Turn your" and "your homework" are both 2-grams, also called bigrams. For more details on how N-gram models work, refer to this page: [N Gram Model Stanford](https://web.stanford.edu/~jurafsky/slp3/3.pdf)

We can train an N-gram language model on a corpus to estimate the probability of a word appearing given the n-1 words that precede it. We can then generate text one word at a time by sampling a likely next word given the previous words.

In this notebook I will train my N-gram model on a Shakespeare dataset so that it can generate text that sounds like Shakespeare.

In [4]:
class NGramModel:
    """Word-level n-gram language model estimated from a plain-text corpus.

    The corpus at ``filepath`` is split into sentences, every sentence is
    padded on the left with ``n - 1`` start tokens, and the probability of a
    word given its ``n - 1`` preceding tokens is estimated as
    ``count(context, word) / count(context)``.

    Args:
        n: Order of the model (1 = unigram, 2 = bigram, ...). Must be >= 1.
        filepath: Path of the text file to train on.
        config: Dict with a ``'preprocessing'`` sub-dict holding
            ``'split_by'`` (sentence delimiter, required) and the optional
            flags ``'to_lower'`` and ``'remove_punct'``.

    Attributes:
        sentences: Preprocessed sentences read from the corpus.
        ngram_dict_count: ``{(context, word): count}``.
        ngram_precedings_count: ``{context: count}``.
        ngram_prob: ``{context: {'word': [...], 'prob': [...]}}`` where the
            two lists are aligned and usable as weights for sampling.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """

    # Padding token prepended (n - 1) times to every sentence.
    START_TOKEN = "<s>"

    # Built once instead of once per token.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, n, filepath, config):
        if n < 1:
            raise ValueError("n must be >= 1, got {!r}".format(n))

        self.filepath = filepath
        self.n = n
        self.ngram_dict_count = {}
        self.ngram_precedings_count = {}
        self.ngram_prob = {}
        self.config = config
        self.sentences = self._load_sentences()
        self._build_LM()

    def _get_ngram(self, sentence, n):
        """Split ``sentence`` into its n-grams.

        Args:
            sentence: Whitespace-separated string of tokens.
            n: Order of the n-grams to extract.

        Returns:
            A pair ``(ngrams, precedings)`` of equally long lists.
            ``precedings[i]`` is the tuple of ``n - 1`` tokens preceding the
            i-th word and ``ngrams[i]`` is ``(precedings[i], word)``.
        """
        tokens = (n - 1) * [self.START_TOKEN] + sentence.split()
        ngrams = []
        precedings = []
        for i in range(n - 1, len(tokens)):
            context = tuple(tokens[i - n + 1:i])
            precedings.append(context)
            ngrams.append((context, tokens[i]))

        return ngrams, precedings

    def _preprocess(self, sentence, config):
        """Normalise one sentence according to ``config['preprocessing']``.

        Args:
            sentence: Raw sentence text.
            config: Configuration dict; the flags ``'remove_punct'`` and
                ``'to_lower'`` are applied only when set to ``True``.

        Returns:
            The preprocessed sentence.
        """
        options = config['preprocessing']
        sent = sentence

        # Flags are compared against True (not mere truthiness), as before.
        if options.get('remove_punct') == True:  # noqa: E712
            sent = " ".join(tok.translate(self._PUNCT_TABLE) for tok in sent.split())

        if options.get('to_lower') == True:  # noqa: E712
            sent = sent.lower()

        return sent

    def _load_sentences(self):
        """Read the corpus file and return its preprocessed sentences.

        Newlines are replaced by spaces before the text is split on
        ``config['preprocessing']['split_by']``.
        """
        with open(self.filepath) as f:
            corpus = f.read().replace('\n', ' ').strip()

        raw_sentences = corpus.split(self.config['preprocessing']['split_by'])
        # Use the instance's own config rather than a module-level `config`.
        return [self._preprocess(raw, self.config) for raw in raw_sentences]

    def _build_LM(self):
        """Count n-grams over ``self.sentences`` and derive probabilities.

        Populates ``ngram_dict_count``, ``ngram_precedings_count`` and
        ``ngram_prob``.
        """
        ngram_dict_count = {}
        ngram_precedings_count = {}
        ngram_prob = {}

        for sentence in self.sentences:
            ngrams, precedings = self._get_ngram(sentence, n=self.n)
            for ngram, prec in zip(ngrams, precedings):
                ngram_dict_count[ngram] = ngram_dict_count.get(ngram, 0) + 1
                ngram_precedings_count[prec] = ngram_precedings_count.get(prec, 0) + 1

        for (prec, word), count in ngram_dict_count.items():
            prob = count / ngram_precedings_count[prec]
            entry = ngram_prob.setdefault(prec, {'word': [], 'prob': []})
            entry['word'].append(word)
            entry['prob'].append(prob)

        self.ngram_dict_count = ngram_dict_count
        self.ngram_precedings_count = ngram_precedings_count
        self.ngram_prob = ngram_prob

    def generate(self, length=10):
        """Sample a sequence of at most ``length`` words from the model.

        Generation starts from the start-token context and repeatedly samples
        the next word, weighted by its estimated probability given the last
        ``n - 1`` tokens. It stops early when the current context was never
        followed by a word in the corpus (for example at the end of a
        training sentence).

        Args:
            length: Maximum number of words to generate.

        Returns:
            The generated words joined by single spaces; an empty string if
            ``length`` is not positive or the model holds no n-grams.
        """
        context_size = self.n - 1
        history = context_size * [self.START_TOKEN]
        generated = 0

        while generated < length:
            # Not history[-context_size:]: for a unigram model (context_size
            # == 0) that slice would be the whole list instead of empty.
            context = tuple(history[len(history) - context_size:])
            choices = self.ngram_prob.get(context)
            if choices is None:
                # Dead end: no continuation was observed for this context.
                break
            history.append(random.choices(choices['word'], choices['prob'])[0])
            generated += 1

        return " ".join(history[context_size:])

# Generating Words

In [7]:
# Seed the global RNG so that Restart & Run All reproduces the same sample.
SEED = 42
random.seed(SEED)

config = {
    'preprocessing': {
        'split_by': '.',         # delimiter used to split the corpus into sentences
        'to_lower': True,        # lower-case every sentence
        'remove_punct': True,    # strip string.punctuation from every token
    }
}
model = NGramModel(n=4, filepath='./corpus/shakespeare.txt', config=config)
model.generate(length=20)

'ill canvass thee between a pair of shears between us'

# References
* N-gram Language Models https://web.stanford.edu/~jurafsky/slp3/3.pdf