# *N-Gram Language Model*

### Importing Libraries

In [None]:
import argparse
import math
from collections import Counter
from itertools import product

import nltk

### Preprocess Data

In [None]:
# Special marker tokens inserted into the token stream during preprocessing.
# End-of-sentence marker appended after every sentence.
EOS = "</s>"
# Placeholder substituted for rare / out-of-vocabulary tokens.
UNK = "<UNK>"
# Start-of-sentence marker; the trailing space is intentional so that the
# string can be repeated to build space-separated padding.
SOS = "<s> "

In [None]:
def add_sentence_tokens(sentences, n):
    """Wrap every sentence in start/end-of-sentence markers.

    Each sentence is prefixed with ``SOS`` repeated ``n - 1`` times (at least
    once, so unigram models still get a single start marker) and suffixed
    with a space followed by ``EOS``.

    Args:
        sentences: iterable of sentence strings.
        n: order of the n-gram model the text is being prepared for.

    Returns:
        A list with one padded string per input sentence.
    """
    padding = SOS * max(n - 1, 1)
    return [f"{padding}{sentence} {EOS}" for sentence in sentences]

def replace_singletons(tokens, min_count=3):
    """Replace rare tokens with the ``UNK`` marker.

    Despite the function name, the default threshold replaces every token
    that occurs fewer than three times in ``tokens`` (i.e. once *or* twice),
    which is what the original hard-coded ``> 2`` comparison did.

    Args:
        tokens: sequence of token strings (must be re-iterable, e.g. a list).
        min_count: minimum number of occurrences a token needs in order to be
            kept. Use ``2`` to replace true singletons only.

    Returns:
        A new list of the same length in which every token occurring fewer
        than ``min_count`` times has been replaced by ``UNK``.
    """
    counts = Counter(tokens)
    return [token if counts[token] >= min_count else UNK for token in tokens]

def preprocess(text, n):
    """Turn raw text (one sentence per line) into a flat token list.

    Steps: split into lines, pad each sentence with start/end markers via
    ``add_sentence_tokens``, split into whitespace-separated tokens, and
    replace rare tokens via ``replace_singletons``.

    Args:
        text: raw corpus text with sentences separated by newlines.
        n: order of the n-gram model (controls the amount of start padding).

    Returns:
        List of token strings. May be empty if ``text`` holds no sentences.
    """
    # Skip blank lines (e.g. the one produced by a trailing newline) so they
    # do not turn into phantom empty sentences in the n-gram counts.
    sentences = [line for line in text.split('\n') if line.strip()]
    sentences = add_sentence_tokens(sentences, n)
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces no longer yield empty-string tokens.
    tokens = ' '.join(sentences).split()
    return replace_singletons(tokens)

### Language Model Class

In [None]:
class Language_Model:
    """N-gram language model with Laplace smoothing and greedy generation.

    For ``n > 1`` the model stores Laplace-smoothed conditional probabilities
    of every n-gram seen in training. For ``n == 1`` it stores plain relative
    frequencies of the vocabulary.

    Attributes:
        n: order of the model.
        laplace: additive smoothing constant.
        tokens: preprocessed training tokens.
        vocab: frequency distribution over training tokens (always has UNK).
        model: dict mapping n-gram tuples to probabilities.
        n_vocab / m_vocab: counts of n-grams and of (n-1)-gram contexts.
        masks: all 0/1 tuples of length ``n`` (not used inside this class).
    """

    # Bare marker strings as they appear in the token stream. The module
    # level SOS constant carries a trailing space, so it cannot be compared
    # against individual tokens.
    _SOS_TOKEN = "<s>"
    _EOS_TOKEN = "</s>"
    _UNK_TOKEN = "<UNK>"

    def __init__(self, train_text, n, laplace=1):
        """Train the model.

        Args:
            train_text: raw training text, one sentence per line.
            n: order of the model (1 = unigram, 2 = bigram, ...).
            laplace: additive smoothing constant used for ``n > 1``.

        Raises:
            ValueError: if ``n < 1`` or the training text yields no tokens.
        """
        if n < 1:
            raise ValueError("n must be at least 1")
        self.n = n
        self.laplace = laplace
        self.tokens = preprocess(train_text, n)
        if not self.tokens:
            raise ValueError("train_text produced no tokens")
        self.vocab = nltk.FreqDist(self.tokens)
        # Make sure UNK is a known vocabulary entry even if nothing was rare.
        if UNK not in self.vocab:
            self.vocab[UNK] = 0
        self.model = self._create_model()
        self.masks = list(reversed(list(product((0, 1), repeat=n))))

    def _smooth(self):
        """Return Laplace-smoothed probabilities for every seen n-gram.

        Also populates ``self.n_vocab`` (n-gram counts) and ``self.m_vocab``
        (counts of the (n-1)-gram contexts).
        """
        vocab_size = len(self.vocab)

        self.n_vocab = nltk.FreqDist(nltk.ngrams(self.tokens, self.n))
        self.m_vocab = nltk.FreqDist(nltk.ngrams(self.tokens, self.n - 1))

        def smoothed_count(n_gram, n_count):
            m_count = self.m_vocab[n_gram[:-1]]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

        return {
            n_gram: smoothed_count(n_gram, count)
            for n_gram, count in self.n_vocab.items()
        }

    def _create_model(self):
        """Build the n-gram -> probability table for the configured order."""
        if self.n > 1:
            return self._smooth()

        num_tokens = len(self.tokens)
        # Populate the count tables for unigrams too, so _calculate_prob can
        # fall back on them instead of raising AttributeError. The context of
        # a unigram is the empty tuple, seen once per token.
        self.n_vocab = nltk.FreqDist(
            {(unigram,): count for unigram, count in self.vocab.items()}
        )
        self.m_vocab = nltk.FreqDist({(): num_tokens})
        return {
            (unigram,): count / num_tokens
            for unigram, count in self.vocab.items()
        }

    def _calculate_prob(self, ngram):
        """Return the probability of ``ngram``, smoothing unseen n-grams.

        Seen n-grams are looked up in ``self.model``. Unseen ones have a
        count of zero, so they receive ``laplace / (context_count + laplace
        * vocab_size)``, the same formula ``_smooth`` applies to seen ones.
        """
        known = self.model.get(ngram)
        if known is not None:
            return known
        context_count = self.m_vocab[ngram[:-1]]
        return self.laplace / (context_count + self.laplace * len(self.vocab))

    def perplexity(self, test_data):
        """Compute the perplexity of the model on ``test_data``.

        Test tokens that are not in the training vocabulary are mapped to
        UNK before scoring.

        Args:
            test_data: raw test text, one sentence per line.

        Returns:
            The perplexity as a float; ``math.inf`` if any n-gram has zero
            probability (possible without smoothing).

        Raises:
            ValueError: if the test text yields no tokens.
        """
        test_tokens = preprocess(test_data, self.n)
        test_tokens = [token if token in self.vocab else UNK for token in test_tokens]
        test_ngrams = nltk.ngrams(test_tokens, self.n)

        # NOTE(review): normalises by the token count (padding included), not
        # by the number of scored n-grams; kept as in the original.
        N = len(test_tokens)
        if N == 0:
            raise ValueError("test_data produced no tokens")
        probabilities = [self._calculate_prob(ngram) for ngram in test_ngrams]
        # A zero-probability event has infinite perplexity; avoid log(0).
        if any(p <= 0 for p in probabilities):
            return math.inf
        return math.exp((-1 / N) * sum(map(math.log, probabilities)))

    def _best_candidate(self, prev, i=0, without=None):
        """Pick the next token given the context ``prev``.

        Args:
            prev: tuple with the previous ``n - 1`` tokens (``()`` for
                unigram models).
            i: rank of the candidate to take when at the start of a sentence;
                everywhere else the most probable candidate is taken. Clamped
                to the last available candidate.
            without: optional iterable of tokens that must not be chosen.

        Returns:
            ``(token, probability)``; ``("</s>", 1)`` when no candidate is
            left.
        """
        blacklist = {self._UNK_TOKEN}
        if without:
            blacklist.update(without)

        candidates = [
            (ngram[-1], prob)
            for ngram, prob in self.model.items()
            if ngram[:-1] == prev and ngram[-1] not in blacklist
        ]
        if not candidates:
            return (self._EOS_TOKEN, 1)

        # Stable sort: ties keep the model's insertion order.
        candidates.sort(key=lambda candidate: candidate[1], reverse=True)
        at_sentence_start = prev == () or prev[-1] == self._SOS_TOKEN
        rank = min(i, len(candidates) - 1) if at_sentence_start else 0
        return candidates[rank]

    @staticmethod
    def _score(prob):
        """Return ``-1 / log(prob)``, using the limits where it is undefined."""
        if prob <= 0:
            return 0.0  # limit for prob -> 0 (e.g. floating-point underflow)
        log_prob = math.log(prob)
        if log_prob == 0:
            return math.inf  # limit for prob -> 1
        return -1 / log_prob

    def _complete(self, sent, min_len, max_len, i=0):
        """Greedily extend ``sent`` in place until it ends with ``</s>``.

        Tokens already present in the sentence are never repeated, and
        ``</s>`` is not allowed before the sentence holds ``min_len`` tokens.
        Once ``max_len`` tokens are reached the sentence is closed.

        Returns:
            ``(sentence_string, score)``.
        """
        eos = self._EOS_TOKEN
        prob = 1
        while sent[-1] != eos:
            prev = () if self.n == 1 else tuple(sent[-(self.n - 1):])
            blacklist = sent + ([eos] if len(sent) < min_len else [])
            next_token, next_prob = self._best_candidate(prev, i, without=blacklist)
            sent.append(next_token)
            prob *= next_prob

            # Force termination at max_len, but do not add a second </s> if
            # the model has just produced one.
            if len(sent) >= max_len and sent[-1] != eos:
                sent.append(eos)

        return ' '.join(sent), self._score(prob)

    def generate_random_sentences(self, num, min_len=12, max_len=24):
        """Yield ``num`` generated sentences with their scores.

        The i-th sentence starts from the i-th most probable opening token,
        which is what makes the sentences differ.

        Yields:
            ``(sentence_string, score)`` tuples.
        """
        for i in range(num):
            sent = [self._SOS_TOKEN] * max(1, self.n - 1)
            yield self._complete(sent, min_len, max_len, i)

    def generate_sentences(self, text=" ", min_len=12, max_len=24):
        """Complete the prompt ``text`` into a full sentence.

        Args:
            text: prompt; it is lower-cased and split on whitespace.
            min_len: minimum token count before ``</s>`` may be produced.
            max_len: token count at which the sentence is forcibly closed.

        Returns:
            ``(sentence_string, score)``.
        """
        # split() without an argument ignores extra whitespace, so an empty
        # or blank prompt contributes no (empty-string) tokens.
        sent = [self._SOS_TOKEN] * max(1, self.n - 1) + text.lower().split()
        return self._complete(sent, min_len, max_len)

### Load Data

In [None]:
# Read the training and test corpora into memory. The files are only read,
# so they are opened read-only ('r+' would require write permission), and the
# context managers guarantee the handles are closed even if reading fails.
with open('train.txt', 'r') as f:
    train_text = f.read()

with open('test.txt', 'r') as f:
    test_text = f.read()

### Unigram Model

In [None]:
# Fit a unigram model (n = 1) on the training text and report its perplexity
# on the test text.
n = 1
model = Language_Model(train_text, n)
unigram_perplexity = model.perplexity(test_text)
print("Unigram Model Perplexity : ", unigram_perplexity)

Unigram Model Perplexity :  663.5046858332885


In [None]:
# Layout shared by every printed sentence: the text followed by its score.
line_format = "{} ({:.5f})"

# Print ten sentences generated by the current model.
print("Random Sentences Generated:")
for sentence, prob in model.generate_random_sentences(10):
    print(line_format.format(sentence, prob))

# Let the model complete a fixed prompt and print the result.
input_text = "The company Said that"
sentence, prob = model.generate_sentences(input_text)
header = "\n\nPredicted Sentence for input text \"{}\": ".format(input_text)
print(header)
print(line_format.format(sentence, prob))

Random Sentences Generated:
<s> the of to in and said a mln for dlrs vs </s> (0.02054)
<s> of to in and said a mln for dlrs vs it </s> (0.01978)
<s> to in and said a mln for dlrs vs it pct of on is from its that at by be cts year will </s> (0.00904)
<s> in and said a mln for dlrs vs it pct on to is from its that at by be cts year will with </s> (0.00890)
<s> and said a mln for dlrs vs it pct on is in from its that at by be cts year will with billion </s> (0.00876)
<s> said a mln for dlrs vs it pct on is from and its that at by be cts year will with billion net </s> (0.00864)
<s> a mln for dlrs vs it pct on is from its said that at by be cts year will with billion net was </s> (0.00853)
<s> mln for dlrs vs it pct on is from its that a at by be cts year will with billion net was us </s> (0.00841)
<s> for dlrs vs it pct on is from its that at mln by be cts year will with billion net was us he </s> (0.00830)
<s> dlrs vs it pct on is from its that at by for be cts year will with billion net

### Bi-Gram Model

In [None]:
# Fit a bigram model (n = 2) on the training text and report its perplexity
# on the test text.
n = 2
model = Language_Model(train_text, n)
bigram_perplexity = model.perplexity(test_text)
print("Bi-gram Model Perplexity : ", bigram_perplexity)

Bi-gram Model Perplexity :  676.1738793566435


In [None]:
# Layout shared by every printed sentence: the text followed by its score.
line_format = "{} ({:.5f})"

# Print ten sentences generated by the current model.
print("Random Sentences Generated:")
for sentence, prob in model.generate_random_sentences(10):
    print(line_format.format(sentence, prob))

# Let the model complete a fixed prompt and print the result.
input_text = "The company Said that"
sentence, prob = model.generate_sentences(input_text)
header = "\n\nPredicted Sentence for input text \"{}\": ".format(input_text)
print(header)
print(line_format.format(sentence, prob))

Random Sentences Generated:
<s> the company said it has been made a share in 1986 </s> (0.02266)
<s> it said the company also be a share in 1986 87 03 09 pct of its board </s> (0.01294)
<s> shr loss of the company said it has been made a share </s> (0.02084)
<s> he said it has been made a share in the company </s> (0.02241)
<s> in the company said it has been made a share of its board </s> (0.01802)
<s> but the company said it has been made a share in 1986 </s> (0.01982)
<s> a share in the company said it has been made by an agreement to be used for one of its board </s> (0.01078)
<s> us and the company said it has been made a share </s> (0.02116)
<s> this year shr loss of the company said it has been made a share </s> (0.01778)
<s> they said it has been made a share in the company </s> (0.02133)


Predicted Sentence for input text "The company Said that": 
<s> the company said that it has been made a share in 1986 </s> (0.02667)


### Tri-Gram Model

In [None]:
# Fit a trigram model (n = 3) on the training text and report its perplexity
# on the test text.
n = 3
model = Language_Model(train_text, n)
trigram_perplexity = model.perplexity(test_text)
print("Tri-gram Model Perplexity : ", trigram_perplexity)

Tri-gram Model Perplexity :  2731.673073427145


In [None]:
# Layout shared by every printed sentence: the text followed by its score.
line_format = "{} ({:.5f})"

# Print ten sentences generated by the current model.
print("Random Sentences Generated:")
for sentence, prob in model.generate_random_sentences(10):
    print(line_format.format(sentence, prob))

# Let the model complete a fixed prompt and print the result.
input_text = "The company Said that"
sentence, prob = model.generate_sentences(input_text)
header = "\n\nPredicted Sentence for input text \"{}\": ".format(input_text)
print(header)
print(line_format.format(sentence, prob))

Random Sentences Generated:
<s> <s> the company said it has agreed to sell its shares in a statement </s> (0.01445)
<s> <s> it said the company also announced an offering of up to one billion dlrs in cash and notes </s> (0.00886)
<s> <s> shr loss one ct vs profit two cts net 119 mln dlrs </s> (0.01261)
<s> <s> he said the company also announced an offering of up to one billion dlrs in cash and notes </s> (0.00882)
<s> <s> in a statement that the us agriculture department said it has agreed to sell its shares </s> (0.01066)
<s> <s> but the company said it has agreed to sell its shares in a statement </s> (0.01251)
<s> <s> a spokesman for the first quarter of 1986 and 1985 </s> (0.01601)
<s> <s> us officials said the company also announced an offering of up to one billion dlrs in cash and notes </s> (0.00797)
<s> <s> this is a major trade bill that would be the first quarter of 1986 </s> (0.01051)
<s> <s> they said the company also announced an offering of up to one billion dlrs in cash 