## Part 1 - Extraction and Frequencies

In [1]:
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.collocations import *
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.lm import MLE
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk, re, pprint, string
from nltk import word_tokenize, sent_tokenize
import numpy
import random
import mpmath as mp
import sys

In [2]:
bnc_reader = BNCCorpusReader(root="BNC/Texts", fileids=r'[A-K]/\w*/\w*\.xml')
fileids = ['aca/A6U.xml']

In [3]:
raw_sents = BNCCorpusReader.sents(bnc_reader, fileids=fileids)
punct = "“”‘’!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
temp = []
        
for sentence in raw_sents:
    temp.append("<s>")
    for word in sentence:
        if word not in punct:
            temp.append(word)
    temp.append("</s>")
    
tokens = [x.lower() for x in temp]
# word_list = []
# for word in tokens:
#     if word is not word_list:
#         word_list.append(word)

In [None]:
frequencies = []
for instance in tokens:
    frequencies.append(tokens.count(instance))
    
word_freq = list(zip(tokens, frequencies))
print(word_freq)

In [4]:
train_words, test_words = train_test_split(tokens, test_size = 0.2)

## Part 2 - Language Models

### Vanilla Model

In [5]:
def vanilla_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]/len(train_words)
        
    return unigram
def vanilla_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]/counter[word[0]]
        
    return bigram
    
def vanilla_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train_words[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]/bigram[(word[0], word[1])]
        
    return trigram

### Laplace Model

In [6]:
def laplace_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]+1/len(train_words)
        
    return unigram
def laplace_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]+1/counter[word[0]]
        
    return bigram
    
def laplace_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train_words[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]+1/bigram[(word[0], word[1])]
        
    return trigram

### UNK Model

In [7]:
def unk_uni(train_words):
    
    counter = Counter(train_words)
    model = {}
    model["<UNK>"] = 0
    
    for word in counter:
        if counter[word] <= 2:
            model["<UNK>"] += 1
            
        else:
            model[word] = counter[word]
        
    return laplace_uni(train_words)

def unk_bi(train_words):
    
    unigram_model = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram_model):
            train_words[i] = "<UNK>"
            
    return laplace_bi(train_words)

def unk_tri(train_words):
    
    unigram = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram):
            train_words[i] = "<UNK>"
            
    return laplace_tri(train_words)

### Probability

In [8]:
def uni_prob(model,unigram):
    total = sum(model.values())
    top = model[unigram]
    if top == 0:
        return 0
    probability = top/total
    return probability

def bi_prob(model_bi, model_uni, bigram):
    first = bigram.split()[0]
    second = bigram.split()[1]
    total = model_uni[first]
    top = model_bi[first,second]
    if top == 0:
        return 0
    if total == 0:
        return 0    
    probability = top/total
    return probability

def tri_prob(model_tri, model_bi, trigram):
    first = trigram.split()[0]
    second = trigram.split()[1]
    third = trigram.split()[2]
    total = model_bi[second,third]
    top = model_tri[first,second,third]
    if top == 0:
        return 0
    if total == 0:
        return 0  
    probability = top/total
    return probability

In [9]:
def probability (sentence, model):
    sent = "<s> "+ sentence + " </s>"
    words = sent.split()
    uni_lambda = 0.1
    bi_lambda = 0.3
    tri_lambda = 0.6
    
    unigrams_probability = []
    bigrams_probability = []
    trigrams_probability = []
    
    if model == "Vanilla":
        # unigram
                
        for word in words:
            unigrams_probability.append(uni_prob(vanilla_uni(train_words),word))
        
        # bigram
        
        bigrams = nltk.ngrams(words, 2)
        for pair in bigrams:
            bigram = ' '.join(pair)
            bigrams_probability.append(bi_prob(vanilla_bi(train_words), vanilla_uni(train_words), bigram))
        
        # trigram
        trigrams = nltk.ngrams(words, 3)
        for trio in trigrams:
            trigram = ' '.join(trio)
            trigrams_probability.append(tri_prob(vanilla_tri(train_words),vanilla_bi(train_words),trigram))
        
    elif model == "Laplace":
        # unigram
                
        for word in words:
            unigrams_probability.append(uni_prob(laplace_uni(train_words),word))
        
        # bigram
        
        bigrams = nltk.ngrams(words, 2)
        for pair in bigrams:
            bigram = ' '.join(pair)
            bigrams_probability.append(bi_prob(laplace_bi(train_words), laplace_uni(train_words), bigram))
        
        # trigram
        trigrams = nltk.ngrams(words, 3)
        for trio in trigrams:
            trigram = ' '.join(trio)
            trigrams_probability.append(tri_prob(laplace_tri(train_words),laplace_bi(train_words),trigram))
        
    elif model == "UNK":
        # unigram
                
        for word in words:
            unigrams_probability.append(uni_prob(unk_uni(train_words),word))
        
        
        # bigram
        
        bigrams = nltk.ngrams(words, 2)
        for pair in bigrams:
            bigram = ' '.join(pair)
            bigrams_probability.append(bi_prob(unk_bi(train_words), unk_uni(train_words), bigram))
        
        # trigram
        trigrams = nltk.ngrams(words, 3)
        for trio in trigrams:
            trigram = ' '.join(trio)
            trigrams_probability.append(tri_prob(unk_tri(train_words),unk_bi(train_words),trigram))
        
    prob1 = numpy.prod(unigrams_probability)
    prob2 = numpy.prod(bigrams_probability)
    prob3 = numpy.prod(trigrams_probability) 
    
    probability = (uni_lambda*prob1)+(bi_lambda*prob2)+(tri_lambda*prob3)
    return probability

In [10]:
print (probability ("why does it seem important that the answer",model="UNK"))

6.452427525511457e-27


### Perplexity

In [11]:
def perplexity(test_words, model):
    
    p = mp.mpf(1)
    
    N = mp.mpf(0)
    
    for line in test_words:
        N += len(line)
        line = ' '.join(line)
        
        if model[line] > 0:
            p = p * (1/model[line])
        else:
            p = p * sys.maxsize
            
    p = pow(p, 1/float(N))
    return p

In [12]:
perplexity(test_words, vanilla_bi(train_words))

mpf('7314.567537385079')

# Generation

In [85]:
def uni_generate(model, sentence, last = "", count = None):
    
    if(count != 0 and sentence[-1] != last):
        
        weights = numpy.array(list(model.values()))
        norm = weights/numpy.sum(weights)
        
        resample = numpy.random.multinomial(1, norm)
        key = list(resample).index(1)
        value = list(model.keys())[key]
        
        sentence.append(value)
        if count != None:
            uni_generate(model, sentence, last, count-1)
        else:
            uni_generate(model, sentence, last)
            
    return sentence

def bi_generate(model, sentence, last, count = None):
    if(count != 0 and sentence[-1] != last):
        
        bigrams = []
        b = []
        last_word = sentence[-1]
        
        
        for entry in model:
            if entry[0] == last_word:
                bigrams.append((entry,model[entry]))
        if(bigrams == []):
            return sentence 
        
        v = [x[1] for x in bigrams]
        k = [x[0] for x in bigrams]
        weights = numpy.array(v)
        norm = weights / numpy.sum(weights)
        resample = numpy.random.multinomial(1, norm)
        key = list(resample).index(1)
        value = k[key]

        sentence.append(value[1])

        if count != None:
            bi_generate(model, sentence, last, count-1)
        else:
            bi_generate(model, sentence, last)
        
    return sentence


def tri_generate(bi_model, tri_model, sentence, last = "", count = None):
    if(len(sentence) == 1):
        sentence = BigramGenerate(bi_model, sentence, last, count=1)
        
    if(count != 0 and sentence[-1] != last):
        trigrams = []
        
        for entry in tri_model:
            if(entry[0] == sentence[-2] and entry[1] == sentence[-1]):
                print("yes")
                trigrams[word] = tri_model[word]
                
        if(trigrams == []):
            return sentence
        
        weights = np.array(list(bigrams.values()))
        norm = weights / np.sum(weights)
        resample = np.random.multinomial(1, norm)
        key = list(resample).index(1)
        value = list(bigrams.keys())[key] 
        
        sentence.append(value[2])
        if count != None:
            tri_generate(bi_model, tri_model, sentence, last, count-1)
            
        else:
            tri_generate(bi_model, tri_model, sentence, last)
            
            
    return sentence

In [86]:
model = "1"
phrase = "when we go"

if model == "1" or model == "Vanilla" or model == "vanilla":
    sent1 = ["<s>"]
    w = phrase.split()
    for word in w:
        sent1.append(word)
        
    sent2 = sent1.copy()
    sent3 = sent1.copy()
        
    print("Generating vanilla model...")
    print("GENERATED VANILLA SENTENCES")
#     print("Unigram: "+ str (uni_generate(model = vanilla_uni(train_words), sentence = sent1, last = "</s>")))
#     print("Bigram: "+ str (bi_generate(model = vanilla_bi(train_words), sentence = sent2, last = "</s>")))
    print("Trigram: "+ str (tri_generate(bi_model = vanilla_bi(train_words), tri_model = vanilla_tri(train_words), sentence = sent3, last = "</s>")))

Generating vanilla model...
GENERATED VANILLA SENTENCES
Trigram: ['<s>', 'when', 'we', 'go']


In [None]:
model = input("Which model would you like to use? (1) Vanilla (2) Laplace (3) UNK : ")
phrase = input("Enter phrase to continue to generate: ")
generate(model,phrase)