## Part 1 - Extraction and Frequencies

In [1]:
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.collocations import *
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.lm import MLE
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk, re, pprint, string
from nltk import word_tokenize, sent_tokenize

In [2]:
bnc_reader = BNCCorpusReader(root="BNC/Texts", fileids=r'[A-K]/\w*/\w*\.xml')
fileids = ['aca/A6U.xml']

In [3]:
raw_tokens = BNCCorpusReader.words(bnc_reader, fileids=fileids)
raw_sents = BNCCorpusReader.sents(bnc_reader, fileids=fileids)
punct = "“”‘’!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
tokens = []
sents = []

for word in raw_tokens:
    if word not in punct:
        tokens.append(word)
        
for sentence in raw_sents:
    for word in sentence:
        if word in punct:
            sentence.remove(word)
    sents.append(sentence)
# word_list = []
# for word in tokens:
#     if word is not word_list:
#         word_list.append(word)

In [4]:
frequencies = []
for instance in tokens:
    frequencies.append(tokens.count(instance))
    
word_freq = list(zip(tokens, frequencies))
print(word_freq)



In [5]:
train_sent, test_sent = train_test_split(sents, test_size = 0.2)
train_words, test_words = train_test_split(tokens, test_size = 0.2)

## Part 2 - Language Models

### Vanilla Model

In [6]:
def vanilla_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]/len(train_words)
        
    return unigram
def vanilla_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]/counter[word[0]]
        
    return bigram
    
def vanilla_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]/bigram[(word[0], word[1])]
        
    return trigram

### Laplace Model

In [87]:
def laplace_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]+1/len(train_words)
        
    return unigram
def laplace_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]+1/counter[word[0]]
        
    return bigram
    
def laplace_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]+1/bigram[(word[0], word[1])]
        
    return trigram

### UNK Model

In [51]:
def unk_uni(train_words):
    
    counter = Counter(train_words)
    model = {}
    model["<UNK>"] = 0
    
    for word in counter:
        if counter[word] <= 2:
            model["<UNK>"] += 1
            
        else:
            model[word] = counter[word]
        
    return laplace_uni(train_words)

def unk_uni(train_words):
    
    unigram_model = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram_model):
            train_words[i] = "<UNK>"
            
    return laplace_bi(train_words)

def unk_bi(train_words):
    
    unigram = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram):
            train_words[i] = "<UNK>"
            
    return laplace_tri(train_words)

### Probability

In [9]:
# def raw_unigram_probability(unigram):

# #     Returns the raw (unsmoothed) unigram probability.

#     uni = []
#     uni.append(unigram)
#     assert len(uni)==1, "Input should be only 1 word"
#     return unigramcounts[unigram]/total_words

# def raw_bigram_probability(bigram):

# #     Returns the raw (unsmoothed) bigram probability

#     assert len(bigram)==2, "Input should be 2 words"
#     return bigramcounts[bigram]/unigramcounts[bigram[0]]

# def raw_trigram_probability(trigram):

# #     Returns the raw (unsmoothed) trigram probability

#     assert len(trigram)==3, "Input should be 3 words"
#     return trigramcounts[trigram]/bigramcounts[trigram[:2]]

In [10]:
# def smoothed_trigram_probability(trigram):
# #         Returns the smoothed trigram probability (using linear interpolation). 
#     assert len(trigram)==3, "Input should be 3 words"
#     uni_lambda = 0.1
#     bi_lambda = 0.3
#     tri_lambda = 0.6
#     u,v,w = trigram[0],trigram[1],trigram[2]
#     prob =  (lambda1* raw_unigram_probability(w))+\
#     (lambda2* raw_bigram_probability((v,w)))+\
#     (lambda3* raw_trigram_probability((u,v,w)))
#     return prob

In [100]:
def uni_prob(model,unigram):
    total = sum(model.values())
    probability = model[unigram]/total
    return probability

def bi_prob(model_bi, model_uni, bigram):
    first = bigram.split()[0]
    second = bigram.split()[1]
    total = model_uni[first]
    probability = model_bi[first,second]/total
    return probability

def tri_prob(model_tri, model_bi, trigram):
    first = trigram.split()[0]
    second = trigram.split()[1]
    third = trigam.split()[2]
    total = model_bi[second,third]
    probability = model_tri[first,second,third]/total
    return probability

In [101]:
tri_prob(laplace_tri(train_words),laplace_bi(train_words),"the in of")

NameError: name 'train' is not defined

In [67]:
print(laplace_bi(train_words))

