## Part 1 - Extraction and Frequencies

In [144]:
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.collocations import *
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.lm import MLE
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk, re, pprint, string
from nltk import word_tokenize, sent_tokenize

In [2]:
bnc_reader = BNCCorpusReader(root="BNC/Texts", fileids=r'[A-K]/\w*/\w*\.xml')
fileids = ['aca/A6U.xml']

In [3]:
raw_sents = BNCCorpusReader.sents(bnc_reader, fileids=fileids)
punct = "“”‘’!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
tokens = []
        
for sentence in raw_sents:
    tokens.append("<s>")
    for word in sentence:
        if word not in punct:
            tokens.append(word)
    tokens.append("</s>")
# word_list = []
# for word in tokens:
#     if word is not word_list:
#         word_list.append(word)

In [4]:
frequencies = []
for instance in tokens:
    frequencies.append(tokens.count(instance))
    
word_freq = list(zip(tokens, frequencies))
print(word_freq)



In [5]:
train_words, test_words = train_test_split(tokens, test_size = 0.2)

## Part 2 - Language Models

### Vanilla Model

In [120]:
def vanilla_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]/len(train_words)
        
    return unigram
def vanilla_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]/counter[word[0]]
        
    return bigram
    
def vanilla_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train_words[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]/bigram[(word[0], word[1])]
        
    return trigram

### Laplace Model

In [110]:
def laplace_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]+1/len(train_words)
        
    return unigram
def laplace_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]+1/counter[word[0]]
        
    return bigram
    
def laplace_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train_words[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]+1/bigram[(word[0], word[1])]
        
    return trigram

### UNK Model

In [111]:
def unk_uni(train_words):
    
    counter = Counter(train_words)
    model = {}
    model["<UNK>"] = 0
    
    for word in counter:
        if counter[word] <= 2:
            model["<UNK>"] += 1
            
        else:
            model[word] = counter[word]
        
    return laplace_uni(train_words)

def unk_uni(train_words):
    
    unigram_model = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram_model):
            train_words[i] = "<UNK>"
            
    return laplace_bi(train_words)

def unk_bi(train_words):
    
    unigram = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram):
            train_words[i] = "<UNK>"
            
    return laplace_tri(train_words)

### Probability

In [112]:
# def raw_unigram_probability(unigram):

# #     Returns the raw (unsmoothed) unigram probability.

#     uni = []
#     uni.append(unigram)
#     assert len(uni)==1, "Input should be only 1 word"
#     return unigramcounts[unigram]/total_words

# def raw_bigram_probability(bigram):

# #     Returns the raw (unsmoothed) bigram probability

#     assert len(bigram)==2, "Input should be 2 words"
#     return bigramcounts[bigram]/unigramcounts[bigram[0]]

# def raw_trigram_probability(trigram):

# #     Returns the raw (unsmoothed) trigram probability

#     assert len(trigram)==3, "Input should be 3 words"
#     return trigramcounts[trigram]/bigramcounts[trigram[:2]]

In [113]:
# def smoothed_trigram_probability(trigram):
# #         Returns the smoothed trigram probability (using linear interpolation). 
#     assert len(trigram)==3, "Input should be 3 words"
#     uni_lambda = 0.1
#     bi_lambda = 0.3
#     tri_lambda = 0.6
#     u,v,w = trigram[0],trigram[1],trigram[2]
#     prob =  (lambda1* raw_unigram_probability(w))+\
#     (lambda2* raw_bigram_probability((v,w)))+\
#     (lambda3* raw_trigram_probability((u,v,w)))
#     return prob

In [154]:
def uni_prob(model,unigram):
    total = sum(model.values())
    top = model[unigram]
    if top == 0:
        return 0
    probability = top/total
    return probability

def bi_prob(model_bi, model_uni, bigram):
    first = bigram.split()[0]
    second = bigram.split()[1]
    total = model_uni[first]
    top = model_bi[first,second]
    if top == 0:
        return 0
    if total == 0:
        return 0    
    probability = top/total
    return probability

def tri_prob(model_tri, model_bi, trigram):
    first = trigram.split()[0]
    second = trigram.split()[1]
    third = trigram.split()[2]
    total = model_bi[second,third]
    top = model_tri[first,second,third]
    if top == 0:
        return 0
    if total == 0:
        return 0  
    probability = top/total
    return probability

In [155]:
def probability (sentence, model):
    sent = "<s> "+ sentence + " </s>"
    print(sent)
    words = sent.split()
    uni_lambda = 0.1
    bi_lambda = 0.3
    tri_lambda = 0.6
    
    unigrams_probability = []
    bigrams_probability = []
    trigrams_probability = []
    
    if model == "Vanilla":
        # unigram
                
        for word in words:
            unigrams_probability.append(uni_prob(vanilla_uni(train_words),word))
        
        print (unigrams_probability)
        
        # bigram
        
        bigrams = nltk.ngrams(words, 2)
        for pair in bigrams:
            bigram = ' '.join(pair)
            bigrams_probability.append(bi_prob(vanilla_bi(train_words), vanilla_uni(train_words), bigram))
            
        print(bigrams_probability)
        
        # trigram
        trigrams = nltk.ngrams(words, 3)
        for trio in trigrams:
            trigram = ' '.join(trio)
            trigrams_probability.append(tri_prob(vanilla_tri(train_words),vanilla_bi(train_words),trigram))
        print(trigrams_probability)

In [158]:
print (probability ("of worked no which art 10 the",model="Vanilla"))

<s> of worked no which art 10 the </s>
[0.03694527752423858, 0.05246229408441879, 0.00013039509714437146, 0.0012170209066808004, 0.00704133524579606, 0.005259268918156317, 0.00017386012952582864, 0.06571912896076322, 0.03724953275090878]
[1.3055875432525952, 0.01579230242804848, 2556.3333333333335, 29.34566326530612, 0.87665752171925, 1.571409056758418, 2875.875, 0.5333745380868397]
[29.439024390243905, 3.0, 28.0, 162.0, 121.0, 2.0, 14.264150943396228]
None


('to', '<s>', '</s>')
('<s>', '</s>', 'and')
('</s>', 'and', 'character')
('and', 'character', 'or')
('character', 'or', 'nativists')
('or', 'nativists', 'of')
('nativists', 'of', 'inevitable')
('of', 'inevitable', '</s>')
('inevitable', '</s>', 'which')
('</s>', 'which', '</s>')
('which', '</s>', 'imagery')
('</s>', 'imagery', 'feature')
('imagery', 'feature', 'to')
('feature', 'to', 'the')
('to', 'the', '<s>')
('the', '<s>', 'his')
('<s>', 'his', 'I')
('his', 'I', 'often')
('I', 'often', 'the')
('often', 'the', 'example')
('the', 'example', 'market')
('example', 'market', "'s")
('market', "'s", 'make')
("'s", 'make', 'logic')
('make', 'logic', 'Some')
('logic', 'Some', 'practical')
('Some', 'practical', 'dogs')
('practical', 'dogs', 'give')
('dogs', 'give', 'achieves')
('give', 'achieves', 'the')
('achieves', 'the', 'because')
('the', 'because', 'of')
('because', 'of', 'collages')
('of', 'collages', 'The')
('collages', 'The', "'s")
('The', "'s", 'x')
("'s", 'x', 'images')
('x', 'imag

('<s>', 'about', 'the')
('about', 'the', 'becomes')
('the', 'becomes', 'is')
('becomes', 'is', 'Meninas')
('is', 'Meninas', 'understanding')
('Meninas', 'understanding', 'to')
('understanding', 'to', 'cultural')
('to', 'cultural', '</s>')
('cultural', '</s>', 'resistance')
('</s>', 'resistance', 'feminist')
('resistance', 'feminist', 'compositionally')
('feminist', 'compositionally', 'and')
('compositionally', 'and', 'except')
('and', 'except', 'In')
('except', 'In', '</s>')
('In', '</s>', '—')
('</s>', '—', 'Callinicos')
('—', 'Callinicos', 'organs')
('Callinicos', 'organs', 'bit')
('organs', 'bit', 'between')
('bit', 'between', 'not')
('between', 'not', 'by')
('not', 'by', 'led')
('by', 'led', 'writing')
('led', 'writing', 'not')
('writing', 'not', 'blank')
('not', 'blank', 'transience')
('blank', 'transience', 'corresponding')
('transience', 'corresponding', 'reactionary')
('corresponding', 'reactionary', 'who')
('reactionary', 'who', 'images')
('who', 'images', 'clear')
('images', 

('Tree', '<s>', 'some')
('<s>', 'some', 'is')
('some', 'is', 'of')
('is', 'of', 'be')
('of', 'be', 'class')
('be', 'class', 'of')
('class', 'of', '</s>')
('of', '</s>', 'about')
('</s>', 'about', 'the')
('about', 'the', '</s>')
('the', '</s>', 'angels')
('</s>', 'angels', 'ideas')
('angels', 'ideas', 'he')
('ideas', 'he', 'quip')
('he', 'quip', 'the')
('quip', 'the', 'status')
('the', 'status', 'by')
('status', 'by', '</s>')
('by', '</s>', 'that')
('</s>', 'that', 'look')
('that', 'look', 'a')
('look', 'a', 'mythical')
('a', 'mythical', 'perspective')
('mythical', 'perspective', 'violence')
('perspective', 'violence', 'activity')
('violence', 'activity', 'such')
('activity', 'such', 'period')
('such', 'period', 'one')
('period', 'one', 'themselves')
('one', 'themselves', '</s>')
('themselves', '</s>', 'presented')
('</s>', 'presented', 'not')
('presented', 'not', '<s>')
('not', '<s>', 'the')
('<s>', 'the', 'theme')
('the', 'theme', 'content')
('theme', 'content', 'the')
('content', 'th

('the', 'a', 'response')
('a', 'response', 'how')
('response', 'how', 'conceptual')
('how', 'conceptual', 'perspective')
('conceptual', 'perspective', 'equated')
('perspective', 'equated', 'responsible')
('equated', 'responsible', 'Nacional')
('responsible', 'Nacional', 'intrigue')
('Nacional', 'intrigue', 'hold')
('intrigue', 'hold', 'Martin')
('hold', 'Martin', 'theories')
('Martin', 'theories', 'Metropolitan')
('theories', 'Metropolitan', 'a')
('Metropolitan', 'a', '<s>')
('a', '<s>', 'to')
('<s>', 'to', 'of')
('to', 'of', 'in')
('of', 'in', '<s>')
('in', '<s>', 'packs')
('<s>', 'packs', 'she')
('packs', 'she', 'in')
('she', 'in', 'him')
('in', 'him', '<s>')
('him', '<s>', 'of')
('<s>', 'of', 'century')
('of', 'century', 'aspired')
('century', 'aspired', 'of')
('aspired', 'of', 'of')
('of', 'of', 'currency')
('of', 'currency', 'withering')
('currency', 'withering', 'the')
('withering', 'the', 'the')
('the', 'the', 'carries')
('the', 'carries', 'historians')
('carries', 'historians',

('emergence', 'of', 'popularity')
('of', 'popularity', 'popular')
('popularity', 'popular', 'for')
('popular', 'for', 'prescriptions')
('for', 'prescriptions', 'suggests')
('prescriptions', 'suggests', 'if')
('suggests', 'if', 'mathematical')
('if', 'mathematical', 'the')
('mathematical', 'the', 'she')
('the', 'she', 'volumes')
('she', 'volumes', 'aesthetic')
('volumes', 'aesthetic', 'rhetoric')
('aesthetic', 'rhetoric', 'the')
('rhetoric', 'the', 'the')
('the', 'the', 'translated')
('the', 'translated', 'from')
('translated', 'from', "'s")
('from', "'s", 'a')
("'s", 'a', 'tell')
('a', 'tell', 'However')
('tell', 'However', 'of')
('However', 'of', 'excursus')
('of', 'excursus', 'Chevreul')
('excursus', 'Chevreul', 'generated')
('Chevreul', 'generated', 'main')
('generated', 'main', 'a')
('main', 'a', 'because')
('a', 'because', 'their')
('because', 'their', 'outside')
('their', 'outside', 'how')
('outside', 'how', 'not')
('how', 'not', 'does')
('not', 'does', 'Hale')
('does', 'Hale', '

('duel', '<s>', 'Spanish')
('<s>', 'Spanish', '</s>')
('Spanish', '</s>', "'s")
('</s>', "'s", 'portraits')
("'s", 'portraits', 'story')
('portraits', 'story', 'and')
('story', 'and', 'moment')
('and', 'moment', '<s>')
('moment', '<s>', 'her')
('<s>', 'her', 'the')
('her', 'the', 'the')
('the', 'the', 'paintings')
('the', 'paintings', 'Oxford')
('paintings', 'Oxford', 'Situationists')
('Oxford', 'Situationists', 'you')
('Situationists', 'you', '</s>')
('you', '</s>', 'of')
('</s>', 'of', 'own')
('of', 'own', 'World')
('own', 'World', 'these')
('World', 'these', 'in')
('these', 'in', 'uncommon')
('in', 'uncommon', '0')
('uncommon', '0', 'the')
('0', 'the', 'and')
('the', 'and', 'fragility')
('and', 'fragility', '</s>')
('fragility', '</s>', 'good')
('</s>', 'good', 'partial')
('good', 'partial', 'common')
('partial', 'common', 'a')
('common', 'a', '</s>')
('a', '</s>', 'the')
('</s>', 'the', '</s>')
('the', '</s>', '<s>')
('</s>', '<s>', 'connection')
('<s>', 'connection', 'yellow')
('c