## Part 1 - Extraction and Frequencies

In [1]:
from nltk.corpus.reader.bnc import BNCCorpusReader
from nltk.collocations import *
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.lm import MLE
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk, re, pprint, string
from nltk import word_tokenize, sent_tokenize

In [2]:
bnc_reader = BNCCorpusReader(root="BNC/Texts", fileids=r'[A-K]/\w*/\w*\.xml')
fileids = ['aca/A6U.xml']

In [3]:
raw_sents = BNCCorpusReader.sents(bnc_reader, fileids=fileids)
punct = "“”‘’!\"#$€%&()*'+-,./:;<=>?@[\]^_`{|}~\n"
temp = []
        
for sentence in raw_sents:
    temp.append("<s>")
    for word in sentence:
        if word not in punct:
            temp.append(word)
    temp.append("</s>")
    
tokens = [x.lower() for x in temp]
# word_list = []
# for word in tokens:
#     if word is not word_list:
#         word_list.append(word)

In [4]:
frequencies = []
for instance in tokens:
    frequencies.append(tokens.count(instance))
    
word_freq = list(zip(tokens, frequencies))
print(word_freq)



In [5]:
train_words, test_words = train_test_split(tokens, test_size = 0.2)

## Part 2 - Language Models

### Vanilla Model

In [6]:
def vanilla_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]/len(train_words)
        
    return unigram
def vanilla_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]/counter[word[0]]
        
    return bigram
    
def vanilla_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train_words[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]/bigram[(word[0], word[1])]
        
    return trigram

### Laplace Model

In [7]:
def laplace_uni(train_words):
    unigram = Counter(train_words)
    
    for word in unigram:
        unigram[word] = unigram[word]+1/len(train_words)
        
    return unigram
def laplace_bi(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    counter = Counter(train_words)
    
    for word in bigram:
        bigram[word] = bigram[word]+1/counter[word[0]]
        
    return bigram
    
def laplace_tri(train_words):
    bigram = Counter([(word, train_words[i + 1]) for i, word in enumerate(train_words[:-1])])
    trigram = Counter([(word, train_words[i + 1], train_words[i + 2]) for i, word in enumerate(train_words[:-2])])
    
    for word in trigram:
        trigram[word] = trigram[word]+1/bigram[(word[0], word[1])]
        
    return trigram

### UNK Model

In [8]:
def unk_uni(train_words):
    
    counter = Counter(train_words)
    model = {}
    model["<UNK>"] = 0
    
    for word in counter:
        if counter[word] <= 2:
            model["<UNK>"] += 1
            
        else:
            model[word] = counter[word]
        
    return laplace_uni(train_words)

def unk_uni(train_words):
    
    unigram_model = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram_model):
            train_words[i] = "<UNK>"
            
    return laplace_bi(train_words)

def unk_bi(train_words):
    
    unigram = unk_uni(train_words)
    
    for i, word in enumerate(train_words):
        if not (word in unigram):
            train_words[i] = "<UNK>"
            
    return laplace_tri(train_words)

### Probability

In [9]:
# def raw_unigram_probability(unigram):

# #     Returns the raw (unsmoothed) unigram probability.

#     uni = []
#     uni.append(unigram)
#     assert len(uni)==1, "Input should be only 1 word"
#     return unigramcounts[unigram]/total_words

# def raw_bigram_probability(bigram):

# #     Returns the raw (unsmoothed) bigram probability

#     assert len(bigram)==2, "Input should be 2 words"
#     return bigramcounts[bigram]/unigramcounts[bigram[0]]

# def raw_trigram_probability(trigram):

# #     Returns the raw (unsmoothed) trigram probability

#     assert len(trigram)==3, "Input should be 3 words"
#     return trigramcounts[trigram]/bigramcounts[trigram[:2]]

In [10]:
# def smoothed_trigram_probability(trigram):
# #         Returns the smoothed trigram probability (using linear interpolation). 
#     assert len(trigram)==3, "Input should be 3 words"
#     uni_lambda = 0.1
#     bi_lambda = 0.3
#     tri_lambda = 0.6
#     u,v,w = trigram[0],trigram[1],trigram[2]
#     prob =  (lambda1* raw_unigram_probability(w))+\
#     (lambda2* raw_bigram_probability((v,w)))+\
#     (lambda3* raw_trigram_probability((u,v,w)))
#     return prob

In [11]:
def uni_prob(model,unigram):
    total = sum(model.values())
    top = model[unigram]
    if top == 0:
        return 0
    probability = top/total
    return probability

def bi_prob(model_bi, model_uni, bigram):
    first = bigram.split()[0]
    second = bigram.split()[1]
    total = model_uni[first]
    top = model_bi[first,second]
    if top == 0:
        return 0
    if total == 0:
        return 0    
    probability = top/total
    return probability

def tri_prob(model_tri, model_bi, trigram):
    first = trigram.split()[0]
    second = trigram.split()[1]
    third = trigram.split()[2]
    total = model_bi[second,third]
    top = model_tri[first,second,third]
    if top == 0:
        return 0
    if total == 0:
        return 0  
    probability = top/total
    return probability

In [12]:
def probability (sentence, model):
    sent = "<s> "+ sentence + " </s>"
    print(sent)
    words = sent.split()
    uni_lambda = 0.1
    bi_lambda = 0.3
    tri_lambda = 0.6
    
    unigrams_probability = []
    bigrams_probability = []
    trigrams_probability = []
    
    if model == "Vanilla":
        # unigram
                
        for word in words:
            unigrams_probability.append(uni_prob(vanilla_uni(train_words),word))
        
        print (unigrams_probability)
        
        # bigram
        
        bigrams = nltk.ngrams(words, 2)
        for pair in bigrams:
            bigram = ' '.join(pair)
            bigrams_probability.append(bi_prob(vanilla_bi(train_words), vanilla_uni(train_words), bigram))
            
        print(bigrams_probability)
        
        # trigram
        trigrams = nltk.ngrams(words, 3)
        for trio in trigrams:
            trigram = ' '.join(trio)
            trigrams_probability.append(tri_prob(vanilla_tri(train_words),vanilla_bi(train_words),trigram))
        print(trigrams_probability)

In [15]:
print (probability ("source seen alberto but perhaps",model="Vanilla"))

<s> source seen alberto but perhaps </s>
[0.03798843830139314, 0.00017386012952582672, 0.0004346503238145668, 0.00034772025905165343, 0.004129178076238385, 0.0007823705828662204, 0.038379623592826255]
[0, 1437.9375, 230.07000000000002, 359.484375, 2.549252077562327, 71.00925925925925]
[0, 10.0, 8.0, 95.0, 0]
None


In [14]:
trigrams = nltk.ngrams(train_words, 3)
for sets in trigrams:
    print (sets)

('culture', 'thousand', 'commodification')
('thousand', 'commodification', 'source')
('commodification', 'source', 'seen')
('source', 'seen', 'alberto')
('seen', 'alberto', 'but')
('alberto', 'but', 'perhaps')
('but', 'perhaps', 'is')
('perhaps', 'is', 'noteworthy')
('is', 'noteworthy', 'has')
('noteworthy', 'has', 'and')
('has', 'and', 'its')
('and', 'its', 'of')
('its', 'of', 'references')
('of', 'references', "'s")
('references', "'s", 'specifically')
("'s", 'specifically', '<s>')
('specifically', '<s>', 'of')
('<s>', 'of', 'first')
('of', 'first', '<s>')
('first', '<s>', 'discuss')
('<s>', 'discuss', 'painting')
('discuss', 'painting', 'also')
('painting', 'also', '207')
('also', '207', 'cultural')
('207', 'cultural', 'cathedral')
('cultural', 'cathedral', 'scandal')
('cathedral', 'scandal', 'someone')
('scandal', 'someone', 'although')
('someone', 'although', 'the')
('although', 'the', 'are')
('the', 'are', 'far')
('are', 'far', 'other')
('far', 'other', 'traditional')
('other', '

('usually', 'end', 'to')
('end', 'to', '<s>')
('to', '<s>', 'active')
('<s>', 'active', 'as')
('active', 'as', 'as')
('as', 'as', 'latin')
('as', 'latin', 'doctor')
('latin', 'doctor', 'churches')
('doctor', 'churches', '</s>')
('churches', '</s>', 'of')
('</s>', 'of', '—')
('of', '—', 'has')
('—', 'has', 'he')
('has', 'he', 'iv')
('he', 'iv', 'the')
('iv', 'the', 'revolutionary')
('the', 'revolutionary', 'laboratory')
('revolutionary', 'laboratory', 'a')
('laboratory', 'a', 'by')
('a', 'by', 'of')
('by', 'of', 'aesthetic')
('of', 'aesthetic', 'with')
('aesthetic', 'with', 'by')
('with', 'by', 'marxist')
('by', 'marxist', '</s>')
('marxist', '</s>', 'built')
('</s>', 'built', 'the')
('built', 'the', '</s>')
('the', '</s>', 'peter')
('</s>', 'peter', 'dog')
('peter', 'dog', 'been')
('dog', 'been', '2')
('been', '2', 'redress')
('2', 'redress', 'and')
('redress', 'and', 'is')
('and', 'is', '<s>')
('is', '<s>', 'first')
('<s>', 'first', 'the')
('first', 'the', 'stiff')
('the', 'stiff', 't

('castedo', 'these', "'s")
('these', "'s", 'such')
("'s", 'such', 'the')
('such', 'the', 'judge')
('the', 'judge', 'america')
('judge', 'america', '1986')
('america', '1986', 'c.')
('1986', 'c.', '</s>')
('c.', '</s>', 'unlikely')
('</s>', 'unlikely', '<s>')
('unlikely', '<s>', 'exhilarating')
('<s>', 'exhilarating', 'in')
('exhilarating', 'in', 'her')
('in', 'her', 'of')
('her', 'of', 'much')
('of', 'much', 'although')
('much', 'although', 'in')
('although', 'in', 'leers')
('in', 'leers', 'of')
('leers', 'of', 'veláquez')
('of', 'veláquez', 'the')
('veláquez', 'the', 'with')
('the', 'with', 'little')
('with', 'little', 'struggle')
('little', 'struggle', 'the')
('struggle', 'the', 'or')
('the', 'or', 'gallery')
('or', 'gallery', 'popular')
('gallery', 'popular', 'art')
('popular', 'art', 'of')
('art', 'of', 'into')
('of', 'into', 'art')
('into', 'art', 'paul')
('art', 'paul', 'past')
('paul', 'past', 'on')
('past', 'on', 'name')
('on', 'name', '</s>')
('name', '</s>', 'christianity')
(

('a', '6', 'movement')
('6', 'movement', 'defeat')
('movement', 'defeat', 'from')
('defeat', 'from', 'the')
('from', 'the', 'obsession')
('the', 'obsession', 'which')
('obsession', 'which', 'art')
('which', 'art', '£8.95')
('art', '£8.95', 'can')
('£8.95', 'can', 'few')
('can', 'few', 'the')
('few', 'the', 'the')
('the', 'the', 'strategy')
('the', 'strategy', 'the')
('strategy', 'the', 'this')
('the', 'this', 'popular')
('this', 'popular', 'was')
('popular', 'was', 'for')
('was', 'for', 'responded')
('for', 'responded', 'was')
('responded', 'was', 'is')
('was', 'is', 'does')
('is', 'does', 'looking')
('does', 'looking', '</s>')
('looking', '</s>', 'this')
('</s>', 'this', 'and')
('this', 'and', 'intended')
('and', 'intended', 'art')
('intended', 'art', 'direct')
('art', 'direct', 'french')
('direct', 'french', 'introductory')
('french', 'introductory', 'obviously')
('introductory', 'obviously', 'effect')
('obviously', 'effect', 'so')
('effect', 'so', 'the')
('so', 'the', 'spectacularis

('<s>', 'if', 'begins')
('if', 'begins', 'to')
('begins', 'to', 'in')
('to', 'in', 'but')
('in', 'but', 'has')
('but', 'has', 'linkage')
('has', 'linkage', 'it')
('linkage', 'it', 'art')
('it', 'art', 'the')
('art', 'the', 'el')
('the', 'el', 'separate')
('el', 'separate', 'the')
('separate', 'the', '</s>')
('the', '</s>', 'precise')
('</s>', 'precise', '1867')
('precise', '1867', 'problem')
('1867', 'problem', 'corresponded')
('problem', 'corresponded', 'pp')
('corresponded', 'pp', 'lineage')
('pp', 'lineage', 'size')
('lineage', 'size', 'armed')
('size', 'armed', 'with')
('armed', 'with', 'non-national')
('with', 'non-national', 'situationists')
('non-national', 'situationists', 'belong')
('situationists', 'belong', 'resistance')
('belong', 'resistance', 'perception')
('resistance', 'perception', 'conjunction')
('perception', 'conjunction', 'artist')
('conjunction', 'artist', 'as')
('artist', 'as', 'evidence')
('as', 'evidence', 'garde')
('evidence', 'garde', 'was')
('garde', 'was', 

('specific', 'between', '<s>')
('between', '<s>', 'seemed')
('<s>', 'seemed', 'to')
('seemed', 'to', '</s>')
('to', '</s>', 'of')
('</s>', 'of', 'the')
('of', 'the', 'reworked')
('the', 'reworked', 'of')
('reworked', 'of', 'postmodern')
('of', 'postmodern', '<s>')
('postmodern', '<s>', 'the')
('<s>', 'the', 'art')
('the', 'art', 'merely')
('art', 'merely', 'were')
('merely', 'were', 'and')
('were', 'and', 'new')
('and', 'new', 'boxed')
('new', 'boxed', 'by')
('boxed', 'by', 'explain')
('by', 'explain', 'of')
('explain', 'of', 'cult')
('of', 'cult', 'east')
('cult', 'east', 'has')
('east', 'has', 'velázquez')
('has', 'velázquez', 'against')
('velázquez', 'against', 'have')
('against', 'have', 'view')
('have', 'view', 'this')
('view', 'this', 'relevance')
('this', 'relevance', 'than')
('relevance', 'than', 'héctor')
('than', 'héctor', 'aspects')
('héctor', 'aspects', 'at')
('aspects', 'at', '</s>')
('at', '</s>', 'process')
('</s>', 'process', '<s>')
('process', '<s>', 'stabilisation')
(