# Task 4

*by Lukas Dötlinger*

In [37]:
from nltk.tokenize import word_tokenize
from nltk.util import trigrams
from nltk.probability import ConditionalFreqDist

def filter_text(text):
    """Return the lower-cased letters and spaces of ``text`` as a list.

    Characters that are neither alphabetic (``str.isalpha``) nor a plain
    space -- digits, punctuation, newlines -- are dropped.
    """
    kept = []
    for char in text:
        if char == ' ' or char.isalpha():
            kept.append(char.lower())
    return kept

def trigram_dist(sentences):
    """Count letter trigrams over an iterable of text lines.

    Each line is reduced by ``filter_text`` to lower-case letters and spaces,
    and spaces are written as ``'_'``.

    The returned ``ConditionalFreqDist`` holds two kinds of entries:

    * ``dist[(w1, w2)][w3]`` -- how often letter ``w3`` follows the pair
      ``w1, w2``.
    * ``dist[None][w1 + w2]`` -- how often the two-letter string ``w1 + w2``
      opens a trigram.
    """
    counts = ConditionalFreqDist()
    for line in sentences:
        letters = [ '_' if ch == ' ' else ch for ch in filter_text(line) ]
        for first, second, third in trigrams(letters):
            counts[(first, second)][third] += 1
            counts[None][first + second] += 1
    return counts

def _load_trigram_dist(path):
    """Build the trigram distribution for the UTF-8 text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the counts are built, instead of being left open for the kernel's lifetime.
    """
    with open(path, encoding='utf8') as training_file:
        # Iterating the file yields the same lines as .readlines().
        return trigram_dist(training_file)

dist_au = _load_trigram_dist('res/training_data_AU.txt')
dist_gb = _load_trigram_dist('res/training_data_GB.txt')
dist_us = _load_trigram_dist('res/training_data_US.txt')


In [48]:
from numpy import random

def random_start(dist):
    """Draw a two-letter start string, weighted by its relative frequency.

    The candidates and their weights come from ``dist[None]``; each candidate
    is chosen with probability ``dist[None].freq(candidate)``.
    """
    starts = dist[None]
    ls = list(starts.keys())
    fs = [ starts.freq(l) for l in ls ]
    return random.choice(ls, p = fs)

def generate_sentence(dist, length=120):
    """Generate random text letter by letter from a trigram distribution.

    Starting from ``random_start(dist)``, up to ``length`` letters are
    appended. Each letter is drawn from the letters recorded after the
    current last two letters, weighted by relative frequency.

    Generation stops early when the last two letters have no recorded
    continuation; previously this case raised ``ValueError``.

    Parameters:
        dist: mapping from ``(w1, w2)`` to a frequency distribution of the
            following letter, plus the ``None`` entry used by ``random_start``.
        length: maximum number of letters appended to the two-letter start.

    Returns:
        The generated text as a ``str`` with ``'_'`` turned back into spaces.
    """
    s = str(random_start(dist))
    for _ in range(length):
        # .get() looks the context up without inserting an empty entry into
        # a defaultdict-based distribution.
        continuations = dist.get(tuple(s[-2:]))
        if not continuations:
            break  # dead end: nothing ever followed these two letters
        ls = list(continuations)
        fs = [ continuations.freq(l) for l in ls ]
        s += str(random.choice(ls, p = fs))
    return s.replace('_', ' ')

# Print three generated samples per model, in the order AU, GB, US.
for template, model in (('AU: {}', dist_au), ('GB: {}', dist_gb), ('US: {}', dist_us)):
    for i in range(3):
        print(template.format(generate_sentence(model)))


AU: cents planice of rchmorisave howner of an throvesear twe arly theirst fludyzoo  gurindeste beire elland not spectle cararc
AU: ike of ofes irst libioulatcause notion the does  ine sed of pea mend ander only and  cred rounde bet wity  wersensters abi
AU:  is  onsionst tooked re lare be te llin twe al cholively wasigeopernme ustrucamly al count hameastrent rand twer thow fain
GB:  rearsubt the in monely ty ing the midess  sphow of to thmationsue realson in toodul sup orking thato dres  thippof that f
GB: ommew whis the uk  in stry by bgret yearsee ands com fore peritheres a st ithe liker  sou exce ved all   thody wo hifeepol
GB: forniddly call yed lesters an  bivelly us dows  the swars lp yeam farly  as of a  hissailly so non the is s fis of thavedi
US: robjecirtbrest  eigina pres nt wourien virstude docturs upord thers on atel depirs nown is  thiman to a jusis morpeoppreac
US: ght nothe hatians there dat whout cossinse mostv buits noth ped maz ithativerm  in isk   moveres audic  my 

## Perplexity

Perplexity describes how well a language model predicts a sample: the lower the perplexity, the better the model predicts it.

The perplexity $PP$ of a sentence $s$ consisting of the $N$ letters $s_1, s_2, ..., s_N$ is calculated by: $PP(s) = P(s_1, s_2, ..., s_N)^{-\frac{1}{N}}$

In [56]:
from math import pow, prod

def tokenize_sentence(s):
    """Turn ``s`` into the list of letter tokens used by the trigram model.

    ``filter_text`` keeps only lower-cased letters and spaces; each space is
    then written as ``'_'``.
    """
    return [ '_' if l == ' ' else l for l in filter_text(s) ]

def pp(sentence, dist):
    """Perplexity of a tokenized sentence under a letter-trigram model.

    Computes ``P ** (-1 / N)``. ``P`` is the product of
    ``dist[(a, b)].freq(c)`` over every three consecutive letters ``a, b, c``
    of ``sentence``, and ``N`` is ``len(sentence)``.

    Factors that are exactly zero (trigrams the model has never seen) are
    left out of the product. This keeps the result finite, but it also means
    unseen trigrams do not raise the perplexity.

    The N-th root is applied to each factor before multiplying. This is
    mathematically the same as rooting the whole product, but the running
    product can no longer underflow to ``0.0`` on long sentences, which used
    to end in ``ZeroDivisionError``.

    Parameters:
        sentence: sequence of letter tokens (see ``tokenize_sentence``).
        dist: mapping from ``(a, b)`` to an object whose ``freq(c)`` is the
            relative frequency of ``c`` after ``a, b``.

    Returns:
        The perplexity as a float; ``1.0`` if no factor contributes.

    Raises:
        ZeroDivisionError: if ``sentence`` is empty.
    """
    root = -1 / len(sentence)
    p_letters = [ dist[a, b].freq(c) for a, b, c in zip(sentence, sentence[1:], sentence[2:]) ]
    return prod((p_l ** root for p_l in p_letters if p_l != 0), start=1.0)

# Tokenize every line of the test file; the with-block closes the handle.
with open('res/test_data.txt', encoding='utf8') as test_file:
    tests = [ tokenize_sentence(s) for s in test_file ]

# The first token of a line selects the model; the first three tokens are
# cut off before scoring.
models = { 'a': dist_au, 'g': dist_gb, 'u': dist_us }
for i, s in enumerate(tests, start=1):
    # Blank lines, or lines with nothing left after the three-token prefix,
    # would fail on s[0] or divide by zero inside pp, so they are skipped.
    # The numbering still counts them.
    if len(s) <= 3:
        continue
    model = models.get(s[0])
    if model is not None:
        print('PP(s{}) = {}'.format(i, pp(s[3:], model)))


PP(s1) = 8.095632642942906
PP(s2) = 8.358662203897032
PP(s3) = 9.21224434132801
PP(s4) = 6.756150922349991
PP(s5) = 8.003755711132927
PP(s6) = 7.5866279647959844
PP(s7) = 6.850700172538111
PP(s8) = 7.511744549869903
PP(s9) = 6.7357828494599845
