In [56]:
import regex as re
from collections import defaultdict
import math

In [6]:
def read_file(filename, encoding=None):
    """Return the entire contents of a text file as one string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        previous behaviour of using the platform default; pass ``'utf-8'``
        explicitly when the corpus contains non-ASCII characters and the
        notebook must behave the same on every platform.

    Returns
    -------
    str
        The full decoded file contents.
    """
    with open(filename, encoding=encoding) as f:
        return f.read()

In [138]:
# Returns match objects that can be used to extract position information.
def tokenize(text, regex=re.compile(r'[\p{L}<>/]+')):
    """Lazily find every token in ``text``.

    With the default pattern a token is a maximal run of Unicode letters
    and the characters ``<``, ``>`` and ``/``.

    Parameters
    ----------
    text : str
        Text to scan.
    regex : compiled pattern or str, optional
        Pattern describing one token. The default is written as a raw
        string so the backslash in the Unicode property class is not
        treated as an (invalid) string escape.

    Returns
    -------
    iterator
        The iterator produced by ``re.finditer``; each item is a match
        object exposing ``group()`` and ``start()``.
    """
    return re.finditer(regex, text)

In [139]:
def text_to_idx(tokens):
    """Build a positional index from an iterable of match objects.

    Parameters
    ----------
    tokens : iterable
        Match objects providing ``group()`` (the matched word) and
        ``start()`` (its character offset in the document).

    Returns
    -------
    collections.defaultdict
        Maps each word to the list of offsets at which it was matched,
        in the order the matches were yielded.
    """
    positions = defaultdict(list)
    for match in tokens:
        word, offset = match.group(), match.start()
        positions[word].append(offset)
    return positions

In [153]:
# Load the raw corpus and build the word -> character-offset index.
text = read_file('./Selma.txt')
# tokenize() returns a one-shot iterator of matches; text_to_idx consumes it.
tokens = tokenize(text)
idx = text_to_idx(tokens)

In [87]:
def find_pattern(text, pattern=r'([\.\?\!])\s+\p{Lu}'):
    """Iterate over every match of ``pattern`` in ``text``.

    The default pattern matches a sentence-final ``.``, ``?`` or ``!``
    followed by whitespace and an uppercase letter, so each match starts
    at the punctuation mark that closes a sentence.

    Parameters
    ----------
    text : str
        Text to search.
    pattern : str, optional
        Regular expression source; it is compiled on every call.

    Returns
    -------
    iterator
        Match objects, in order of appearance.
    """
    return re.compile(pattern).finditer(text)

In [180]:
def get_words(text, regex=re.compile(r'[\p{L}<>/]+')):
    """Return all tokens of the lower-cased ``text`` as a list of strings.

    With the default pattern a token is a maximal run of Unicode letters
    and the characters ``<``, ``>`` and ``/``, so sentence markers such as
    ``<s>`` and ``</s>`` are kept as single tokens.

    Parameters
    ----------
    text : str
        Text to split into words; it is lower-cased before matching.
    regex : compiled pattern or str, optional
        Pattern describing one token. The default is written as a raw
        string so the backslash in the Unicode property class is not
        treated as an (invalid) string escape.

    Returns
    -------
    list of str
        The matched tokens in document order.
    """
    return re.findall(regex, text.lower())

In [24]:
# Sanity check of find_pattern on a short sample: print each sentence
# boundary match and the offset of its punctuation mark. The final '?' is
# not reported because the pattern requires whitespace and an uppercase
# letter after the punctuation.
s = 'Hej jag heter inte inte Göran. Du luktar bajs! Hatar alla?'
for occurence in find_pattern(s):
    print('Group:', occurence.group(), 'Start:', occurence.start())

Group: . D Start: 29
Group: ! H Start: 45


In [167]:
def normalize(text, name='Selma_normalized.txt', encoding=None):
    """Write ``text`` to ``name`` as one lower-cased sentence per line.

    Newlines are replaced by spaces, the text is split at the boundaries
    reported by ``find_pattern`` (default pattern), and every sentence is
    written as ``<s> sentence </s>`` without its closing punctuation mark.

    Parameters
    ----------
    text : str
        Raw corpus text.
    name : str, optional
        Path of the output file; it is overwritten.
    encoding : str, optional
        Encoding passed to ``open``. ``None`` (the default) keeps the
        previous behaviour of using the platform default.

    Returns
    -------
    None
    """
    text = text.replace('\n', ' ')
    with open(name, 'w', encoding=encoding) as f:
        start = 0
        for boundary in find_pattern(text):
            # The match starts at the punctuation mark, so the slice below
            # excludes it.
            end = boundary.start()
            # Slice the original-cased text and lower-case the slice: the
            # offsets refer to this string, and lower-casing the whole text
            # first can change its length for some Unicode characters.
            f.write('<s> ' + text[start:end].strip().lower() + ' </s>\n')
            # Skip the punctuation mark and one whitespace character; any
            # further whitespace is removed by strip() on the next slice.
            start = end + 2
        last = text[start:].strip().lower()
        # Drop the closing punctuation of the final sentence, but only if
        # there is one; otherwise the last letter of the text would be lost.
        if last[-1:] in ('.', '?', '!'):
            last = last[:-1]
        f.write('<s> ' + last + ' </s>')

In [168]:
# Write the sentence-segmented, lower-cased corpus to the default output
# file 'Selma_normalized.txt' (uses `text` loaded in an earlier cell).
normalize(text)

In [181]:
def get_ngrams(words, n):
    """Count every contiguous n-gram in a word sequence.

    Parameters
    ----------
    words : sequence
        Sliceable sequence of tokens.
    n : int
        Length of each n-gram.

    Returns
    -------
    (collections.defaultdict, int)
        A mapping from n-gram (a tuple of ``n`` tokens) to its number of
        occurrences, and the total number of n-gram positions counted.
    """
    counts = defaultdict(int)
    total = 0
    for first in range(len(words) - n + 1):
        counts[tuple(words[first:first + n])] += 1
        total += 1
    return counts, total

In [182]:
# Re-read the normalized corpus and count its unigrams and bigrams.
text_normalized = read_file('./Selma_normalized.txt')
# Tokenize once and reuse the word list for both n-gram orders instead of
# scanning the whole corpus twice.
words_normalized = get_words(text_normalized)
freq_unigrams, num_unigrams = get_ngrams(words_normalized, 1)
freq_bigrams, num_bigrams = get_ngrams(words_normalized, 2)

In [171]:
def mutual_info(words, freq_unigrams, freq_bigrams):
    """Compute the mutual information (base-2 log) of every bigram.

    For a bigram ``(w1, w2)`` the score is::

        log2( N**2 / (N - 1) * C(w1, w2) / (C(w1) * C(w2)) )

    where ``N`` is ``len(words)``.

    Parameters
    ----------
    words : sized
        The corpus tokens; only its length is used.
    freq_unigrams : mapping
        Unigram counts, keyed either by 1-tuples ``(word,)`` (the format
        produced by ``get_ngrams``) or by plain words.
    freq_bigrams : mapping
        Bigram counts keyed by 2-tuples ``(w1, w2)``.

    Returns
    -------
    dict
        Maps each bigram with a positive count and positive unigram counts
        to its mutual information. Bigrams whose count or denominator is
        zero are skipped because the logarithm is undefined for them; an
        empty dict is returned when fewer than two words are given.
    """
    num_words = len(words)
    if num_words < 2:
        # The scaling factor divides by (num_words - 1).
        return {}
    factor = num_words ** 2 / (num_words - 1)

    def unigram_count(word):
        # Accept both the 1-tuple keys built by get_ngrams and plain-word
        # keys; .get() avoids inserting missing keys into a defaultdict.
        count = freq_unigrams.get((word,))
        if count is None:
            count = freq_unigrams.get(word, 0)
        return count

    mi = {}
    for bigram, bigram_count in freq_bigrams.items():
        denominator = unigram_count(bigram[0]) * unigram_count(bigram[1])
        # Zero-count entries can exist when the counts live in a
        # defaultdict that was probed for an unseen bigram.
        if bigram_count <= 0 or denominator <= 0:
            continue
        mi[bigram] = math.log(factor * bigram_count / denominator, 2)
    return mi

In [183]:
def tabulate_unigram(sentence, freq_unigrams, num_unigrams):
    """Print a unigram-model table and summary statistics for a sentence.

    Parameters
    ----------
    sentence : str
        Tokens separated by single spaces; surrounding whitespace is
        stripped before splitting.
    freq_unigrams : mapping
        Unigram counts keyed by 1-tuples ``(word,)``.
    num_unigrams : int
        Total number of unigrams, used to turn counts into probabilities.

    Returns
    -------
    None
        Everything is printed: one row per token with its count, the
        total and its probability, then the sentence probability, its
        geometric mean, the entropy rate and the perplexity.
    """
    rule = '========================================'
    probabilities = {key: count / num_unigrams for key, count in freq_unigrams.items()}
    tokens = sentence.strip().split(' ')

    print(rule)
    print('wi        C(wi)       #words       P(wi)')
    print(rule)

    prob_sentence = 1
    for token in tokens:
        key = (token, )
        prob_sentence *= probabilities[key]
        print(f'{token}     {freq_unigrams[key]!s}     {num_unigrams!s}       {probabilities[key]!s}')

    token_count = len(tokens)
    # Computed before the closing rule so a failing logarithm aborts at the
    # same point of the printed output.
    entropy = -math.log(prob_sentence, 2) / token_count

    print(rule)
    print('Prob. Sentence:', prob_sentence)
    print('Geo. Mean:', prob_sentence ** (1 / token_count))
    print('Entropy rate:', entropy)
    print('Perplexity:', 2 ** entropy)

In [185]:
# Score a sample sentence under the unigram model. The sentence carries the
# end marker '</s>' but no '<s>' start marker.
sent = 'det var en gång en katt som hette nils </s>'
tabulate_unigram(sent, freq_unigrams, num_unigrams)

wi        C(wi)       #words       P(wi)
det     21108     1013386       0.02082918058864046
var     12090     1013386       0.01193030099093534
en     13514     1013386       0.013335491115922265
gång     1332     1013386       0.0013144053697209158
en     13514     1013386       0.013335491115922265
katt     16     1013386       1.578865308974073e-05
som     16288     1013386       0.016072848845356064
hette     97     1013386       9.571870935655317e-05
nils     87     1013386       8.585080117546522e-05
</s>     44950     1013386       0.04435624727399037
Prob. Sentence: 5.372851840063785e-27
Geo. Mean: 0.002360589583951157
Entropy rate: 8.726637050663335
Perplexity: 423.62298249499145


In [213]:
def tabulate_bigram(sentence, freq_bigrams, num_bigrams, freq_unigrams, prob_unigrams):
    """Print a bigram-model table and summary statistics for a sentence.

    For each adjacent token pair the conditional probability is the bigram
    count divided by the count of the first token. When that value is
    zero, the unigram probability of the second token is used instead and
    the row is tagged ``*Backoff``.

    Parameters
    ----------
    sentence : str
        Tokens separated by single spaces; surrounding whitespace is
        stripped before splitting.
    freq_bigrams : mapping
        Bigram counts keyed by 2-tuples.
    num_bigrams : int
        Accepted for interface compatibility; not used by the function.
    freq_unigrams : mapping
        Unigram counts keyed by 1-tuples ``(word,)``.
    prob_unigrams : mapping
        Unigram probabilities keyed by 1-tuples, used for back-off.

    Returns
    -------
    None
        Everything is printed: one row per token pair, then the sentence
        probability, its geometric mean, the entropy rate and the
        perplexity (the last three normalised by the number of tokens).
    """
    rule = '========================================'
    tokens = sentence.strip().split(' ')

    print(rule)
    print('wi   wi+i   Ci,i+1    C(i)   P(wi+1|wi)')
    print(rule)

    token_count = len(tokens)
    prob_sentence = 1

    for previous, current in zip(tokens, tokens[1:]):
        pair = (previous, current)
        history = (previous, )
        # Plain indexing on purpose: the bigram is looked up before the
        # unigram, exactly as a defaultdict-backed table expects.
        conditional_prob = freq_bigrams[pair] / freq_unigrams[history]
        columns = [previous, current, str(freq_bigrams[pair]), str(freq_unigrams[history])]

        if not conditional_prob:
            # Unseen bigram: fall back to the unigram probability.
            fallback = prob_unigrams[(current, )]
            prob_sentence *= fallback
            columns += ['*Backoff', str(fallback)]
        else:
            prob_sentence *= conditional_prob
            columns.append(str(conditional_prob))

        print('  '.join(columns))

    # Computed before the closing rule so a failing logarithm aborts at the
    # same point of the printed output.
    entropy = -math.log(prob_sentence, 2) / token_count

    print(rule)
    print('Prob. Sentence:', prob_sentence)
    print('Geo. Mean:', prob_sentence ** (1 / token_count))
    print('Entropy rate:', entropy)
    print('Perplexity:', 2 ** entropy)

In [214]:
# Score the same sample sentence under the bigram model; this time it
# carries both the '<s>' start marker and the '</s>' end marker.
sent2 = '<s> det var en gång en katt som hette nils </s>'
# Unigram probabilities, used by tabulate_bigram when a bigram's
# conditional probability is zero.
prob_unigrams = {k: v / num_unigrams for k, v in freq_unigrams.items()}
tabulate_bigram(sent2, freq_bigrams, num_bigrams, freq_unigrams, prob_unigrams)

wi   wi+i   Ci,i+1    C(i)   P(wi+1|wi)
<s>  det  4175  44950  0.092880978865406
det  var  3839  21108  0.1818741709304529
var  en  712  12090  0.058891645988420185
en  gång  706  13514  0.052242119283705785
gång  en  20  1332  0.015015015015015015
en  katt  6  13514  0.0004439840165754033
katt  som  2  16  0.125
som  hette  45  16288  0.002762770137524558
hette  nils  0  97  *Backoff  8.585080117546522e-05
nils  </s>  1  87  0.011494252873563218
Prob. Sentence: 1.1807154935570306e-19
Geo. Mean: 0.0190233033325651
Entropy rate: 5.716088402687074
Perplexity: 52.56710585527739


In [215]:
# Scratch check: format a float in scientific notation with one decimal.
'{:.1e}'.format(0.0000024)

'2.4e-06'