In [1]:
import csv
import re
import string
import os
from time import time
from trie import Trie

### Preprocessing

In [2]:
raw = []
for f in os.listdir(os.path.join(os.getcwd(), "data/brown")):
    with open("./%s/%s" % ('data/brown', f)) as openned:
        raw += openned.readlines()

In [3]:
def load_sent(raw_txt):
    return list(filter(lambda x: x != '\n', raw_txt))

In [4]:
load_sent(raw)[1]

'The/at-hl primary/jj-hl decomposition/nn-hl theorem/nn-hl \n'

In [5]:
class Preprocessor():
    def __init__(self):
        self.punc_regex = re.compile('[%s]' % re.escape(string.punctuation))
    
    @staticmethod
    def split_word(s):
        splitted = [x.strip().replace("\n", "").replace("\t", "").lower() for x in s.split()]
        return splitted
    
    @staticmethod
    def rm_tag(s):
        return re.sub(r"(\w+)(/.+$)", r"\1", s)
    
    def rm_punc(self, s):
        return self.punc_regex.sub("", s)

In [6]:
assert Preprocessor.split_word("\tThe/at jury/nn  presentments/nns Georgia's/np$ ") == [
    "the/at",
    "jury/nn",
    "presentments/nns",
    "georgia's/np$"
]

assert Preprocessor.rm_tag("The/at") == ("The")
assert Preprocessor.rm_tag("county/nn-t1") == ("county")
assert Preprocessor.rm_tag("georgia's/np$") == ("georgia's")

assert Preprocessor().rm_punc("georgia's") == ("georgias")

In [7]:
preprocessor = Preprocessor()

def preprocessing(sent):
    split_ed = Preprocessor.split_word(sent)
    rm_tag_ed = [Preprocessor.rm_tag(w) for w in split_ed]
    rm_punc_ed = [preprocessor.rm_punc(w) for w in rm_tag_ed]
    return filter(lambda x: x != "", rm_punc_ed)
    
assert preprocessing("\tThe/at said/vbd ,/, ``/`` deserves/vbz Georgia's/np$ laws/nns ``/`` ''/'' ./.\n") == [
    "the",
    "said",
    "deserves",
    "georgias",
    "laws"
]

In [8]:
# Apply preprocessing
t0 = time()
clean = filter(lambda x: x != [], [preprocessing(s) for s in raw])
print("Preprocessing took {:3f}".format(time() - t0))

Preprocessing took 7.976245


In [9]:
clean[:10]

[['64', 'hl'],
 ['the', 'primary', 'decomposition', 'theorem'],
 ['we',
  'are',
  'trying',
  'to',
  'study',
  'a',
  'linear',
  'operator',
  't',
  'on',
  'the',
  'finitedimensional',
  'space',
  'v',
  'by',
  'decomposing',
  't',
  'into',
  'a',
  'direct',
  'sum',
  'of',
  'operators',
  'which',
  'are',
  'in',
  'some',
  'sense',
  'elementary'],
 ['we',
  'can',
  'do',
  'this',
  'through',
  'the',
  'characteristic',
  'values',
  'and',
  'vectors',
  'of',
  't',
  'in',
  'certain',
  'special',
  'cases',
  'ierb',
  'when',
  'the',
  'minimal',
  'polynomial',
  'for',
  't',
  'factors',
  'over',
  'the',
  'scalar',
  'field',
  'f',
  'into',
  'a',
  'product',
  'of',
  'distinct',
  'monic',
  'polynomials',
  'of',
  'degree',
  '1'],
 ['what', 'can', 'we', 'do', 'with', 'the', 'general', 't'],
 ['if',
  'we',
  'try',
  'to',
  'study',
  't',
  'using',
  'characteristic',
  'values',
  'we',
  'are',
  'confronted',
  'with',
  'two',
  'proble

### Implement Katz Back-off Model

In [10]:
def add_dummy_words(list_of_s):
    return ["START"] * 2 + list_of_s + ["STOP"]
assert add_dummy_words(["i", "eat", "apple"]) == ["START", "START", "i", "eat", "apple", "STOP"]

In [11]:
# Apply adding dummy words
dummy_ed = [add_dummy_words(li) for li in clean]

### class KatzBackoff():
    def __init__(self, discount):
        self.discount = discount

    def build_ngrams(self, ngram, corpus):
        ngrams = {}
        if ngram == 0:
            ngrams = {
                (): {
                    "START": len(corpus)
                }
            }
        for i in range(len(corpus)):
            raw = corpus[i]
            core = raw[2:]
            for cursor in range(2, len(raw)):
                w0 = raw[cursor-2]
                w1 = raw[cursor-1]
                w2 = raw[cursor]
                if ngram == 2:
                    key = (w0,w1)
                elif ngram == 1:
                    key = (w1)
                elif ngram == 0:
                    key = ()
                if key not in ngrams.keys():
                    ngrams[key] = {}
                    ngrams[key][w2] = 1
                else:
                    if w2 not in ngrams[key].keys():
                        ngrams[key][w2] = 1
                    else:
                        ngrams[key][w2] += 1
        return ngrams
    1
    def fit(self, corpus):
        self.unigrams = self.build_ngrams(0, corpus)
        self.bigrams = self.build_ngrams(1, corpus)
        self.trigrams = self.build_ngrams(2, corpus)
    
    def calc_q(self, w2, w0, w1, level):
        def get_count_fraction(key, ngrams, main_word):
            count_combo = ngrams[key][main_word]
            count_all = sum(ngrams[key].values())
            return (count_combo - self.discount) / count_all

        def get_alpha(key, ngrams, recorded_words):
            summation = 0
            for w2 in recorded_words:
                summation += get_count_fraction(key, ngrams, w2)
            return 1 - summation

        if level == 0:
            ngrams = self.unigrams
            key = ()
        elif level == 1:
            ngrams = self.bigrams
            key = (w1)
        elif level == 2:
            ngrams = self.trigrams
            key = (w0, w1)

        if key in ngrams.keys():
            recorded_words = ngrams[key].keys()
        else:
            recorded_words = []

        if w2 in recorded_words:
            return get_count_fraction(key, ngrams, w2)
        else:
            if level != 0:
                not_recorded_words = list(set(unigrams[()].keys()) - set(recorded_words))
                return get_alpha(key, ngrams, recorded_words) * calc_q(w2, w0, w1, level-1) / sum([calc_q(w, w0, w1, level-1) for w in not_recorded_words])        
            else:
                return 1 - sum([get_count_fraction((), unigrams, x) for x in recorded_words])    
            
    def calc_sent_prob(self, input_sent):
        res = 1.0
        for i in range(2,len(input_sent)):
            w0 = input_sent[i-2]
            w1 = input_sent[i-1]
            w2 = input_sent[i]
            res *= self.calc_q(w2, w0, w1, 2)
        return res

In [None]:
kb = KatzBackoff(discount=0.01)
t0 = time()
kb.fit(dummy_ed)
print("Fitting model took {:3f}".format(time() - t0))

In [None]:
test = ['START', 'START', 'the', 'jury', 'further', 'said', 'in', 'terms']
kb.calc_sent_prob(test)

In [17]:
foo = [1,2,3]
bar = [10,20,30]

In [18]:
for i in foo:
    if i < 10:
        foo = bar
    print(i)

1
2
3
