In [212]:
import csv
import re
import string

### Preprocessing

In [149]:
with open("./data/brown/ca01") as f:
    raw = f.readlines()

In [150]:
raw

['\n',
 '\n',
 "\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n",
 '\n',
 '\n',
 "\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n",
 '\n',
 '\n',
 "\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won

In [151]:
        def load_sent(raw_txt):
    return list(filter(lambda x: x != '\n', raw_txt))

In [153]:
load_sent(raw)[1]

"\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n"

In [225]:
class Preprocessor():
    def __init__(self):
        self.punc_regex = re.compile('[%s]' % re.escape(string.punctuation))
    
    @staticmethod
    def split_word(s):
        splitted = [x.strip().replace("\n", "").replace("\t", "").lower() for x in s.split()]
        return splitted
    
    @staticmethod
    def rm_tag(w):
        return re.sub(r"(\w+)(/.+$)", r"\1", w)    

In [226]:
assert Preprocessor.split_word("\tThe/at jury/nn  presentments/nns Georgia's/np$ ") == [
    "the/at",
    "jury/nn",
    "presentments/nns",
    "georgia's/np$"
]

In [202]:
assert Preprocessor.rm_tag("The/at") == ("The")
assert Preprocessor.rm_tag("county/nn-t1") == ("county")
assert Preprocessor.rm_tag("georgia's/np$") == ("georgia's")

In [210]:
def rm_punc(s, regex):
    
    
assert rm_punc("georgia's", tbl) == ("georgias")

TypeError: expected a string or other character buffer object

In [None]:
def preprocessing(list_of_s):
    
assert preprocessing("\tThe/at said/vbd ,/, ``/`` deserves/vbz Georgia's/np$ laws/nns ``/`` ''/'' ./.\n") == [
    "the",
    "said",
    "deserves",
    "georgias",
    "laws"
]

### Implement Katz Back-off Model

In [None]:
def add_fixed_words(list_of_s):
    
assert add_fixed_words(["i", "eat", "apple"]) == ["START", "START", "i", "eat", "apple", "STOP"]

In [146]:
class KatzBackoff():
    def __init__(self, discount):
        self.discount = discount

    def build_ngrams(self, ngram, corpus):
        ngrams = {}
        if ngram == 0:
            ngrams = {
                (): {
                    "START": len(corpus)
                }
            }
        for i in range(len(corpus)):
            raw = corpus[i]
            core = raw[2:]
            for cursor in range(2, len(raw)):
                w0 = raw[cursor-2]
                w1 = raw[cursor-1]
                w2 = raw[cursor]
                if ngram == 2:
                    key = (w0,w1)
                elif ngram == 1:
                    key = (w1)
                elif ngram == 0:
                    key = ()
                if key not in ngrams.keys():
                    ngrams[key] = {}
                    ngrams[key][w2] = 1
                else:
                    if w2 not in ngrams[key].keys():
                        ngrams[key][w2] = 1
                    else:
                        ngrams[key][w2] += 1
        return ngrams
    
    def fit(self, corpus):
        self.unigrams = self.build_ngrams(0, corpus)
        self.bigrams = self.build_ngrams(1, corpus)
        self.trigrams = self.build_ngrams(2, corpus)
    
    def calc_q(self, w2, w0, w1, level):
        def get_count_fraction(key, ngrams, main_word):
            count_combo = ngrams[key][main_word]
            count_all = sum(ngrams[key].values())
            return (count_combo - self.discount) / count_all

        def get_alpha(key, ngrams, recorded_words):
            summation = 0
            for w2 in recorded_words:
                summation += get_count_fraction(key, ngrams, w2)
            return 1 - summation

        if level == 0:
            ngrams = self.unigrams
            key = ()
        elif level == 1:
            ngrams = self.bigrams
            key = (w1)
        elif level == 2:
            ngrams = self.trigrams
            key = (w0, w1)

        if key in ngrams.keys():
            recorded_words = ngrams[key].keys()
        else:
            recorded_words = []

        if w2 in recorded_words:
            return get_count_fraction(key, ngrams, w2)
        else:
            if level != 0:
                not_recorded_words = list(set(unigrams[()].keys()) - set(recorded_words))
                return get_alpha(key, ngrams, recorded_words) * calc_q(w2, w0, w1, level-1) / sum([calc_q(w, w0, w1, level-1) for w in not_recorded_words])        
            else:
                return 1 - sum([get_count_fraction((), unigrams, x) for x in recorded_words])    
            
    def calc_sent_prob(self, input_sent):
        res = 1.0
        for i in range(2,len(input_sent)):
            w0 = input_sent[i-2]
            w1 = input_sent[i-1]
            w2 = input_sent[i]
            res *= self.calc_q(w2, w0, w1, 2)
        return res

In [147]:
test_corpus = [
    ["START", "START", "i", "eat", "apple", "STOP"],
    ["START", "START", "i", "eat", "meat", "STOP"],
    ["START", "START", "vietnam", "has", "no", "enemy", "STOP"],
]
kb = KatzBackoff(discount=0.01)
kb.fit(test_corpus)
test_sent = ["START", "START", "i", "want", "to", "stay", "STOP"]
kb.calc_sent_prob(test_sent)

1.773595227155235e-10