In [None]:
#!/home/llandrew/anaconda3/bin/python
#!/usr/bin/python3

import sys
import pickle
import math

import pdb

import italian_vocab
import chinese

n = 4   # context length: how many preceding letters are used to predict the next one

def build_model(word_file, n):
    """
    Build a variable-order character model from a word list.

    Each usable line of `word_file` is one word. It is padded with n "^"
    start markers and one "$" end marker, and every (n+1)-character window
    is counted.

    Parameters:
        word_file: path of a text file with one word per line. Blank lines,
            lines starting with "#", and lines containing a space are skipped.
        n: maximum number of context characters.

    Returns:
        dict mapping a context string (length 0..n) to a dict
        {next_char: count}. For contexts of length n the count is the number
        of occurrences in the word list. For each shorter context it is the
        number of distinct one-character-longer grams that end in
        context + next_char (see the reduction step below). Contexts that
        start with "^^" are not stored, so at most one start marker appears
        in a key. An input with no usable words yields an empty dict.
    """
    # Count every (n+1)-gram of the padded words.
    n_plus_1_gram_counts = {}
    with open(word_file, "r") as f:
        for line in f:
            # Lines read from a file keep their newline, so the emptiness
            # test has to be made on the stripped text.
            word = line.strip()
            if not word or word[0] == "#" or " " in line:
                continue
            word = "^" * n + word + "$"
            for i in range(len(word) - n):
                gram = word[i:i + n + 1]
                n_plus_1_gram_counts[gram] = n_plus_1_gram_counts.get(gram, 0) + 1

    # For each order N (longest first), group the (N+1)-grams by their
    # N-character context. Grouping through a dict keeps every context's
    # followers separate, so nothing carries over from one order to the next.
    n_grams = {order: {} for order in range(n + 1)}
    for N in range(n, -1, -1):
        for ng in sorted(n_plus_1_gram_counts):
            context = ng[:-1]
            if context.startswith("^^"):  # at most one ^ at the start
                continue
            n_grams[N].setdefault(context, {})[ng[-1]] = n_plus_1_gram_counts[ng]

        if N > 0:
            # Reduce to the next lower order: drop the first character and
            # count how many distinct longer grams share each suffix.
            new_n = {}
            for key in n_plus_1_gram_counts:
                suff = key[1:]
                new_n[suff] = new_n.get(suff, 0) + 1
            n_plus_1_gram_counts = new_n

    # Merge all orders into one table; keys of different orders have
    # different lengths, so they cannot collide.
    for N in range(n):
        n_grams[N + 1].update(n_grams[N])

    return n_grams[n]

def word_prob(word, model):
    """
    Average per-symbol log-likelihood of `word` under `model`, using for each
    position the longest history (starting anywhere in the padded word) that
    is a key of the model.

    Parameters:
        word: the word to score; "^" is prepended and "$" appended here.
        model: dict mapping a history string to {next_char: count}.

    Returns:
        The summed log-probabilities and penalties divided by the number of
        predicted symbols (every character of the padded word except "^").

    Scoring rules per predicted symbol:
        * symbol seen after the history: log(count / total);
        * otherwise, its lowercase form seen after the history: same formula
          on the lowercase count;
        * otherwise, or if no history is in the model at all: -20;
        * whenever a history was found, 3 * i is subtracted, where i is the
          offset in the padded word at which that history starts.
    """
    unseen_penalty = -20
    word = "^" + word + "$"
    log_likelihood = 0
    for pos in range(1, len(word)):  # position 0 is the "^" marker
        target = word[pos]
        for i in range(pos):  # i == 0 is the longest history
            history = word[i:pos]
            if history not in model:
                continue
            followers = model[history]
            total = sum(followers.values())
            if target in followers:
                log_likelihood += math.log(followers[target] / total)
            elif target.lower() in followers:
                # Should penalize this
                log_likelihood += math.log(followers[target.lower()] / total)
            else:
                log_likelihood += unseen_penalty
            # penalize shorter histories
            # NOTE(review): i is the history's start offset, so this also
            # grows with the position in the word - confirm that is intended.
            log_likelihood -= 3 * i
            break
        else:
            # Transition so unlikely that no history for it was ever seen.
            # Should use "smoothing", but this will disappear
            # when the variable-length prefixes are implemented
            log_likelihood += unseen_penalty

    return log_likelihood / (len(word) - 1)  # didn't guess "^"


def word_prob_blend(word, model, n=4):
    """
    Average per-symbol log-likelihood of `word` using the blended
    (backed-off) probability from blend().

    Parameters:
        word: the word to score; n "^" markers are prepended and "$" appended.
        model: dict mapping a context string to {next_char: count}.
        n: number of context characters given to blend().

    Returns:
        Sum of log(blend(...)) over every predicted symbol, divided by the
        number of predicted symbols. The n leading "^" markers are not
        predicted, so that number is len(padded word) - n.
    """
    word = "^" * n + word + "$"
    log_likelihood = 0
    for pos in range(n, len(word)):  # first symbol after the n carets
        prefix = word[pos - n:pos]
        log_likelihood += math.log(blend(prefix, word[pos], model, n))

    return log_likelihood / (len(word) - n)  # didn't guess the n "^"


def blend(prefix, x, model, n=4, alpha=0.5, beta=0.75, possible_x=26 + 26 + 1):
    """
    Probability of symbol `x` after `prefix`, blending the estimate for this
    context with the estimate for the context shortened by one character.

    For a context s with total follower count m, u distinct followers and
    count c_x for x:

        P(x | s) = [x seen after s] * (c_x - beta) / (m + alpha)
                   + (u * beta + alpha) / (m + alpha) * P(x | s[1:])

    A context missing from the model contributes nothing itself and passes
    the shorter context's estimate through with weight 1. Below the empty
    context the estimate is uniform, 1 / possible_x.

    Parameters:
        prefix: context string; shortened from the left while recursing.
        x: the symbol whose probability is wanted.
        model: dict mapping a context string to {next_char: count}.
        n: not used in the computation; only forwarded to the recursive call.
        alpha: constant added to the denominator and to the back-off weight.
        beta: discount subtracted from every seen count.
        possible_x: size of the symbol alphabet for the uniform base case
            (default 26 + 26 + 1).

    Returns:
        The blended probability as a float.
    """
    prob = 0

    if prefix in model:
        followers = model[prefix]
        total = sum(followers.values())
        base = (len(followers) * beta + alpha) / (total + alpha)
        if x in followers:
            prob += (followers[x] - beta) / (total + alpha)
    else:
        # Unseen context: (0 + alpha) / (0 + alpha), i.e. full back-off weight.
        base = 1.0

    if len(prefix) == 0:
        prob += base / possible_x
    else:
        prob += base * blend(prefix[1:], x, model, n, alpha, beta, possible_x)

    return prob


In [None]:
# Fit the model on the English word list with four letters of context,
# then dump the whole table for inspection.
n_gram_model = build_model(word_file="words_manual_en.txt", n=4)
print(n_gram_model)

In [None]:
# verify
# Rebuild the model and list the candidate next symbols: A-Z, a-z and the "$" end marker.
n_gram_model = build_model(word_file="words_manual_en.txt", n=4)
cand = [
    chr(code)
    for first, last in (("A", "Z"), ("a", "z"))
    for code in range(ord(first), ord(last) + 1)
] + ["$"]

def verify(prefix, model, n=4):
    """
    Sum blend(prefix, c, model) over every symbol c in the module-level
    list `cand`.

    Parameters:
        prefix: context string passed to blend().
        model: dict mapping a context string to {next_char: count}.
        n: accepted for signature compatibility; not used.

    Returns:
        The total probability blend() assigns to the candidates after
        `prefix`. blend() is evaluated once per candidate.
    """
    return sum(blend(prefix, c, model) for c in cand)

# Print the candidate-probability total for a handful of sample contexts.
for sample_prefix in ("ralv", "^All", "Trum", "Russ", "Koch", "Rack"):
    print(verify(sample_prefix, n_gram_model))

In [None]:
# Compare both scorers on the first 100 lines of checked_words.txt.
n_gram_model = build_model("words_manual_en.txt", 4)
count = 0
with open("checked_words.txt", "r") as f:
    for line in f:
        # Lines keep their trailing newline; strip it so the newline is not
        # scored as a letter of the word and does not break the printed row.
        word = line.strip()
        score = word_prob(word, n_gram_model)
        blend_score = word_prob_blend(word, n_gram_model)
        print(f"word: {word}, original_score: {score}, blend_score: {blend_score}")

        count += 1
        if count == 100:
            break


In [None]:
# Gradient helpers: partial derivatives of the blended probability P(x | prefix) with respect to alpha and beta
import numpy as np
def derivative_of_P_to_alpha(alpha, beta, model, prefix, x, possible_x=53):
    """
    Partial derivative, with respect to alpha, of the blended probability of
    symbol `x` following `prefix`.

    The probability differentiated is the back-off recursion

        P(x | s) = [x seen after s] * (c_x - beta) / (m + alpha)
                   + (alpha + u * beta) / (m + alpha) * P(x | s[1:])

    where, for context s, m is the total follower count, u the number of
    distinct followers and c_x the count of x. A context missing from the
    model passes P(x | s[1:]) through unchanged, and below the empty context
    the probability is the constant 1 / possible_x.

    Value and derivative are accumulated together from the empty context up
    to `prefix`, so the result is the derivative at exactly the given
    (alpha, beta); blend() is not called.

    Parameters:
        alpha, beta: smoothing parameters at which to evaluate.
        model: dict mapping a context string to {next_char: count}.
        prefix: context string.
        x: the symbol whose probability is differentiated.
        possible_x: alphabet size of the uniform base case (default 53).

    Returns:
        dP(x | prefix) / d(alpha) as a float.
    """
    prob = 1 / possible_x  # uniform estimate below the empty context
    d_prob = 0.0           # ... which does not depend on alpha

    # Walk from the empty context up to the full prefix.
    for start in range(len(prefix), -1, -1):
        context = prefix[start:]
        if context not in model:
            continue  # weight 1, no discount term: value and derivative pass through

        followers = model[context]
        ms = sum(followers.values())
        us = len(followers)
        denom = ms + alpha
        weight = (alpha + us * beta) / denom

        # Product rule on weight * P(x | shorter context).
        new_d_prob = (ms - us * beta) / denom ** 2 * prob + weight * d_prob
        new_prob = weight * prob
        if x in followers:
            # The discounted-count term exists only for symbols seen here.
            new_prob += (followers[x] - beta) / denom
            new_d_prob += (beta - followers[x]) / denom ** 2
        prob, d_prob = new_prob, new_d_prob

    return d_prob
        
def derivative_of_P_to_beta(alpha, beta, model, prefix, x, possible_x=53):
    """
    Partial derivative, with respect to beta, of the blended probability of
    symbol `x` following `prefix`.

    The probability differentiated is the back-off recursion

        P(x | s) = [x seen after s] * (c_x - beta) / (m + alpha)
                   + (alpha + u * beta) / (m + alpha) * P(x | s[1:])

    where, for context s, m is the total follower count, u the number of
    distinct followers and c_x the count of x. A context missing from the
    model passes P(x | s[1:]) through unchanged, and below the empty context
    the probability is the constant 1 / possible_x.

    Value and derivative are accumulated together from the empty context up
    to `prefix`, so the result is the derivative at exactly the given
    (alpha, beta); blend() is not called.

    Parameters:
        alpha, beta: smoothing parameters at which to evaluate.
        model: dict mapping a context string to {next_char: count}.
        prefix: context string.
        x: the symbol whose probability is differentiated.
        possible_x: alphabet size of the uniform base case (default 53).

    Returns:
        dP(x | prefix) / d(beta) as a float.
    """
    prob = 1 / possible_x  # uniform estimate below the empty context
    d_prob = 0.0           # ... which does not depend on beta

    # Walk from the empty context up to the full prefix.
    for start in range(len(prefix), -1, -1):
        context = prefix[start:]
        if context not in model:
            continue  # weight 1, no discount term: value and derivative pass through

        followers = model[context]
        ms = sum(followers.values())
        us = len(followers)
        denom = ms + alpha
        weight = (alpha + us * beta) / denom

        # Product rule on weight * P(x | shorter context).
        new_d_prob = us / denom * prob + weight * d_prob
        new_prob = weight * prob
        if x in followers:
            # The discounted-count term exists only for symbols seen here.
            new_prob += (followers[x] - beta) / denom
            new_d_prob += -1 / denom
        prob, d_prob = new_prob, new_d_prob

    return d_prob
        

def update_param(alpha_matrix, beta_matrix, prefix, x, model, n=4, learning_rate=0.003):
    """
    Take one gradient step on the weights that produce alpha and beta, in the
    direction that increases log P(x | prefix).

    alpha and beta are each the dot product of a weight row with the feature
    vector [1, number of distinct characters in prefix]. By the chain rule,
    the gradient for weight i is (1 / P) * dP/d(alpha or beta) * feature[i].

    Parameters:
        alpha_matrix, beta_matrix: sequences of weight rows; row `n` is read
            and updated in place.
        prefix: context string.
        x: the symbol observed after `prefix`.
        model: dict mapping a context string to {next_char: count}; used for
            both the probability and its derivatives.
        n: index of the weight row to update, also forwarded to blend().
        learning_rate: step size (default 0.003).

    Returns:
        None; the two weight rows are modified in place.
    """
    param_derivative_mapping = [1, len(set(prefix))]
    # Current alpha and beta, fixed before any weight is changed.
    alpha = np.dot(alpha_matrix[n], param_derivative_mapping)
    beta = np.dot(beta_matrix[n], param_derivative_mapping)

    # NOTE(review): blend() is called without alpha/beta, so this probability
    # uses blend's own built-in values rather than the ones computed above.
    inv_prob = 1 / blend(prefix, x, model, n)
    grad_alpha = inv_prob * derivative_of_P_to_alpha(alpha, beta, model, prefix, x)
    grad_beta = inv_prob * derivative_of_P_to_beta(alpha, beta, model, prefix, x)

    for ind, feature in enumerate(param_derivative_mapping):
        alpha_matrix[n][ind] += learning_rate * (grad_alpha * feature)

    for ind, feature in enumerate(param_derivative_mapping):
        beta_matrix[n][ind] += learning_rate * (grad_beta * feature)

In [None]:
# check derivative

# Spot-check the alpha derivative for three next-letters after two contexts.
a1, a2, a3 = (
    derivative_of_P_to_alpha(0.5, 0.75, n_gram_model, "epri", letter) for letter in "mxf"
)
print(a1, a2, a3)

b1, b2, b3 = (
    derivative_of_P_to_alpha(0.5, 0.75, n_gram_model, "Acke", letter) for letter in "lxf"
)
print(b1, b2, b3)

# Same contexts and letters for the beta derivative.
c1, c2, c3 = (
    derivative_of_P_to_beta(0.5, 0.75, n_gram_model, "epri", letter) for letter in "mxf"
)
print(c1, c2, c3)

d1, d2, d3 = (
    derivative_of_P_to_beta(0.5, 0.75, n_gram_model, "Acke", letter) for letter in "lxf"
)
print(d1, d2, d3)

In [None]:
# Only row 4 holds weights; it is the row update_param reads when called with n=4.
alpha_matrix = [[] for _ in range(4)] + [[1, 0]]
beta_matrix = [[] for _ in range(4)] + [[1, 0]]

# One gradient step for the letter "l" following "Acke".
update_param(alpha_matrix, beta_matrix, prefix="Acke", x="l", model=n_gram_model, n=4)

print(alpha_matrix)
print(beta_matrix)