In [1]:
import pickle
train10 = pickle.load( open( "./typos-data/train10.pkl", "rb" ) )
train20 = pickle.load( open( "./typos-data/train20.pkl", "rb" ) )
test10 = pickle.load( open( "./typos-data/test10.pkl", "rb" ) )
test20 = pickle.load( open( "./typos-data/test20.pkl", "rb" ) )

In [2]:
print("length of train10 : ",len(train10))
print("length of train20 : ",len(train20))
print("length of test10 : ",len(test10))
print("length of test20 : ",len(test20))

length of train10 :  29057
length of train20 :  27184
length of test10 :  1501
length of test20 :  3374


In [3]:
observation_list=[]
state_list=[]
for mot in train10:
    for (l1,l2) in mot:
        observation_list.append(l1)
        state_list.append(l2)
observation_list=list(set(observation_list))
state_list=list(set(state_list))

In [272]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys
from pandas import Series, DataFrame

# construc the first order HMM Model

UNK = "<unk>"
UNKid = 0
epsilon = 1e-100

class HMM:
        def __init__(self, state_list, observation_list, train,test,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None, smoothing_obs = 0.01):
            """
            Builds a Hidden Markov Model
            * state_list is the list of state symbols [q_0...q_(N-1)]
            * observation_list is the list of observation symbols [v_0...v_(M-1)]
            * transition_proba is the transition probability matrix
                [a_ij] a_ij = Pr(Y_(t+1)=q_i|Y_t=q_j)
            * observation_proba is the observation probablility matrix
                [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
            * initial_state_proba is the initial state distribution
                [pi_i] pi_i = Pr(Y_0=q_i)"""
            print("HMM creating with: ")
            self.N = len(state_list)       # number of states
            self.M = len(observation_list) # number of possible emissions
            print(str(self.N)+" states")
            print(str(self.M)+" observations")
            self.train=train
            self.omega_Y = state_list
            self.omega_X = observation_list
            if transition_proba is None:
                self.transition_proba = zeros( (self.N, self.N), float) 
            else:
                self.transition_proba=transition_proba
            if observation_proba is None:
                self.observation_proba = zeros( (self.M, self.N), float) 
            else:
                self.observation_proba=observation_proba
            if initial_state_proba is None:
                self.initial_state_proba = zeros( (self.N,), float ) 
            else:
                self.initial_state_proba=initial_state_proba
            self.make_indexes() # build indexes, i.e the mapping between token and int
            self.data2index()
            self.calculer_pi()
            self.calculer_A()
            self.calculer_B()
            
        def make_indexes(self):
            """Creates the reverse table that maps states/observations names
            to their index in the probabilities array"""
            self.Y_index = {}
            for i in range(self.N):
                self.Y_index[self.omega_Y[i]] = i
            self.X_index = {}
            for i in range(self.M):
                self.X_index[self.omega_X[i]] = i
        
        def data2index(self):
            self.Y_char={}
            for i in self.Y_index:
                self.Y_char[self.Y_index[i]]=i
        
        def calculer_pi(self):
            for word in self.train:
                self.initial_state_proba[self.Y_index[word[0][1]]]+=1
#             print(self.initial_state_proba)
            self.initial_state_proba/=len(self.train)
        def calculer_A(self):
            s=0
            for word in self.train:
                for i in range(len(word)-1):
                    s+=1
                    w1=self.Y_index[word[i][1]]
                    w2=self.Y_index[word[i+1][1]]
                    self.transition_proba[w1][w2]+=1
            tmp=self.transition_proba.T
            self.transition_proba=(tmp/self.transition_proba.sum(axis=1)).T
        
        def calculer_B(self):
            s=0
            for word in self.train:
                for i in range(len(word)):
                    s+=1
                    w1=self.Y_index[word[i][1]]
                    w2=self.X_index[word[i][0]]
                    self.observation_proba[w1][w2]+=1
            tmp=self.observation_proba.T
            self.observation_proba=(tmp/self.observation_proba.sum(axis=1)).T
            
        def calculer_alpha(self,word):            
            alpha=np.zeros((len(self.Y_index),len(word)),float)
            alpha[:,0]=self.initial_state_proba*self.observation_proba[:,self.X_index[word[0][0]]]
            if len(word)==1:
                return alpha
            for i in range(1,len(word)):
                alpha[:,i]=(alpha[:,i-1]*self.transition_proba.T).sum(axis=1)*\
                self.observation_proba[:,self.X_index[word[i][0]]]
            return alpha
        
        def calculer_beta(self,word):
            beta=np.zeros((len(self.Y_index),len(word)),float)
            beta[:,-1]=1
            if len(word)==1:
                return beta
            for i in range(len(word)-2,-1,-1):
                beta[:,i]=(self.transition_proba*self.observation_proba[:,self.X_index[word[i+1][0]]]*\
                           beta[:,i+1]).sum(axis=1)
            return beta
        
        def FB(self,alpha,beta):
            prob = alpha * beta
            index=list(prob.argmax(axis=0))
#             print(index)
            r=[self.Y_char[i] for i in index]
            return r
    
        def viterbi(self,word):
            N=len(self.Y_index)
            delta=np.zeros(N, float)
            delta_t=np.zeros(N, float)
            tmp=np.zeros(N, float)
            index=np.zeros((len(word),N), int)
            delta=self.initial_state_proba*self.observation_proba[:,self.X_index[word[0][0]]]
            for t in range(1, len(word)):
                p=self.X_index[word[t][0]]
                for j in range(len(self.Y_index)):
                    tmp=delta*self.transition_proba[:,j]
                    index[t,j]=tmp.argmax()
                    delta_t[j]=tmp[index[t,j]]*self.observation_proba[j,p]
                delta, delta_t = delta_t, delta
            result=[delta.argmax()]
            for i in index[-1:0:-1]:
                result.append(i[result[-1]])
            result.reverse()
            r=[self.Y_char[i] for i in result]
            return r

In [273]:
model=HMM(state_list,observation_list,train10,None)

print(model.X_index)
print(model.Y_index)

HMM creating with: 
26 states
26 observations
{'d': 0, 'h': 1, 'c': 2, 'x': 3, 'm': 4, 'k': 5, 'j': 6, 'r': 7, 'n': 8, 'b': 9, 'e': 10, 'q': 11, 'f': 12, 'v': 13, 'a': 14, 't': 15, 'i': 16, 'o': 17, 'z': 18, 'l': 19, 's': 20, 'u': 21, 'p': 22, 'w': 23, 'y': 24, 'g': 25}
{'d': 0, 'h': 1, 'c': 2, 'x': 3, 'm': 4, 'k': 5, 'j': 6, 'r': 7, 'n': 8, 'b': 9, 'e': 10, 'q': 11, 'f': 12, 'v': 13, 'a': 14, 't': 15, 'i': 16, 'o': 17, 'z': 18, 'l': 19, 's': 20, 'u': 21, 'p': 22, 'w': 23, 'y': 24, 'g': 25}


In [274]:
word=train10[100]
alpha=model.calculer_alpha(word)
beta=model.calculer_beta(word)
print(word)
print(model.FB(alpha,beta))
print(model.viterbi(word))

[('c', 'c'), ('o', 'o'), ('m', 'm'), ('p', 'p'), ('l', 'l'), ('e', 'e'), ('x', 'x')]
['c', 'o', 'm', 'p', 'l', 'e', 'x']
['c', 'o', 'm', 'p', 'l', 'e', 'x']


In [275]:
word=train10[100]
alpha=model.calculer_alpha(word)
beta=model.calculer_beta(word)
print(word)
print(model.FB(alpha,beta))
print(model.viterbi(word))

[('c', 'c'), ('o', 'o'), ('m', 'm'), ('p', 'p'), ('l', 'l'), ('e', 'e'), ('x', 'x')]
['c', 'o', 'm', 'p', 'l', 'e', 'x']
['c', 'o', 'm', 'p', 'l', 'e', 'x']


In [276]:
s=0
e=0
for word in test10:
    nword=model.viterbi(word)
    for i,j in zip(word,nword):
        s+=1
        if i[1]!=j:
            e+=1
print(e/s)

0.0680327868852459


In [277]:
s=0
e=0
for word in test10:
    alpha=model.calculer_alpha(word)
    beta=model.calculer_beta(word)
    nword=model.FB(alpha,beta)
    for i,j in zip(word,nword):
        s+=1
        if i[1]!=j:
            e+=1
print(e/s)

0.06762295081967214


In [278]:
s=0
e=0
for word in test10:
    for l in word:
        s+=1
        if l[0]!=l[1]:
            e+=1
print(e/s)

0.10177595628415301
