In [3]:
import pickle
train10 = pickle.load( open( "./typos-data/train10.pkl", "rb" ) )
train20 = pickle.load( open( "./typos-data/train20.pkl", "rb" ) )
test10 = pickle.load( open( "./typos-data/test10.pkl", "rb" ) )
test20 = pickle.load( open( "./typos-data/test20.pkl", "rb" ) )
print("length of train10 : ",len(train10))
print("length of train20 : ",len(train20))
print("length of test10 : ",len(test10))
print("length of test20 : ",len(test20))

length of train10 :  29057
length of train20 :  27184
length of test10 :  1501
length of test20 :  3374


In [4]:
observation_list=[]
state_list=[]
for mot in train10:
    for (l1,l2) in mot:
        observation_list.append(l1)
        state_list.append(l2)
observation_list=list(set(observation_list))
state_list=list(set(state_list))

In [23]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys
from pandas import Series, DataFrame

# construc the first order HMM Model

UNK = "<unk>"
UNKid = 0
epsilon = 1e-100

class HMM2:
        def __init__(self, state_list, observation_list, train,test,
                 transition_proba = None,
                 observation_proba = None,
                 initial_state_proba = None, smoothing_obs = 0.01):
            """
            Builds a Hidden Markov Model
            * state_list is the list of state symbols [q_0...q_(N-1)]
            * observation_list is the list of observation symbols [v_0...v_(M-1)]
            * transition_proba is the transition probability matrix
                [a_ij] a_ij = Pr(Y_(t+1)=q_i|Y_t=q_j)
            * observation_proba is the observation probablility matrix
                [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
            * initial_state_proba is the initial state distribution
                [pi_i] pi_i = Pr(Y_0=q_i)"""
            print("HMM creating with: ")
            self.N = len(state_list)       # number of states
            self.M = len(observation_list) # number of possible emissions
            print(str(self.N)+" states")
            print(str(self.M)+" observations")
            self.train=train
            self.omega_Y = state_list
            self.omega_X = observation_list
#             self.A2=zeros( (self.N*self.N,self.N), float)
            self.A2=zeros( (self.N,self.N,self.N), float)
            self.B2=zeros( (self.N*self.N,self.M), float)
            self.pi2=zeros( (self.N*self.N), float)
            if transition_proba is None:
                self.transition_proba = zeros( (self.N, self.N), float) 
            else:
                self.transition_proba=transition_proba
            if observation_proba is None:
                self.observation_proba = zeros( (self.N, self.M), float) 
            else:
                self.observation_proba=observation_proba
            if initial_state_proba is None:
                self.initial_state_proba = zeros( (self.N,), float ) 
            else:
                self.initial_state_proba=initial_state_proba
            self.make_indexes() # build indexes, i.e the mapping between token and int
#             self.make_2indexes()
            self.data2index()
            self.calculer_pi()
            self.calculer_A()
            self.calculer_B()
#             self.calculer_pi2()
            self.calculer_A2()
#             self.calculer_B2()
            
        def make_indexes(self):
            """Creates the reverse table that maps states/observations names
            to their index in the probabilities array"""
            self.Y_index = {}
            for i in range(self.N):
                self.Y_index[self.omega_Y[i]] = i
            self.X_index = {}
            for i in range(self.M):
                self.X_index[self.omega_X[i]] = i

        
        def data2index(self):
            self.Y_char={}
            for i in self.Y_index:
                self.Y_char[self.Y_index[i]]=i
        
        def calculer_pi(self):
            for word in self.train:
                self.initial_state_proba[self.Y_index[word[0][1]]]+=1
#             print(self.initial_state_proba)
            self.initial_state_proba/=len(self.train)
        
    
        def calculer_A(self):
            s=0
            for word in self.train:
                for i in range(len(word)-1):
                    s+=1
                    w1=self.Y_index[word[i][1]]
                    w2=self.Y_index[word[i+1][1]]
                    self.transition_proba[w1][w2]+=1
            tmp=self.transition_proba.T
            self.transition_proba=(tmp/self.transition_proba.sum(axis=1)).T
#             self.transition_proba/=self.transition_proba.sum(axis=0)
        
        def calculer_B(self):
            s=0
            for word in self.train:
                for i in range(len(word)):
                    s+=1
                    w1=self.Y_index[word[i][1]]
                    w2=self.X_index[word[i][0]]
                    self.observation_proba[w1][w2]+=1
            tmp=self.observation_proba.T
            self.observation_proba=(tmp/self.observation_proba.sum(axis=1)).T
            
        def calculer_A2(self):
            for word in self.train:
                for i in range(2,len(word)):
                    w1=self.Y_index[word[i-2][1]]
                    w2=self.Y_index[word[i-1][1]]
                    w3=self.Y_index[word[i][1]]
                    self.A2[w1,w2,w3]+=1
            sumPlus1 = self.A2.sum(axis=0).reshape(self.N, self.N)
            self.A2 /= np.where(sumPlus1 == 0, 1, sumPlus1)
#             self.A2=self.A2/self.A2.sum(axis=0).reshape(self.N,self.N)       
            
        def viterbi(self,word):
            if len(word)<2:
                return word[0][0]
            N=len(self.Y_index)
            delta=np.zeros((N,N,len(word)),float)
            tmp=np.zeros((N,N),float)
            delta1=np.zeros(N, float) 
            index=np.zeros((N,N,len(word)),float) 
            result=[0]*len(word)
            delta1=self.initial_state_proba*self.observation_proba[:,self.X_index[word[0][0]]]
            for i in range (0,N):
                for j in range (0,N):
                    delta[i,j,1] = delta1[i]*self.transition_proba[i,j]*self.observation_proba[j,self.X_index[word[1][0]]]
            for t in range(2,len(word)): 
                p=self.X_index[word[t][0]]
                for i in range (0,N):
                    for j in range (0,N):
                        tmp=delta[:,i,t-1]*self.A2[:,i,j]
                        delta[i,j,t]=max(tmp)
                        index[i,j,t]=tmp.argmax()
                        delta[i,j,t]=delta[i,j,t]*self.observation_proba[j,p]
            delta_t=delta[:,:,len(word)-1]
            result[len(word)-1]=np.unravel_index(delta_t.argmax(),delta_t.shape)[1]
            result[len(word)-2]=np.unravel_index(delta_t.argmax(),delta_t.shape)[0]            
            for t in range(len(word)-3,-1,-1): 
                result[t] = index[int(result[t+1]),int(result[t+2]),t+2]
            r=[self.Y_char[i] for i in result]
            return r
        
        def score_viterbi(self, test):
            s=0
            e=0
            correct=0
            creat=0
            for word in test:
                nword=self.viterbi(word)
                for i,j in zip(word,nword):
                    s+=1
                    if i[1]!=j:
                        e+=1
                    if i[0]!=i[1] and i[1]==j:
                        correct+=1
                    if i[1]!=j and i[0]==i[1]:
                        creat+=1
            return {"Error rate":e/s, "Errors correct":correct, "Errors creat":creat}
        def baseline(self,test):
            s=0
            e=0
            for word in test:
                for l in word:
                    s+=1
                    if l[0]!=l[1]:
                        e+=1
            return e/s

In [24]:
model1=HMM2(state_list,observation_list,train10,None)
print("Test10, trained with train10:")
print("Baseline:",model1.baseline(test10))
print("Viterbe for test10:",model1.score_viterbi(test10))

HMM creating with: 
26 states
26 observations
Test10, trained with train10:
Baseline: 0.10177595628415301
Viterbe for test10: {'Error rate': 0.05423497267759563, 'Errors correct': 473, 'Errors creat': 125}


In [205]:
model2=HMM2(state_list,observation_list,train20,None)
print("Test20, trained with train20:")
print("Baseline:",model2.baseline(test20))
print("Viterbe for test20:",model2.score_viterbi(test20))

HMM creating with: 
26 states
26 observations




Test20, trained with train20:
Baseline: 0.19405667725121323
Viterbe for test20: {'Error rate': 0.09633934455694686, 'Errors correct': 2095, 'Errors creat': 464}
