In [2]:
# Import libraries
import pandas as pd
import numpy as np

In [3]:
# Import data from file, split data by tab and names the columns
data = pd.read_csv('train.tsv', delimiter="\t", header = None, names = ['word', 'mutation'])

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5057059 entries, 0 to 5057058
Data columns (total 2 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   word      object
 1   mutation  object
dtypes: object(2)
memory usage: 77.2+ MB


In [None]:
# Look at the top 10 rows of data
data.head(10)

Unnamed: 0,word,mutation
0,ansin,N
1,),N
2,tá,N
3,níos,N
4,lú,N
5,gaeilge,N
6,ag,N
7,na,N
8,gardaí,N
9,ná,N


In [4]:
# Check data rows and columns
data.shape

(5057059, 2)

In [None]:
# Find unique words
data.word.nunique(), data.mutation.nunique()

(126547, 5)

In [5]:
data.word.value_counts()

a              203526
<S>            200261
an             195608
.              186004
,              165915
                ...  
tremayne            1
copped              1
cleamhnaigh         1
pvg                 1
cúisíoch            1
Name: word, Length: 126547, dtype: int64

In [8]:
len(data)

5057059

In [9]:
# Split data into training and validation set (80:20)
data_train = data[((len(data) // 10) * 2):]
data_val = data[:(((len(data)+1) // 10) * 2)]

In [10]:
data_train.shape

(4045649, 2)

In [11]:
data_train.head()

Unnamed: 0,word,mutation
1011410,éascaíonn,N
1011411,an,N
1011412,ceantar,N
1011413,gainimh,N
1011414,taobh,N


In [12]:
data_val.shape

(1011412, 2)

In [13]:
data_val.head()

Unnamed: 0,word,mutation
0,ansin,N
1,),N
2,tá,N
3,níos,N
4,lú,N


In [14]:
# Turn to list
data_trainlist = list(data_train.itertuples(index=False, name=None))

In [15]:
data_vallist = list(data_val.itertuples(index=False, name=None))

In [16]:
# Show mutation in training data
mutation = {mut for word,mut in data_trainlist}
print(len(mutation))
print(mutation)
vocab = {word for word,mut in data_trainlist}

5
{'S', 'H', 'N', 'U', 'T'}


In [17]:
# Emission Probability
def word_given_mut(word, mut, train_bag = data_trainlist):
    mut_list = [pair for pair in train_bag if pair[1]==mutation]
    count_mut = len(mut_list)
    w_given_mut_list = [pair[0] for pair in mut_list if pair[0]==word]
    count_w_given_mut = len(w_given_mut_list)
 
    return (count_w_given_mut, count_mut)

In [18]:
# Transition Probability
def t2_given_t1(t2, t1, train_bag = data_trainlist):
    mutation = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in mutation if t==t1])
    count_t2_t1 = 0
    for index in range(len(mutation)-1):
        if mutation[index]==t1 and mutation[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [19]:
# Transition matrix of mutation
# Matrix(i, j) represents P(jth tag after the ith tag)
 
mut_matrix = np.zeros((len(mutation), len(mutation)), dtype='float32')
for i, t1 in enumerate(list(mutation)):
    for j, t2 in enumerate(list(mutation)): 
        mut_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(mut_matrix)

[[5.15984222e-02 1.57405145e-03 9.45583522e-01 1.18307746e-03
  6.09310264e-05]
 [3.77744064e-02 2.47701025e-03 9.58819687e-01 8.35990941e-04
  9.28878871e-05]
 [1.05536506e-01 9.06239264e-03 8.43219876e-01 3.80938463e-02
  4.08711983e-03]
 [3.79951671e-02 6.84936647e-04 9.60574746e-01 6.62356324e-04
  8.27945405e-05]
 [5.85962199e-02 2.66985176e-03 9.38382626e-01 1.40518518e-04
  2.10777769e-04]]


In [20]:
# Matrix to dataframe
mut_df = pd.DataFrame(mut_matrix, columns = list(mutation), index=list(mutation))
display(mut_df)

Unnamed: 0,S,H,N,U,T
S,0.051598,0.001574,0.945584,0.001183,6.1e-05
H,0.037774,0.002477,0.95882,0.000836,9.3e-05
N,0.105537,0.009062,0.84322,0.038094,0.004087
U,0.037995,0.000685,0.960575,0.000662,8.3e-05
T,0.058596,0.00267,0.938383,0.000141,0.000211


In [37]:
def Viterbi(words, train_bag = data_trainlist):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
     
    for key, word in enumerate(words):
        p = [] 
        for mut in T:
            if key == 0:
                transition_p = mut_df.loc['N', mut]
            else:
                transition_p = mut_df.loc[state[-1], mut]
                 
            #emission_p = word_given_mut(words[key], mut)[0]/(word_given_mut(words[key], mut)[1] or not word_given_mut(words[key], mut)[1])
            emission_p = word_given_mut(words[key], mut)[0]/word_given_mut(words[key], mut)[1] if (word_given_mut(words[key], mut)[1]) != 0 else 0
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # Maximum probability
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

**Test random data**

In [50]:
import random
# Test the Viterbi algorithm
random.seed(1234)
 
# Random 100
rndom = [random.randint(1,len(data_vallist)) for x in range(100)]
val_run = [data_vallist[i] for i in rndom]
val_run_base = [tup for sent in val_run for tup in sent]
val_mut_words = [tup[0] for sent in val_run for tup in sent]

In [51]:
mut_seq = Viterbi(val_mut_words)
check = [i for i, j in zip(mut_seq, val_run_base) if i == j] 
 
accuracy = len(check)/len(mut_seq)
print('Accuracy: ',accuracy)

Accuracy:  0.0


**Test all data**

In [52]:
# Testing the algorithm
val_mut_words = [tup for sent in data_val for tup in sent]
val_nomut_words = [tup[0] for sent in data_val for tup in sent]
val_nomut_words
 
mut_seq = Viterbi(val_nomut_words)
 
# Find accuracy
check = [i for i, j in zip(val_mut_words, val_nomut_words) if i == j] 
 
accuracy = len(check)/len(mut_seq)
print('Accuracy : ',accuracy)

Accuracy :  1.0
