1. [Reduce Lemma size](#Reduce-Lemma-size)
2. [soft-em-version-1](#soft-em-version-1)
3. [soft em version 2 (adding morphologicla element-suffix)](#soft-em-version-2-(adding-morphologicla-element-suffix))
4. [experiment: toy data](#experiment:-toy-data)
5. [experiment toy data after lemma cut to 50% of vocab](#experiment-toy-data-after-lemma-cut-to-50%-of-vocab)
6. [experiment UD ENG 1000 sentence: No lemma cut](#experiment-UD-ENG-1000-sentence:-No-lemma-cut)
7. [experiment UD ENG 1000 sentence after lemma cut to 50% of vocab](#experiment-UD-ENG-1000-sentence-after-lemma-cut-to-50%-of-vocab)
8. [experiment UD ENG 1000 sentence: morph-sufiix](#experiment-UD-ENG-1000-sentence:-morph-sufiix)
9. [experiment UD ENG 1000 sentence: morph-sufiix + lemma cut](#experiment-UD-ENG-1000-sentence:-morph-sufiix-+-lemma-cut)

In [1]:
import pandas as pd
import numpy as np
import pyconll
%reload_ext autoreload
%autoreload 2
from data_preprocess import augment
from utils import reconstruct_lemma
from em import em_1
UD_ENGLISH_TRAIN = './sample-data/ud/en_ewt-ud-train.conllu'

In [2]:
def count_word_for_lemma(df):
    dfc = df.copy()
    df2 = dfc[['l','w','c_w']].drop_duplicates()
    df2['c_l']  = df2.groupby('l')['c_w'].transform('sum')
    l_freq = df2.set_index('l').to_dict()['c_l']
    dfc['c_l'] = dfc.apply(lambda row: l_freq[row['l']],axis=1)
    ranks  = dfc.groupby('w')['c_l'].rank(ascending = False, method = 'first')
    ranks.name = 'rank'
    dfc = pd.concat([dfc, ranks], axis = 1)
    dfc = dfc.sort_values(by=['c_l'], ascending=False)
    return dfc

## Reduce Lemma size
### $size = \frac{|v|}{2}$

In [3]:
def cut_lemma_count(dfc):
    df_f = dfc.loc[dfc['rank']==1]
    w_voc = set(dfc['w'])
    for i in range(2,10):
        l_voc = set(df_f['l'])
        if (len(l_voc)>=round(len(w_voc)/2)) or i>dfc['rank'].max():
            break
        else:
            index_to_add  = round(len(w_voc)/2)-len(l_voc)
            df_to_add = dfc.loc[dfc['rank']==i]
            df_f = df_f.append(df_to_add.iloc[0:index_to_add])
    return df_f

## soft em version 1

In [62]:
def em(df, iterations):
	for it in range(0,iterations):
		#E step
		df['p(l,w)'] = df['p(l)'] * df['p(w|l)']
		df['p(w)'] = df.groupby(['w'])['p(l,w)'].transform('sum')
		df['p(l|w)'] = df['p(l,w)']/ df['p(w)']
		df['ec(l)'] = df.groupby(['l'])['p(l|w)'].transform('sum')
		df['ec(l,w)'] = df['p(l,w)']
		like = np.log(np.prod((df['p(w|l)']*df['p(l)'])**df['c_w']))
		print("iteration:{}, log-likelihood: {}".format(it,like)) 
		#M step
		df['p(l)'] = df['ec(l)']/len(set(df['w']))
		df['p(w|l)'] = df['ec(l,w)'] / df['ec(l)']

	return df

## soft em version 2 (adding morphologicla element-suffix)

In [63]:
def em_morph(df, iterations):
    for it in range(0,iterations):
        #E step
        df['p(l,m,w)'] = df['p(l,m)']
        df['p(w)'] = df.groupby(['w'])['p(l,m,w)'].transform('sum')
        df['p(l,m|w)'] = df['p(l,m,w)']/df['p(w)']
        df['ec(l,m)'] = df.groupby(['l','m'])['p(l,m|w)'].transform('sum')
        df['ec(l,m,w)'] = df['p(l,m|w)']
        like = np.log(np.prod((df['p(l,m)']*df['p(l)']*df['p(m)'])))
        print("iteration:{}, log-likelihood: {}".format(it,like)) 
        #M step
        df['p(l,m)'] = df['ec(l,m)']/len(set(df['w']))

#         print('iteration', it)
#         print(df)
    return df

## experiment: toy data

In [64]:
toy = [['run', 'ru', 8,'n'],['run', 'run', 8, 'Ε'],
      ['runs', 'run', 4, 's'], ['runs', 'runs', 4, 'Ε'], 
       ['running', 'run', 3, 'ning'], ['running', 'runn', 3, 'ing'],['running', 'running',3, 'Ε'], 
       ['cat', 'ca',1, 't'],['cat', 'cat', 1,'Ε'],
       ['cats', 'cat',3, 's'], ['cats','cats',3, 'Ε']]

df = pd.DataFrame(toy,columns=["w","l","c_w", "m"])

df['p(l)']=df.groupby('l')['l'].transform('count')/len(df)
df['p(m)']=1/len(set(df['m']))
df['p(l,m)'] = df['p(l)']*df['p(m)']

print(df)
df_result = em_morph(df,5)
df_result

          w        l  c_w     m      p(l)      p(m)    p(l,m)
0       run       ru    8     n  0.090909  0.166667  0.015152
1       run      run    8     Ε  0.272727  0.166667  0.045455
2      runs      run    4     s  0.272727  0.166667  0.045455
3      runs     runs    4     Ε  0.090909  0.166667  0.015152
4   running      run    3  ning  0.272727  0.166667  0.045455
5   running     runn    3   ing  0.090909  0.166667  0.015152
6   running  running    3     Ε  0.090909  0.166667  0.015152
7       cat       ca    1     t  0.090909  0.166667  0.015152
8       cat      cat    1     Ε  0.181818  0.166667  0.030303
9      cats      cat    3     s  0.181818  0.166667  0.030303
10     cats     cats    3     Ε  0.090909  0.166667  0.015152
iteration:0, log-likelihood: -82.80814187033292
iteration:1, log-likelihood: -69.19369708127165
iteration:2, log-likelihood: -69.19369708127165
iteration:3, log-likelihood: -69.19369708127165
iteration:4, log-likelihood: -69.19369708127165


Unnamed: 0,w,l,c_w,m,p(l),p(m),"p(l,m)","p(l,m,w)",p(w),"p(l,m|w)","ec(l,m)","ec(l,m,w)"
0,run,ru,8,n,0.090909,0.166667,0.05,0.05,0.2,0.25,0.25,0.25
1,run,run,8,Ε,0.272727,0.166667,0.15,0.15,0.2,0.75,0.75,0.75
2,runs,run,4,s,0.272727,0.166667,0.15,0.15,0.2,0.75,0.75,0.75
3,runs,runs,4,Ε,0.090909,0.166667,0.05,0.05,0.2,0.25,0.25,0.25
4,running,run,3,ning,0.272727,0.166667,0.12,0.12,0.2,0.6,0.6,0.6
5,running,runn,3,ing,0.090909,0.166667,0.04,0.04,0.2,0.2,0.2,0.2
6,running,running,3,Ε,0.090909,0.166667,0.04,0.04,0.2,0.2,0.2,0.2
7,cat,ca,1,t,0.090909,0.166667,0.066667,0.066667,0.2,0.333333,0.333333,0.333333
8,cat,cat,1,Ε,0.181818,0.166667,0.133333,0.133333,0.2,0.666667,0.666667,0.666667
9,cats,cat,3,s,0.181818,0.166667,0.133333,0.133333,0.2,0.666667,0.666667,0.666667


## experiment toy data after lemma cut to 50% of vocab

In [65]:
toy = [['run', 'ru', 8],['run', 'run', 8], ['ran', 'ru', 2], ['ran', 'run', 2],
      ['runs', 'run', 4], ['runs', 'runs', 4], ['running', 'run', 3], ['running', 'runn', 3],
      ['running', 'running',3], ['cat', 'ca',1],['cat', 'cat', 1],['cats', 'cat',3], ['cats','cats',3],
      ['fahim','fah',1],['fahim','fahim',1]]
df = pd.DataFrame(toy,columns=["w","l","c_w"])
# df = df.drop_duplicates()
print("w_len:{}, l_len:{}".format(len(set(df['w'])),len(set(df['l']))))
dfc = count_word_for_lemma(df)
# dfc = cut_lemma_count(dfc)
print("In original: w_len:{}, l_len:{}".format(len(set(df['w'])),len(set(df['l']))))
print("After lemma cut: w_len:{}, l_len:{}".format(len(set(dfc['w'])),len(set(dfc['l']))))
dfc

w_len:7, l_len:10
In original: w_len:7, l_len:10
After lemma cut: w_len:7, l_len:10


Unnamed: 0,w,l,c_w,c_l,rank
1,run,run,8,17,1.0
3,ran,run,2,17,1.0
4,runs,run,4,17,1.0
6,running,run,3,17,1.0
0,run,ru,8,10,2.0
2,ran,ru,2,10,2.0
5,runs,runs,4,4,2.0
10,cat,cat,1,4,1.0
11,cats,cat,3,4,1.0
7,running,runn,3,3,2.0


In [66]:
dfc['p(l)']=1/len(set(dfc['l']))
dfc['p(w|l)']=1/df.groupby('l')['l'].transform('count')
print(dfc)
df_result = em(dfc,5)
df_result

          w        l  c_w  c_l  rank  p(l)  p(w|l)
1       run      run    8   17   1.0   0.1    0.25
3       ran      run    2   17   1.0   0.1    0.25
4      runs      run    4   17   1.0   0.1    0.25
6   running      run    3   17   1.0   0.1    0.25
0       run       ru    8   10   2.0   0.1    0.50
2       ran       ru    2   10   2.0   0.1    0.50
5      runs     runs    4    4   2.0   0.1    1.00
10      cat      cat    1    4   1.0   0.1    0.50
11     cats      cat    3    4   1.0   0.1    0.50
7   running     runn    3    3   2.0   0.1    1.00
8   running  running    3    3   3.0   0.1    1.00
12     cats     cats    3    3   2.0   0.1    1.00
9       cat       ca    1    1   2.0   0.1    1.00
13    fahim      fah    1    1   1.0   0.1    1.00
14    fahim    fahim    1    1   2.0   0.1    1.00
iteration:0, log-likelihood: -141.49256403759753
iteration:1, log-likelihood: -232.95034104319726
iteration:2, log-likelihood: -324.40811804879695
iteration:3, log-likelihood: -415.865

Unnamed: 0,w,l,c_w,c_l,rank,p(l),p(w|l),"p(l,w)",p(w),p(l|w),ec(l),"ec(l,w)"
1,run,run,8,17,1.0,0.139683,1.1e-05,1e-05,3.1e-05,0.333333,0.977778,1e-05
3,ran,run,2,17,1.0,0.139683,1.1e-05,1e-05,3.1e-05,0.333333,0.977778,1e-05
4,runs,run,4,17,1.0,0.139683,1.1e-05,1e-05,5.2e-05,0.2,0.977778,1e-05
6,running,run,3,17,1.0,0.139683,1.1e-05,1e-05,9.4e-05,0.111111,0.977778,1e-05
0,run,ru,8,10,2.0,0.190476,1.6e-05,2.1e-05,3.1e-05,0.666667,1.333333,2.1e-05
2,ran,ru,2,10,2.0,0.190476,1.6e-05,2.1e-05,3.1e-05,0.666667,1.333333,2.1e-05
5,runs,runs,4,4,2.0,0.114286,5.2e-05,4.2e-05,5.2e-05,0.8,0.8,4.2e-05
10,cat,cat,1,4,1.0,0.095238,3.1e-05,2.1e-05,6.2e-05,0.333333,0.666667,2.1e-05
11,cats,cat,3,4,1.0,0.095238,3.1e-05,2.1e-05,6.2e-05,0.333333,0.666667,2.1e-05
7,running,runn,3,3,2.0,0.063492,9.4e-05,4.2e-05,9.4e-05,0.444444,0.444444,4.2e-05


## Load UD Eng-1000

In [57]:
train = pyconll.load_from_file(UD_ENGLISH_TRAIN)
up_set = {"DET", "PUNCT"}
l_w = []
for sentence in train[0:1000]:
    for token in sentence:
        if token.upos not in up_set:
            l_w.append([token.form.lower(), token.lemma.lower(), token.upos])

df = pd.DataFrame(l_w,columns=["w","l","pos"])
df['c_w'] = df.groupby('w')['w'].transform('count')
w_freq = df.set_index('w').to_dict()['c_w']

In [58]:
dfc = count_word_for_lemma(df)
dfc.head(10)

Unnamed: 0,w,l,pos,c_w,c_l,rank
5743,is,be,AUX,216,875,75.0
9735,'m,be,AUX,5,875,2.0
828,are,be,AUX,100,875,6.0
14318,been,be,AUX,64,875,54.0
14313,was,be,AUX,141,875,118.0
3542,is,be,AUX,216,875,51.0
842,are,be,AUX,100,875,7.0
14301,was,be,AUX,141,875,117.0
9667,was,be,AUX,141,875,71.0
9686,was,be,AUX,141,875,72.0


## experiment UD ENG 1000 sentence: No lemma cut

In [59]:
dfc['p(l)']=1/len(set(dfc['l']))
dfc['p(w|l)']=1/df.groupby('l')['l'].transform('count')
print(dfc)
df_result = em(dfc,3)
df_result.head(10)

                  w            l   pos  c_w  c_l   rank      p(l)    p(w|l)
5743             is           be   AUX  216  875   75.0  0.000274  0.001362
9735             'm           be   AUX    5  875    2.0  0.000274  0.001362
828             are           be   AUX  100  875    6.0  0.000274  0.001362
14318          been           be   AUX   64  875   54.0  0.000274  0.001362
14313           was           be   AUX  141  875  118.0  0.000274  0.001362
...             ...          ...   ...  ...  ...    ...       ...       ...
10237         solid        solid   ADJ    1    1    1.0  0.000274  1.000000
10246  withstanding    withstand  VERB    1    1    1.0  0.000274  1.000000
10248     adventure    adventure  NOUN    1    1    1.0  0.000274  1.000000
10268     multitude    multitude  NOUN    1    1    1.0  0.000274  1.000000
17368   appellation  appellation  NOUN    1    1    1.0  0.000274  1.000000

[17369 rows x 8 columns]
iteration:0, log-likelihood: -inf
iteration:1, log-likelihood:

  if __name__ == '__main__':


Unnamed: 0,w,l,pos,c_w,c_l,rank,p(l),p(w|l),"p(l,w)",p(w),p(l|w),ec(l),"ec(l,w)"
5743,is,be,AUX,216,875,75.0,0.002555,1.790539e-15,1.985304e-14,4.288258e-12,0.00463,11.087747,1.985304e-14
9735,'m,be,AUX,5,875,2.0,0.002555,1.790539e-15,1.985304e-14,9.926522e-14,0.2,11.087747,1.985304e-14
828,are,be,AUX,100,875,6.0,0.002555,1.790539e-15,1.985304e-14,1.985304e-12,0.01,11.087747,1.985304e-14
14318,been,be,AUX,64,875,54.0,0.002555,1.790539e-15,1.985304e-14,1.270595e-12,0.015625,11.087747,1.985304e-14
14313,was,be,AUX,141,875,118.0,0.002555,1.790539e-15,1.985304e-14,2.799279e-12,0.007092,11.087747,1.985304e-14
3542,is,be,AUX,216,875,51.0,0.002555,1.790539e-15,1.985304e-14,4.288258e-12,0.00463,11.087747,1.985304e-14
842,are,be,AUX,100,875,7.0,0.002555,1.790539e-15,1.985304e-14,1.985304e-12,0.01,11.087747,1.985304e-14
14301,was,be,AUX,141,875,117.0,0.002555,1.790539e-15,1.985304e-14,2.799279e-12,0.007092,11.087747,1.985304e-14
9667,was,be,AUX,141,875,71.0,0.002555,1.790539e-15,1.985304e-14,2.799279e-12,0.007092,11.087747,1.985304e-14
9686,was,be,AUX,141,875,72.0,0.002555,1.790539e-15,1.985304e-14,2.799279e-12,0.007092,11.087747,1.985304e-14


## experiment UD ENG 1000 sentence after lemma cut to 50% of vocab

In [60]:
dfc = count_word_for_lemma(df)
dfc = cut_lemma_count(dfc)
print("In original: w_len:{}, l_len:{}".format(len(set(df['w'])),len(set(df['l']))))
print("After lemma cut: w_len:{}, l_len:{}".format(len(set(dfc['w'])),len(set(dfc['l']))))
dfc.head(10)

In original: w_len:4339, l_len:3645
After lemma cut: w_len:4339, l_len:3604


Unnamed: 0,w,l,pos,c_w,c_l,rank
9749,ai,be,AUX,2,875,1.0
14249,'re,be,AUX,5,875,1.0
8859,am,be,AUX,2,875,1.0
2058,'m,be,AUX,5,875,1.0
1059,'s,be,AUX,148,875,1.0
90,was,be,AUX,141,875,1.0
246,are,be,AUX,100,875,1.0
15751,s,be,AUX,4,875,1.0
7330,’s,be,AUX,24,875,1.0
197,been,be,VERB,64,875,1.0


In [61]:
dfc['p(l)']=1/len(set(dfc['l']))
dfc['p(w|l)']=1/df.groupby('l')['l'].transform('count')
print(dfc)
df_result = em(dfc,1)
df_result.head(10)

                  w            l   pos  c_w  c_l  rank      p(l)    p(w|l)
9749             ai           be   AUX    2  875   1.0  0.000277  0.001362
14249           're           be   AUX    5  875   1.0  0.000277  0.001362
8859             am           be   AUX    2  875   1.0  0.000277  0.001362
2058             'm           be   AUX    5  875   1.0  0.000277  0.001362
1059             's           be   AUX  148  875   1.0  0.000277  0.001362
...             ...          ...   ...  ...  ...   ...       ...       ...
10237         solid        solid   ADJ    1    1   1.0  0.000277  1.000000
10246  withstanding    withstand  VERB    1    1   1.0  0.000277  1.000000
10248     adventure    adventure  NOUN    1    1   1.0  0.000277  1.000000
10268     multitude    multitude  NOUN    1    1   1.0  0.000277  1.000000
17368   appellation  appellation  NOUN    1    1   1.0  0.000277  1.000000

[4339 rows x 8 columns]
iteration:0, log-likelihood: -inf


  if __name__ == '__main__':


Unnamed: 0,w,l,pos,c_w,c_l,rank,p(l),p(w|l),"p(l,w)",p(w),p(l|w),ec(l),"ec(l,w)"
9749,ai,be,AUX,2,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
14249,'re,be,AUX,5,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
8859,am,be,AUX,2,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
2058,'m,be,AUX,5,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
1059,'s,be,AUX,148,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
90,was,be,AUX,141,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
246,are,be,AUX,100,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
15751,s,be,AUX,4,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
7330,’s,be,AUX,24,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07
197,been,be,VERB,64,875,1.0,0.003227,2.70017e-08,3.780238e-07,3.780238e-07,1.0,14.0,3.780238e-07


## experiment UD ENG 1000 sentence: morph-sufiix

In [14]:
triplets =  augment(list(df['l']),list(df['w']),list(df['pos']))
df = pd.DataFrame.from_dict(triplets)
df['p_sc_s'] = df.p+df.sc+df.s
df['c_pscs'] = df.groupby(['p_sc_s'])['p_sc_s'].transform('count')
pscs_freq = df.set_index('p_sc_s').to_dict()['c_pscs']

trips = []
train_triplet = list(set(zip(df['p'],df['sc'],df['s'])))
for oup in list(df['oup']):
    trips.extend(reconstruct_lemma(oup, train_triplet))
    
df = pd.DataFrame(trips,columns=["p","sc","s","pos","l","w"])
df['p_sc_s'] = df.p+df.sc+df.s
df["c_w"] = df.apply(lambda row:w_freq[row["w"]],axis=1)
df["c_pscs"] = df.apply(lambda row:pscs_freq[row["p_sc_s"]],axis=1)
df = df.drop_duplicates()

In [15]:
df

Unnamed: 0,p,sc,s,pos,l,w,p_sc_s,c_w,c_pscs
0,Ε,Ε->Ε,Ε,0,al,al,ΕΕ->ΕΕ,59,13801
1,Ε,in->Ε,an,0,zamin,zaman,Εin->Εan,3,3
2,Ε,un->Ε,an,0,zamun,zaman,Εun->Εan,3,3
3,Ε,Ε->Ε,Ε,0,zaman,zaman,ΕΕ->ΕΕ,3,13801
4,Ε,Ε->Ε,n,0,zama,zaman,ΕΕ->Εn,3,35
...,...,...,...,...,...,...,...,...,...
62177,Ε,Ε->Ε,Ε,0,phonies,phonies,ΕΕ->ΕΕ,1,13801
62178,Ε,e->Ε,es,0,phonie,phonies,Εe->Εes,1,167
62179,Ε,ve->Ε,s,0,phonieve,phonies,Εve->Εs,1,107
62204,Ε,Ε->Ε,n,0,appellatio,appellation,ΕΕ->Εn,1,35


In [16]:
dfc = count_word_for_lemma(df)
print("In original: w_len:{}, l_len:{}".format(len(set(df['w'])),len(set(df['l']))))
print("After lemma cut: w_len:{}, l_len:{}".format(len(set(dfc['w'])),len(set(dfc['l']))))
dfc.head(10)

In original: w_len:4334, l_len:11951
After lemma cut: w_len:4334, l_len:11951


Unnamed: 0,p,sc,s,pos,l,w,p_sc_s,c_w,c_pscs,c_l,rank
439,Ε,Ε->Ε,Ε,0,and,and,ΕΕ->ΕΕ,534,13801,534,1.0
55,Ε,Ε->Ε,Ε,0,in,in,ΕΕ->ΕΕ,510,13801,510,1.0
60,Ε,Ε->Ε,Ε,0,of,of,ΕΕ->ΕΕ,509,13801,509,1.0
111,Ε,Ε->Ε,Ε,0,to,to,ΕΕ->ΕΕ,491,13801,491,1.0
138,Ε,Ε->Ε,t,0,tha,that,ΕΕ->Εt,253,2,277,1.0
2875,Ε,Ε->Ε,n,0,tha,than,ΕΕ->Εn,24,35,277,1.0
140,Ε,e->Ε,t,0,thae,that,Εe->Εt,253,4,253,5.0
139,Ε,Ε->Ε,Ε,0,that,that,ΕΕ->ΕΕ,253,13801,253,4.0
137,Ε,ot->t,Ε,0,thaot,that,Εot->tΕ,253,1,253,3.0
141,Ε,d->Ε,t,0,thad,that,Εd->Εt,253,9,253,6.0


In [17]:
dfc['p(l)']=dfc.groupby('l')['l'].transform('count')/len(dfc)
dfc['m'] = dfc['s']
dfc['p(m)']=1/len(set(dfc['m']))
dfc['p(l,m)'] = dfc['p(l)']*dfc['p(m)']
df_result = em_morph(dfc,2)
df_result

iteration 0
       p     sc   s  pos            l            w    p_sc_s  c_w  c_pscs  \
439    Ε   Ε->Ε   Ε    0          and          and    ΕΕ->ΕΕ  534   13801   
55     Ε   Ε->Ε   Ε    0           in           in    ΕΕ->ΕΕ  510   13801   
60     Ε   Ε->Ε   Ε    0           of           of    ΕΕ->ΕΕ  509   13801   
111    Ε   Ε->Ε   Ε    0           to           to    ΕΕ->ΕΕ  491   13801   
138    Ε   Ε->Ε   t    0          tha         that    ΕΕ->Εt  253       2   
...   ..    ...  ..  ...          ...          ...       ...  ...     ...   
25080  Ε   Ε->Ε   Ε    0     superior     superior    ΕΕ->ΕΕ    1   13801   
25046  Ε  in->Ε  an    0     chairmin     chairman  Εin->Εan    1       3   
25045  Ε   Ε->Ε   Ε    0     chairman     chairman    ΕΕ->ΕΕ    1   13801   
25044  Ε   Ε->Ε   n    0      chairma     chairman    ΕΕ->Εn    1      35   
62205  Ε   Ε->Ε   Ε    0  appellation  appellation    ΕΕ->ΕΕ    1   13801   

       c_l  rank      p(l)   m      p(m)    p(l,m)      p(l,m,w

Unnamed: 0,p,sc,s,pos,l,w,p_sc_s,c_w,c_pscs,c_l,rank,p(l),m,p(m),"p(l,m)","p(l,m,w)",p(w),"p(l,m|w)","ec(l,m)","ec(l,m,w)"
439,Ε,Ε->Ε,Ε,0,and,and,ΕΕ->ΕΕ,534,13801,534,1.0,0.000066,Ε,0.007576,0.000231,0.000231,0.000231,1.000000,1.000000,1.000000
55,Ε,Ε->Ε,Ε,0,in,in,ΕΕ->ΕΕ,510,13801,510,1.0,0.000066,Ε,0.007576,0.000231,0.000231,0.000231,1.000000,1.000000,1.000000
60,Ε,Ε->Ε,Ε,0,of,of,ΕΕ->ΕΕ,509,13801,509,1.0,0.000066,Ε,0.007576,0.000231,0.000231,0.000231,1.000000,1.000000,1.000000
111,Ε,Ε->Ε,Ε,0,to,to,ΕΕ->ΕΕ,491,13801,491,1.0,0.000066,Ε,0.007576,0.000231,0.000231,0.000231,1.000000,1.000000,1.000000
138,Ε,Ε->Ε,t,0,tha,that,ΕΕ->Εt,253,2,277,1.0,0.000132,t,0.007576,0.000066,0.000066,0.000231,0.285714,0.285714,0.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25080,Ε,Ε->Ε,Ε,0,superior,superior,ΕΕ->ΕΕ,1,13801,1,1.0,0.000066,Ε,0.007576,0.000115,0.000115,0.000231,0.500000,0.500000,0.500000
25046,Ε,in->Ε,an,0,chairmin,chairman,Εin->Εan,1,3,1,4.0,0.000066,an,0.007576,0.000058,0.000058,0.000231,0.250000,0.250000,0.250000
25045,Ε,Ε->Ε,Ε,0,chairman,chairman,ΕΕ->ΕΕ,1,13801,1,3.0,0.000066,Ε,0.007576,0.000058,0.000058,0.000231,0.250000,0.250000,0.250000
25044,Ε,Ε->Ε,n,0,chairma,chairman,ΕΕ->Εn,1,35,1,2.0,0.000066,n,0.007576,0.000058,0.000058,0.000231,0.250000,0.250000,0.250000


### experiment UD ENG 1000 sentence: morph-sufiix + lemma cut

In [18]:
dfc = count_word_for_lemma(df)
dfc = cut_lemma_count(dfc)
print("In original: w_len:{}, l_len:{}".format(len(set(df['w'])),len(set(df['l']))))
print("After lemma cut: w_len:{}, l_len:{}".format(len(set(dfc['w'])),len(set(dfc['l']))))
dfc.head(10)

In original: w_len:4334, l_len:11951
After lemma cut: w_len:4334, l_len:3553


Unnamed: 0,p,sc,s,pos,l,w,p_sc_s,c_w,c_pscs,c_l,rank
439,Ε,Ε->Ε,Ε,0,and,and,ΕΕ->ΕΕ,534,13801,534,1.0
55,Ε,Ε->Ε,Ε,0,in,in,ΕΕ->ΕΕ,510,13801,510,1.0
60,Ε,Ε->Ε,Ε,0,of,of,ΕΕ->ΕΕ,509,13801,509,1.0
111,Ε,Ε->Ε,Ε,0,to,to,ΕΕ->ΕΕ,491,13801,491,1.0
138,Ε,Ε->Ε,t,0,tha,that,ΕΕ->Εt,253,2,277,1.0
2875,Ε,Ε->Ε,n,0,tha,than,ΕΕ->Εn,24,35,277,1.0
194,Ε,be->wer,e,0,be,were,Εbe->were,59,179,228,1.0
90,Ε,Ε->Ε,Ε,0,be,be,ΕΕ->ΕΕ,89,13801,228,1.0
773,Ε,e->ee,n,0,be,been,Εe->een,64,64,228,1.0
214,Ε,e->Ε,eing,0,be,being,Εe->Εeing,16,17,228,1.0


In [19]:
dfc['p(l)']=dfc.groupby('l')['l'].transform('count')/len(dfc)
dfc['m'] = dfc['s']
dfc['p(m)']=1/len(set(dfc['m']))
dfc['p(l,m)'] = dfc['p(l)']*dfc['p(m)']
df_result = em_morph(dfc,2)
df_result

iteration 0
       p     sc   s  pos         l         w    p_sc_s  c_w  c_pscs  c_l  \
439    Ε   Ε->Ε   Ε    0       and       and    ΕΕ->ΕΕ  534   13801  534   
55     Ε   Ε->Ε   Ε    0        in        in    ΕΕ->ΕΕ  510   13801  510   
60     Ε   Ε->Ε   Ε    0        of        of    ΕΕ->ΕΕ  509   13801  509   
111    Ε   Ε->Ε   Ε    0        to        to    ΕΕ->ΕΕ  491   13801  491   
138    Ε   Ε->Ε   t    0       tha      that    ΕΕ->Εt  253       2  277   
...   ..    ...  ..  ...       ...       ...       ...  ...     ...  ...   
25043  Ε  un->Ε  an    0  chairmun  chairman  Εun->Εan    1       3    1   
25199  Ε  ot->t   Ε    0     gluot      glut   Εot->tΕ    1       1    1   
25171  Ε   Ε->Ε   Ε    0   william   william    ΕΕ->ΕΕ    1   13801    1   
25170  Ε   Ε->Ε   Ε    0      col.      col.    ΕΕ->ΕΕ    1   13801    1   
25080  Ε   Ε->Ε   Ε    0  superior  superior    ΕΕ->ΕΕ    1   13801    1   

       rank      p(l)   m      p(m)    p(l,m)  p(l,m,w)      p(w)  p(l,m|w)

Unnamed: 0,p,sc,s,pos,l,w,p_sc_s,c_w,c_pscs,c_l,rank,p(l),m,p(m),"p(l,m)","p(l,m,w)",p(w),"p(l,m|w)","ec(l,m)","ec(l,m,w)"
439,Ε,Ε->Ε,Ε,0,and,and,ΕΕ->ΕΕ,534,13801,534,1.0,0.000231,Ε,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
55,Ε,Ε->Ε,Ε,0,in,in,ΕΕ->ΕΕ,510,13801,510,1.0,0.000231,Ε,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
60,Ε,Ε->Ε,Ε,0,of,of,ΕΕ->ΕΕ,509,13801,509,1.0,0.000231,Ε,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
111,Ε,Ε->Ε,Ε,0,to,to,ΕΕ->ΕΕ,491,13801,491,1.0,0.000231,Ε,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
138,Ε,Ε->Ε,t,0,tha,that,ΕΕ->Εt,253,2,277,1.0,0.000461,t,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25043,Ε,un->Ε,an,0,chairmun,chairman,Εun->Εan,1,3,1,1.0,0.000231,an,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
25199,Ε,ot->t,Ε,0,gluot,glut,Εot->tΕ,1,1,1,1.0,0.000231,Ε,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
25171,Ε,Ε->Ε,Ε,0,william,william,ΕΕ->ΕΕ,1,13801,1,1.0,0.000231,Ε,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
25170,Ε,Ε->Ε,Ε,0,col.,col.,ΕΕ->ΕΕ,1,13801,1,1.0,0.000231,Ε,0.009434,0.000231,0.000231,0.000231,1.0,1.0,1.0
