-
Notifications
You must be signed in to change notification settings - Fork 0
/
ViterbiSeg.py
183 lines (135 loc) · 4.92 KB
/
ViterbiSeg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import copy
from helpers import *
import math
def lexical_quality(substr):
### added on 08-09-15 to include the constraint that a valid English word will have atleast one syllable with one nuclues
substr_len = len(substr)
vowels = ['a','e','i','o','u']
v_count = 0
try:
if str.isdigit(substr) == True:
return -(1.0/len(substr))
elif alpha(substr) == False:
return -1000
except Exception:
pass
if substr_len == 1 and substr != 'a' and substr != 'i':
return -1000
for character in substr:
if character in vowels:
v_count+=1
if v_count ==0 and substr != 'by':
#print 'DLG of {} is {}'.format(substr,-1000)
return -1000
if substr_len == 3 and substr[-2:]=='le' :
#print 'Illegal segment ending in le ',substr
return -1000
if substr[0:2] in {'db','km','lp','mp','ns','ms','td','kd','md','ld','bd','cd','fd','gd','hd','jd','nd','pd','qd','rd','sd','vd','wd','xd','yd','zd'}:
#print 'Illegal onset ',substr
return -1000
if substr[-2:] in {'db','km','td','kd','md','bd','cd','fd','gd','hd','jd','pd','qd','sd','vd','wd','xd','yd','zd'}:
return -1000
if substr_len > 3:
flag = False
for c in substr[0:3]:
if c in vowels:
flag = True
break
if flag == False:
if substr[0] == 's':
flag = True
if flag == False:
#print 'Beginning with three consonants : ',substr
return -1000
return 1
def calc_new_DL(unigram_freq,substr,substr_freq,new_corpus_len):
new_DL = 0
for unigram,freq in unigram_freq.iteritems():
substr_key_freq = occurrences(substr,unigram)
#print '{} with freq {} occurs {} times in {}'.format(key,value,csx,s)
new_freq = freq - substr_freq*substr_key_freq + substr_key_freq
#print 'value ',value,'cs ',cs,'csx',csx,'n ',n;
new_DL += new_freq * (math.log(new_freq,2) -math.log(1.0*new_corpus_len,2))
#if unigram == 's' or unigram == 'e':
# print 'freq of ',unigram,' is ',str(new_freq),'; code length is ',str(math.log(new_freq,2) -math.log(1.0*new_corpus_len,2))
#print 'new_DL for unigrams only ',str(-1*new_DL)
#print 'code length is ',str(math.log(substr_freq,2) - math.log(1.0*new_corpus_len,2))
new_DL += substr_freq* (math.log(substr_freq,2) - math.log(1.0*new_corpus_len,2))
new_DL = -1*new_DL
return new_DL
def DLG(corpus,unigram_freq,DL,substr):
''' calculates the gain in description length brought about by
a sequence s
freq : dict containing the frequency of unigrams
text : space removed continous corpus text in lowercase
s : the subsequence
'''
lex_qual = lexical_quality(substr)
if lex_qual <=0:
#print 'DLG of {} is {}'.format(substr,lex_qual)
return lex_qual
corpus_len = len(corpus)
substr_len = len(substr)
substr_freq = occurrences(corpus,substr)
new_corpus_len = corpus_len - substr_freq*substr_len + substr_freq + substr_len + 1
#print s+' occurs '+ str(cs) + ' times'
new_DL = calc_new_DL(unigram_freq,substr,substr_freq,new_corpus_len)
#print substr,' ',substr_freq,' times. ' ,DL,' ',new_DL
dlg = (DL - new_DL) /substr_freq
#print 'DLG of {} is {}'.format(substr,dlg)
#raw_input('')
#norm_DLG = dlg*1.0/DL
#if dlg<0 :
#dlg = 0
#print dlg
return dlg #normalised to be in the range 0 to 1
def OpSeg(corpus,unigram_freq,DL,text):
n = len(text)
OS = []
DLG_stored = []
maxDLG = 0
for k in range(0,n):
if k>30:
OS[k-30][:]=[]
OS.append([])
if k>0:
OS[k][:]= []
OS[k] = copy.deepcopy(OS[k-1])
OS[k].append(text[k])
DLG_stored.append(DLG_stored[k-1]+DLG(corpus,unigram_freq,DL,text[k]))
else:
OS[k][:]=[]
OS[k].append(text[k])
DLG_stored.append(DLG(corpus,unigram_freq,DL,text[k]))
for j in range(k-1,-1,-1):
if j < k-25:
break
ngram = text[j:k+1]
#print 'j value : ',str(j),' k value ',str(k),' current ngram ',ngram
if occurrences(corpus,ngram)<2:
break
#if len(ngram) == 1 : #commented to include constraint
# dlgain = DLG_stored[j-1]
if j>0:
dlgain = DLG_stored[j-1]+ DLG(corpus,unigram_freq,DL,ngram)
else:
dlgain = DLG(corpus,unigram_freq,DL,ngram)
'''print DLG_stored
print 'new DL', dlgain
print 'DLG_stored is ',DLG_stored[k]'''
#print type(dlgain), type(lexgain), type(DLG_stored[k]), type(lex_stored[k])
# if (ALPHA*dlgain + BETA*lexgain) > (ALPHA*DLG_stored[k] + BETA*lex_stored[k]):
if (dlgain > DLG_stored[k]):
if j>0:
OS[k][:] = []
OS[k] = copy.deepcopy(OS[j-1])
OS[k].append(ngram)
#print 'ngram is ',ngram
DLG_stored[k] = dlgain
elif j==0:
OS[k][:]=[]
OS[k].append(ngram)
DLG_stored[k] = dlgain
print 'OS[{}] is now assigned {} with dlgain {} '.format(k,OS[k],DLG_stored[k])
#raw_input('')
return OS[n-1]