-
Notifications
You must be signed in to change notification settings - Fork 0
/
stemmer.py
60 lines (48 loc) · 1.24 KB
/
stemmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from nltk.stem import LancasterStemmer
class syllabify():
    """Rule-based helper that marks syllable boundaries in a word with '_'.

    ``stemmed`` reduces a word to its Lancaster stem and ``parse`` inserts
    the underscores.  A constructor that was sketched earlier (and left
    disabled) combined the two: it stemmed the word, ran ``parse`` on the
    stem and re-attached the stripped suffix after an extra underscore.
    """

    def stemmed(self, word):
        """Return the stem of ``word`` as produced by NLTK's LancasterStemmer."""
        stemmer = LancasterStemmer()
        return stemmer.stem(word)

    def parse(self, word):
        """Return ``word`` with '_' inserted at heuristic syllable breaks.

        Only the lowercase letters a, e, i, o, u count as vowels.  The word
        is scanned left to right and three rules are applied:

        * a final 'e' puts a break before the last three letters
          ("apple" -> "ap_ple") and ends the scan;
        * vowel, consonant, consonant, vowel is split between the two
          consonants ("window" -> "win_dow") and the scan resumes after
          the second vowel;
        * any other vowel that is not the first letter and is not followed
          by a vowel puts a break right after the closest earlier vowel
          that is at least two positions back ("banana" -> "ba_na_na").

        This is a rough heuristic; it does not recognise silent letters or
        consonant digraphs, so some splits are linguistically wrong.
        Empty input is returned unchanged.
        """
        vowels = frozenset('aeiou')
        letters = word            # untouched copy; `word` grows as '_' are added
        length = len(letters)
        i = 0                     # position in `letters`
        w_index = 0               # same position in `word` (i + underscores so far)

        # A while loop is required: the consonant-cluster rule must really
        # skip ahead, which reassigning the variable of a for/range loop
        # cannot do.
        while i < length:
            ch = letters[i]

            if i == length - 1 and ch == 'e':
                # Words of three letters or fewer would otherwise come back
                # with a leading underscore.
                if len(word) > 3:
                    word = word[:-3] + '_' + word[-3:]
                break

            # A missing next letter counts as "not a vowel" so that words
            # ending in a vowel no longer read past the end of the string.
            next_is_vowel = i + 1 < length and letters[i + 1] in vowels

            if ch in vowels and i > 0 and not next_is_vowel:
                if (i < length - 3
                        and letters[i + 2] not in vowels
                        and letters[i + 3] in vowels):
                    cut = w_index + 2
                    word = word[:cut] + '_' + word[cut:]
                    # Jump over the four letters of the pattern; w_index
                    # moves one further to account for the new underscore.
                    i += 4
                    w_index += 5
                else:
                    # Look backwards, starting two positions before the
                    # current vowel, for the previous vowel.
                    for j in range(w_index - 2, -1, -1):
                        if word[j] in vowels:
                            word = word[:j + 1] + '_' + word[j + 1:]
                            w_index += 1
                            break

            i += 1
            w_index += 1

        return word