-
Notifications
You must be signed in to change notification settings - Fork 9
/
nlp_test_wang_stemming.py
executable file
·247 lines (179 loc) · 6.31 KB
/
nlp_test_wang_stemming.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
from __future__ import division, unicode_literals
import math
from textblob import TextBlob as tb
import nltk
#from nltk.stem.lancaster import LancasterStemmer # stemer st = LancasterStemmer() st.stem('saying')
#from nltk.stem.porter import * # poster stemmer
#from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
from nltk import cluster
import os
from os import listdir
from os.path import isfile, join
import time
def tf(word, blob):
    """Term frequency: fraction of the tokens in ``blob`` equal to ``word``."""
    occurrences = blob.words.count(word)
    return occurrences / len(blob.words)
def n_containing(word, bloblist):
    """Document frequency: number of blobs in ``bloblist`` containing ``word``."""
    count = 0
    for blob in bloblist:
        if word in blob:
            count += 1
    return count
def idf(word, bloblist):
    """Inverse document frequency of ``word`` over ``bloblist``.

    Fix: the original divided by ``n_containing(...)`` unguarded, raising
    ZeroDivisionError for a word that appears in no document. Such a word
    carries no discriminative signal, so return 0.0 instead; results for
    words that do occur are unchanged.
    """
    df = n_containing(word, bloblist)
    if df == 0:
        return 0.0
    return math.log(len(bloblist) / df)
def tfidf(word, blob, bloblist):
    """TF-IDF score of ``word`` in ``blob`` relative to the corpus ``bloblist``."""
    weight = idf(word, bloblist)
    return tf(word, blob) * weight
def buildVector(iterable1, iterable2):
    """Align two iterables of (key, score) pairs into equal-length vectors.

    The component order follows the set union of both key sets; a key
    missing from one side contributes 0 in that side's vector.
    Returns (vector1, vector2).
    """
    scores1 = dict((item[0], item[1]) for item in iterable1)
    scores2 = dict((item[0], item[1]) for item in iterable2)
    vector1 = []
    vector2 = []
    for key in set(scores1.keys()).union(set(scores2.keys())):
        vector1.append(scores1.get(key, 0))
        vector2.append(scores2.get(key, 0))
    return vector1, vector2
def cosine_sim(v1, v2):
    """Cosine similarity of two vectors, via nltk's cosine distance (1 - d)."""
    distance = cluster.util.cosine_distance(v1, v2)
    return 1 - distance
cachedStopWords = stopwords.words("english")

def stemming(doc):
    """Tokenize ``doc``, drop English stopwords, lemmatize every token,
    and return the tokens re-joined as a TextBlob.

    NOTE: despite the name, this lemmatizes (WordNet), it does not stem.
    """
    tokens = [t for t in toker.tokenize(doc) if t not in cachedStopWords]
    lemmas = [lemma.lemmatize(t) for t in tokens]
    return tb(" ".join(lemmas))
wordList = {}

def tfidf_list(bloblist):
    """Compute per-document TF-IDF rankings for every blob in ``bloblist``.

    Side effect: fills the module-level ``wordList`` cache with each word's
    IDF so repeated calls reuse earlier values.

    Returns a list with one entry per blob: [(word, score), ...] sorted by
    descending score.

    Fixes: Python-2-only ``print`` statements replaced by the parenthesized
    form (identical output for a single argument in both Python 2 and 3);
    O(n^2) ``list.count`` per word replaced by a single ``Counter`` pass;
    dead commented-out code removed.
    """
    totallength = len(bloblist)
    # Deduplicate each document once so each document-frequency test is O(1).
    bloblistidf = [set(b.words) for b in bloblist]
    for k, blob in enumerate(bloblistidf):
        if k % 100 == 0:
            print("make dict : " + str(k))
        for word in blob:
            if word not in wordList:
                wordList[word] = math.log(totallength / n_containing(word, bloblistidf))
    del bloblistidf
    listlist = []
    for i, blob in enumerate(bloblist):
        bloblength = len(blob.words)
        print("Top words in document {}".format(i + 1))
        # One Counter pass gives every term count at once instead of a
        # linear blob.words.count() scan per word.
        counts = Counter(blob.words)
        # Skip tokens whose first character is a BOM left over from decoding.
        scores = {word: (counts[word] / bloblength) * wordList[word]
                  for word in counts if word[0] != '\ufeff'}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        listlist.append(sorted_words)
    return listlist
def filestring(f):
    """Read three consecutive sections (title, abstract, body) from the
    already-open binary file ``f`` and return them as a 3-tuple of
    space-joined unicode strings.

    Each section accumulates lines until ``readline()`` yields an empty
    string. NOTE(review): readline() returns "" only at end-of-file (a
    blank line still carries its newline), so the first section appears to
    consume the whole file — confirm against the actual input format.
    """
    def _read_section():
        # Join lines with a single leading space each, until readline
        # returns the empty string.
        text = u""
        while True:
            line = f.readline().decode('utf-8')
            if not line:
                break
            text = u' '.join([text, line])
        return text

    title = _read_section()
    abstract = _read_section()
    body = _read_section()
    return title, abstract, body
# Cosine-similarity cutoff used by the (commented-out) document-network
# section at the bottom of the file.
threshold = 0.00
####################################################################################
# WordNet lemmatizer used by stemming(); despite that function's name,
# this lemmatizes rather than stems.
lemma = nltk.wordnet.WordNetLemmatizer()
# gaps=True: the pattern matches the separators (punctuation runs), so
# tokenize() returns the word spans between them.
toker = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)
# The three class directories; every *.txt file in each is collected into
# fileList in directory order (0, then 1, then 2).
files0_dir = './0/'
files1_dir = './1/'
files2_dir = './2/'
files0 = os.listdir(files0_dir)
files1 = os.listdir(files1_dir)
files2 = os.listdir(files2_dir)
print("start !!!!")
# Fixes: Python-2-only print statement parenthesized (same output in both
# interpreters); three copy-pasted loops collapsed into one; unused
# enumerate indices dropped.
fileList = []
for dirname, names in ((files0_dir, files0),
                       (files1_dir, files1),
                       (files2_dir, files2)):
    for name in names:
        # Same filter as the original: last three characters are "txt".
        if name[-3:] == "txt":
            fileList.append(dirname + name)
######################################################
# Read every file once; keep "title abstract body" as one string per doc.
# Fixes: Python-2-only print statement parenthesized; file handle now
# closed even if filestring() raises (with-statement instead of
# open/close pair).
whole_doc = []
for j, path in enumerate(fileList):
    if j % 100 == 0:
        print("sum whole doc title, abstract, body : " + str(j))
    with open(path, 'r') as f:
        title, abstract, body = filestring(f)
    whole_doc.append(title + " " + abstract + " " + body)
# Lowercase and lemmatize every raw document; the raw text is released
# afterwards to cap peak memory.
# Fixes: Python-2-only print statement parenthesized; del(x) -> del x;
# dead commented-out code removed.
document_list = []
for j, raw in enumerate(whole_doc):
    if j % 100 == 0:
        print("stemming : " + str(j))
    document_list.append(stemming(raw.lower()))
del whole_doc
# Write each processed document under ./s/, mirroring the source file name:
# path[4:-4] strips the 4-char "./N/" prefix and the ".txt" suffix.
# Fix: output handle is now closed even if a write raises (with-statement
# instead of open/close pair).
for j, path in enumerate(fileList):
    with open("./s/" + path[4:-4] + ".txt", "w") as out:
        # NOTE(review): this iterates a TextBlob directly — confirm it
        # yields the text units (characters vs. words) you expect.
        for token in document_list[j]:
            out.write(token.encode('utf-8'))
#############################################################################################
# cosine sim
# NOTE(review): the pairwise-similarity "network" step below is disabled
# (it sits in a bare string literal, so it never executes). It references
# `listlist`, which is only produced by tfidf_list() — and tfidf_list() is
# never called in this script — so enabling it as-is would raise a
# NameError; it also uses Python-2-only print statements.
#v1,v2= buildVector(listlist[2], listlist[0])
#print cosine_sim(v1,v2)
#network
"""
for i, blob1 in enumerate(listlist):
    for j, blob2 in enumerate(listlist[i+1:]):
        blob1_v,blob2_v= buildVector(blob1, blob2)
        temp_cos = cosine_sim(blob1_v,blob2_v)
        #print temp_cos
        if temp_cos >= threshold:
            print i , j+i+1 , temp_cos
"""