-
Notifications
You must be signed in to change notification settings - Fork 1
/
jobProcessing.py
239 lines (177 loc) · 6.78 KB
/
jobProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# Text PreProcessing
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
import math
def main():
indeedText = 'Indeed.txt'
usaaText = 'USAA.txt'
indeedName = 'Indeed'
usaaName = 'USAA'
# Tokenize and apply Porter filter
myDimensionalize(indeedText, indeedName)
myDimensionalize(usaaText, usaaName)
vectorSpace = makeVectorSpace(indeedName, usaaName) # Vector Space with one of each 'word' in both documents
makeBinaryVectorSpace(vectorSpace, indeedName, usaaName) # Binary Vector Space
rtf = makeRTFVectorSpace(vectorSpace, indeedName, usaaName) # Raw Term Frequeny (RTF) Vector Space
ntf = makeNormalTF(rtf, indeedName, usaaName) # Normalized Vector Space
idf = makeTFIDF(ntf, indeedName, usaaName) # TF-IDF Calculations
# COSINE comparisons
cos1 = makeCOScompare(idf, indeedName, usaaName)
cosIs(cos1, indeedName, usaaName)
# ============= Called Functions ==============================================
def myDimensionalize(txDoc, txName):
myToken(txDoc, txName)
myPorter(txDoc, txName)
def myToken(tDoc, tName):
thisDoc = open(tDoc).read()
thisToken = word_tokenize(thisDoc) # tokenize
print('The length of the initial lexicon for ', tName, ' is ', len(thisToken)) # display token info
def myPorter(pDoc, pName):
# stop word removal, print dimensionality
porterDoc = open(pDoc).read()
stop_words=set(stopwords.words("english"))
words = word_tokenize(porterDoc)
filtered_sentence=[] #create empty list for filtered sentence go into
for w in words:
if w not in stop_words:
filtered_sentence.append(w) #this will create a list of non-stopword words in sentence
print('The length of the ', pName, ' lexicon without stop words is ', len(filtered_sentence))
makeFile(filtered_sentence, "filtered "+pName) # for rtf and normalization process later
# apply Porter filter
ps=PorterStemmer()
porter_sentence=[]
reducedPorter=[]
for x in filtered_sentence:
porter_sentence.append(ps.stem(x))
for y in porter_sentence:
if y not in reducedPorter:
reducedPorter.append(y)
print('The length of the lexicon after applying the Porter filter to ', pName, ' is ', len(reducedPorter))
makeFile(reducedPorter, pName)
def makeFile(sentence, fName):
newFile = open(fName+".txt", "w")
for eachLine in sentence:
newFile.write(eachLine + " ") # Added space between words to read later
newFile.close()
def makeVectorSpace(aName, bName): # see myPorter where applied Porter filter
aList = list(open(aName+".txt"))
bList = list(open(bName+".txt"))
vsList = aList + bList
list(set(vsList)) # removes duplicates as set and returns an unordered list
# print(vsList)
listFile= "vectorSpace.txt"
newFile = open(listFile, 'w')
for eachLine in vsList:
newFile.write(eachLine)
newFile.close()
print("Creation of vectorSpace.txt successful!")
return listFile
aList.close()
bList.close()
def fileToList(file):
aFile = open(file,"r")
aText=[]
for each in aFile:
aText.append(each)
word_list=[]
for item in aText:
for word in item.split():
word_list.append(word)
return word_list
aFile.close()
def makeDataFrame(file):
vSpace = fileToList(file)
data = {'Words': vSpace}
myVec = pd.DataFrame(data, columns = ['Words']) # Successful creation of panda dataframe
return myVec
def addBlankCol(dataFrame, colTitle):
dataFrame[colTitle] = range(len(dataFrame))
dataFrame.loc[:,colTitle] = np.array([0] * len(dataFrame))
def biCompare(dataFrame, mainTitle, compareList, colTitle): # Engine of binary vector creation
for index, row in dataFrame.iterrows():
for word in compareList:
if dataFrame.at[index,mainTitle] == word:
dataFrame.at[index,colTitle] = 1
def makeBinaryVectorSpace(vsFile, doc1, doc2):
vsDF = makeDataFrame(vsFile) # vector space file turned dataframe
biTitle1 = "binary - "+doc1
biTitle2 = "binary - "+doc2
doc1List = fileToList(doc1+".txt")
doc2List = fileToList(doc2+".txt")
addBlankCol(vsDF, biTitle1)
addBlankCol(vsDF, biTitle2)
biCompare(vsDF, 'Words', doc1List, biTitle1)
biCompare(vsDF, 'Words', doc2List, biTitle2)
print(vsDF)
print("Completed binary vector space")
def rtfCompare(dataFrame, mainTitle, compareList, colTitle): # Engine of RTF creation
for index, row in dataFrame.iterrows():
for word in compareList:
a = dataFrame.at[index,colTitle]
if dataFrame.at[index,mainTitle] == word:
a+=1
dataFrame.at[index,colTitle] = a
def makeRTFVectorSpace(vsFile, doc1, doc2):
vsRTF = makeDataFrame(vsFile)
rtfTitle1= "rtf - "+doc1
rtfTitle2 = "rtf - "+doc2
doc1List = fileToList("filtered "+doc1+".txt")
doc2List = fileToList("filtered "+doc2+".txt")
addBlankCol(vsRTF, rtfTitle1)
addBlankCol(vsRTF, rtfTitle2)
print("Making raw term frequency columns in dataframe")
rtfCompare(vsRTF, 'Words', doc1List, rtfTitle1)
rtfCompare(vsRTF, 'Words', doc2List, rtfTitle2)
return vsRTF
def compNorm(dFrame, rtfCol, normCol): # Engine of Normalization function
dFrame[normCol] = dFrame[normCol].astype(np.float)
normalSum = dFrame[rtfCol].sum()
for index, row in dFrame.iterrows():
b = dFrame.at[index,rtfCol]
c = b/normalSum
dFrame.at[index,normCol] = c
def makeNormalTF(makeRTF, doc1, doc2):
rtfTitle1= "rtf - "+doc1
rtfTitle2 = "rtf - "+doc2
normTitle1 = "normal - "+doc1
normTitle2 = "normal - "+doc2
addBlankCol(makeRTF, normTitle1)
addBlankCol(makeRTF, normTitle2)
print("Making normalized columns in dataframe")
compNorm(makeRTF, rtfTitle1, normTitle1)
compNorm(makeRTF, rtfTitle2, normTitle2)
return makeRTF
def makeTFIDF(ntf, doc1, doc2):
rtfTitle1= "rtf - "+doc1
rtfTitle2 = "rtf - "+doc2
addBlankCol(ntf, "TFIDF")
addBlankCol(ntf, "sum")
ntf["TFIDF"] = ntf["TFIDF"].astype(np.float)
print("Making TF-IDF calculations...")
for index, row in ntf.iterrows():
a = ntf.at[index,rtfTitle1]
b = ntf.at[index,rtfTitle2]
rtfSum = a+b
ntf.at[index,"sum"]=rtfSum
thisIDF = math.log(2/(1+rtfSum)) # using constant '2' since only comparing total of 2 documents
thisTFIDF = rtfSum * thisIDF
ntf.at[index,"TFIDF"] = thisTFIDF
print(ntf)
return ntf
def makeCOScompare(tfidf, doc1, doc2): # make cosine comparison of two documents
normTitle1 = "rtf - "+doc1
normTitle2 = "rtf - "+doc2
print("making COSINE comparison between " + doc1 + " and " + doc2)
s = tfidf[normTitle1]
t = tfidf[normTitle2]
x = cosine(s,t)
return x
def cosIs(num, docA, docB):
print("The cosine of "+docA+" and "+docB+" is ")
print(num)
# =========== call main function ==============================================
main()