-
Notifications
You must be signed in to change notification settings - Fork 0
/
Asg1.py
148 lines (123 loc) · 4.51 KB
/
Asg1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#AI assignment1
from __future__ import division
def setOfWords2Vec(vocabList, inputSet):
    """Return a 0/1 presence vector of the words in inputSet over vocabList.

    Args:
        vocabList: sequence of vocabulary words; position i of the result
            corresponds to vocabList[i].
        inputSet: iterable of words (e.g. the tokens of one sentence).

    Returns:
        A list of len(vocabList) integers; entry i is 1 when vocabList[i]
        occurs in inputSet and 0 otherwise. If a word appears several times
        in vocabList only its first position is set, matching list.index().
    """
    # Build the word -> first position map once so each lookup is O(1)
    # instead of scanning the vocabulary list twice per input word.
    first_position = {}
    for position, vocab_word in enumerate(vocabList):
        first_position.setdefault(vocab_word, position)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        position = first_position.get(word)
        if position is not None:
            returnVec[position] = 1
        # Words that are not in the vocabulary are deliberately ignored.
    return returnVec
def storeSentence(sentence):
    """Split a sentence into a list of whitespace-stripped tokens.

    The sentence is split on single space characters only, so consecutive
    spaces (or an empty sentence) produce empty-string tokens; callers that
    match tokens against a vocabulary without '' are unaffected by them.

    Args:
        sentence: the text of one line.

    Returns:
        A list with one stripped token per space-separated piece.
    """
    return [word.strip() for word in sentence.split(' ')]
#----------------------------------start the program------------------------------------
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print('----------------------------Start the program-------------------------------')
print('----------------------------Pre-processing Step-----------------------------')

# Input files. NOTE: these are absolute paths on the original author's
# machine and must be adjusted before running the script elsewhere.
traindata_file = '/Users/chongli/PycharmProjects/NaivebayesClassifier/traindata.txt'
trainlables_file = '/Users/chongli/PycharmProjects/NaivebayesClassifier/trainlabels.txt'
stoplist_file = '/Users/chongli/PycharmProjects/NaivebayesClassifier/stoplist.txt'

# Containers filled in by the pre-processing steps below. The spelling
# "Lable" is kept because the rest of the script refers to these names.
vocabulary = []            # words kept as features
stoplist = []              # words to exclude from the vocabulary
withoutLable_vectors = []  # one 0/1 feature vector per training line
withLable_vectors = []     # feature vectors with the class label appended
M = 0                      # vocabulary size, set once the vocabulary is built
#-------convert stoplist_file into a LIST------------
# Every space-separated piece of every line is stripped and collected as a
# stop word; the list is then sorted in place.
with open(stoplist_file, 'r') as s:
    for stop_line in s:
        stoplist.extend(piece.strip() for piece in stop_line.split(' '))
stoplist.sort()
#-------form a vocabulary------------
# The vocabulary is the sorted list of distinct tokens in the training data,
# without the empty token and without any stop word.
unique_words = set(vocabulary)
with open(traindata_file, 'r') as td:
    for line in td:
        unique_words.update(word.strip() for word in line.split(' '))
# discard() does not fail when there is no empty token, unlike list.remove().
unique_words.discard('')
# One set operation replaces a list.remove() call per stop word.
unique_words.difference_update(stoplist)
vocabulary = sorted(unique_words)
M = len(vocabulary)  # size of the vocabulary
# (debug) print each entry of vocabulary and M here to inspect the result
#--------form vectors-----------------
# Second pass over the training data: each line is stripped, tokenised with
# storeSentence() and turned into a 0/1 vector over the vocabulary.
with open(traindata_file, 'r') as td:
    withoutLable_vectors.extend(
        setOfWords2Vec(vocabulary, storeSentence(raw_line.strip()))
        for raw_line in td
    )
#----add labels in each end-------
# Read one integer class label per line of the label file.
trainlabels = []
with open(trainlables_file, 'r') as d:
    for line in d:
        trainlabels.append(int(line.strip()))

# Every training vector needs exactly one label; fail with a clear message
# instead of an IndexError (too few labels) or silently ignoring extras.
if len(trainlabels) != len(withoutLable_vectors):
    raise ValueError(
        'label count (%d) does not match training vector count (%d)'
        % (len(trainlabels), len(withoutLable_vectors))
    )

# Append the label to a copy of each vector. Pairing starts at the first
# sample: the earlier range(1, ...) loop skipped index 0, so the first
# training example never reached withLable_vectors.
for vector, label in zip(withoutLable_vectors, trainlabels):
    withLable_vectors.append(vector + [label])
#--------store the result into the preprocessed.txt file
# Layout: first line is the vocabulary, then one line per labelled vector.
# Every value is followed by a comma (including the last one on a line).
filename = "preprocessed.txt"
# 'with' guarantees the file is closed even if a write fails.
with open(filename, 'w') as target:
    target.write(''.join(item + ',' for item in vocabulary))
    # End the vocabulary header line; without this newline the first vector
    # was written onto the same line as the vocabulary.
    target.write('\n')
    for item in withLable_vectors:
        target.write(''.join(str(eachNum) + ',' for eachNum in item))
        target.write('\n')
#preprocessing step
#1 (checked) put the stop words into a list
#2-1 (checked) put the training data into a list
#2-2 (checked) remove the stop words and convert the rest into the vocabulary, size = M
#3 (checked) compare each sentence with the vocabulary and combine it with trainlabels.txt to form the vectors
#[[ + class label],[],[]...]
#4 (checked) store the vocabulary with the vectors into the preprocessed.txt file (need to be finished)
#classifier step
#---------get parameters--------------
# Class priors: the fraction of training labels that are 0 and the fraction
# that are anything else (treated as class 1). The labels were already read
# into trainlabels, so the label file is not read a second time.
filecount = len(trainlabels)
if filecount == 0:
    # Avoid an opaque ZeroDivisionError when the label file is empty.
    raise ValueError('no training labels found in %s' % trainlables_file)
file0Num = trainlabels.count(0)
file1Num = filecount - file0Num
# True division is in effect via "from __future__ import division".
p_class0 = file0Num / filecount
p_class1 = file1Num / filecount
print(p_class0)
print(p_class1)
# Get p_word_c from traindata.txt.
# Plan: create a dictionary for each class c: {word: count, word: count}
class0_dic = {}
class1_dic = {}
# How to calculate? A naive approach:
# - Use withLable_vectors and divide the vectors into class0_vec_list and
#   class1_vec_list.
# - For each vec_list, count how many words appear in it; the sum is the
#   total number of words in that class.
# - To count a specific word, get its index in the vocabulary, use that
#   index to check each vector of the class, count how many times the word
#   appears and update the value in class0_dic{} (or class1_dic{}).
#
# Remaining work:
# - smoothing
# - convert the probabilities to logs
# - predict testdata.txt
# - rewrite the script in an object-oriented style