-
Notifications
You must be signed in to change notification settings - Fork 0
/
Main.py
225 lines (182 loc) · 7.59 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import re
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from collections import Counter
import numpy as np
import random
def make_matrix(tweets, features):
    """Build a document-term style DataFrame: one row per tweet, one column
    per feature, each cell holding the count of that feature in the tweet."""
    # str.count gives the number of (non-overlapping) occurrences of the
    # feature substring within the tweet.
    counts = [[tweet.count(feat) for feat in features] for tweet in tweets]
    df = pd.DataFrame(counts)
    df.columns = features
    return df
def Label(df, target):
    """Return the values of column *target* in *df* as a list of strings."""
    return [str(value) for value in df[target]]
def Tweets(df):
    """Return the cleaned text of df['Tweet']: lowercased, stripped of every
    character that is not a word char/whitespace/digit/'#' (hashtags kept),
    and finally with all whitespace removed."""
    removelist = "#"
    cleaned = []
    for raw in df['Tweet']:
        text = str(raw).lower()
        # Drop punctuation etc., keeping word chars, whitespace, digits and '#'.
        text = re.sub(r'[^\w\s\d' + removelist + ']', '', text)
        # Collapse runs of whitespace to a single space...
        text = re.sub("\s+", " ", text)
        # ...then strip the spaces entirely, leaving a compact token string.
        text = re.sub(" ", "", text)
        cleaned.append(text)
    return cleaned
def Features(target, base_dir='/home/divyat/Desktop/RTE/Data/TrainingData'):
    """Load the feature-word list for *target* from <base_dir>/<target>.txt.

    Each line of the file is one feature, returned in file order. Note that
    split("\n") keeps a trailing empty string when the file ends with a
    newline — preserved from the original behavior, since downstream counting
    tolerates it.

    base_dir defaults to the original hard-coded training-data directory so
    existing callers are unaffected; tests (and future reuse) can point it
    elsewhere.
    """
    # 'with' guarantees the handle is closed even if read() raises; the
    # original also built the list element-by-element from an existing list,
    # which was a redundant copy.
    with open(base_dir + '/' + target + '.txt', 'r') as f:
        return f.read().split("\n")
def Analysis(clf, matrix, label, target, Classes,
             out_path='/home/divyat/Desktop/RTE/Data/TrainingData/Analysis.txt'):
    """Append an evaluation report for *clf* on (matrix, label) to *out_path*.

    Writes, per class in *Classes*, the gold count and the predicted count,
    then the number of misclassified rows and clf.score accuracy.

    out_path defaults to the original hard-coded report file so existing
    callers are unaffected.
    """
    # Predict once and reuse. The original called clf.predict(matrix) again
    # for every class inside the loop, repeating the whole inference pass.
    predicted = clf.predict(matrix).tolist()
    # A row is wrong when the prediction differs from the gold label at the
    # same index.
    error = sum(1 for pred, gold in zip(predicted, label) if pred != gold)
    # 'with' + append mode: report accumulates across calls, handle always closed.
    with open(out_path, 'a') as f:
        f.write('Analysis for ' + target + '\n' + '\n')
        for Class in Classes:
            # Gold count of this class in the label set.
            # (Added the missing space after "of" — the original printed
            # e.g. "Old Count ofFAVOR: ...".)
            f.write('Old Count of ' + Class + ': ' + str(label.count(Class)) + '\n')
            # Count of this class among the predictions.
            f.write('New Count of ' + Class + ': ' + str(predicted.count(Class)) + '\n' + '\n')
        f.write('Total Wrong Cases: ' + str(error) + "\n")
        f.write('Accuracy: ' + str(clf.score(matrix, label, sample_weight=None)) + '\n' + '\n')
def train_svm_stance(target, sec_target):
    """Train a stance SVM on the subset of the global training frame whose
    predicted secondary target equals *target*.

    sec_target: array of predicted secondary-target labels, row-aligned with
    the global DataFrame `data`.
    Returns the fitted classifier, or the sentinel 0 when no row was
    predicted as *target* (callers filter these out).
    """
    # No training rows for this secondary target -> nothing to fit.
    if target not in sec_target:
        return 0
    # Rows of the training frame predicted as this secondary target
    # (boolean-mask selection via elementwise comparison).
    sub_data = data[target == sec_target]
    # Cleaned tweet text for the subset.
    sub_tweets = Tweets(sub_data)
    # Feature words selected offline (high PMI) for this target.
    sub_features = Features(target)
    # Count matrix: rows = tweets, columns = features.
    sub_matrix = make_matrix(sub_tweets, sub_features)
    # Gold stance labels for the subset.
    sub_label = Label(sub_data, 'Stance')
    clf = svm.SVC(kernel='linear')
    # Was Python-2 `print clf` (a syntax error on Python 3); print() with a
    # single argument produces identical output on both.
    print(clf)
    clf.fit(sub_matrix, sub_label)
    # Optional evaluation of the trained stance classifier:
    # Analysis(clf, sub_matrix, sub_label, target, ['FAVOR', 'AGAINST', 'NONE'])
    return clf
def train_svm(tweets, features):
    """Train the secondary-target SVM on the full training set, then one
    stance SVM per secondary target that appears in its predictions.

    Returns [target_clf, stance_clf, ...] — index 0 classifies tweets into
    secondary targets, the rest classify stance within one target each.
    Empty-subset sentinels (0) are filtered out before returning.
    """
    # Count matrix for the whole training set.
    matrix = make_matrix(tweets, features)
    # Gold secondary-target labels from the global training frame.
    label = Label(data, 'Local Target')
    clf = svm.SVC(kernel='linear')
    # Was Python-2 `print clf`; print() is output-identical for one argument.
    print(clf)
    clf.fit(matrix, label)
    # Optional evaluation of the secondary-target classifier:
    # Analysis(clf, matrix, label, 'Secondary Targets', ['Baby Rights','Women Rights','Christianity','Adoption','Abortion','Other'])
    # Predict once and reuse — the original re-ran clf.predict(matrix) for
    # every stance classifier it trained.
    predicted = clf.predict(matrix)
    classifiers = [clf]
    # 'Christianity' intentionally skipped, matching the original
    # (its train_svm_stance call was commented out).
    for sec_target in ['Baby Rights', 'Women Rights', 'Adoption', 'Abortion', 'Other']:
        classifiers.append(train_svm_stance(sec_target, predicted))
    # Drop the 0 sentinels (targets with no predicted training rows).
    # BUG FIX: the original removed items from `classifiers` while iterating
    # the same list, which skips the element after each removal, so
    # consecutive sentinels could survive. Filtering into a new list is correct.
    return [c for c in classifiers if c != 0]
#Main Code Starts Here
# Load training and test sets from Excel (paths are machine-specific).
data = pd.read_excel('/home/divyat/Desktop/RTE/Data/TrainingData/Training_Set_New.xlsx', index_col=None, na_values=['NA'])
data_test = pd.read_excel('/home/divyat/Desktop/RTE/Data/TestData/test_data.xlsx', index_col=None, na_values=['NA'])
# Clean the training tweets and load the global (all-target) feature list.
tweets = Tweets(data)
features = Features('features')
# classifiers[0] predicts the secondary target; classifiers[1:] predict
# stance within one secondary target each (empty-subset sentinels removed).
classifiers = train_svm(tweets, features)
# Featurize the test tweets with the same global feature list.
test_tweets = Tweets(data_test)
matrix = make_matrix(test_tweets, features)
stance_predicted = []
list_target = ['Baby Rights', 'Women Rights', 'Christianity', 'Adoption', 'Abortion', 'Other']
# Secondary-target prediction for every test tweet.
target_predicted = classifiers[0].predict(matrix)
# Keep only targets that actually occur in the predictions.
# BUG FIX: the original removed items from list_target while iterating the
# same list, which skips the element after each removal; building a new
# filtered list is correct.
list_target = [t for t in list_target if t in target_predicted]
# NOTE(review): the loop below assumes list_target[i] lines up with
# classifiers[i+1] after both lists were independently filtered, and that
# concatenating per-target stance predictions matches data_test row order —
# neither is guaranteed by construction; confirm before trusting the output.
for i in range(0, len(classifiers) - 1):
    # Test rows predicted as this secondary target.
    new_data = data_test[list_target[i] == target_predicted]
    new_tweets = Tweets(new_data)
    # Per-target feature list and count matrix.
    new_features = Features(list_target[i])
    new_matrix = make_matrix(new_tweets, new_features)
    # Stance prediction for this target's rows, appended in target order.
    stance_predicted = stance_predicted + classifiers[i + 1].predict(new_matrix).tolist()
# Attach both prediction columns to the test frame and write it back out.
data_test['Target_Predicted'] = pd.Series(target_predicted, index=data_test.index)
data_test['Stance_Predicted'] = pd.Series(np.asarray(stance_predicted), index=data_test.index)
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('/home/divyat/Desktop/RTE/Data/TestData/test_data.xlsx', engine='xlsxwriter')
# Convert the dataframe to an XlsxWriter Excel object.
data_test.to_excel(writer, sheet_name='Sheet1')
# Close the Pandas Excel writer and output the Excel file.
writer.save()