-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier train.py
138 lines (89 loc) · 5.64 KB
/
classifier train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#------------------------------------------------------------------------------------------------------------------------------------------
## import the classes of classifiers
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
#------------------------------------------------------------------------------------------------------------------------------------------
## import pickle in order to save the trained classifiers as pickle files
import pickle
#------------------------------------------------------------------------------------------------------------------------------------------
## import pandas for data-frame handling of the dataset
import pandas as pd
#------------------------------------------------------------------------------------------------------------------------------------------
## import the helpers for splitting the dataset into training and testing sets (with stratification).
## FIX: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20,
## and its train_test_split did not support the `stratify` keyword used below —
## the correct home of train_test_split is sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
#------------------------------------------------------------------------------------------------------------------------------------------
## load dataset.csv and separate the feature matrix from the response vector
dataset_file = 'dataset.csv'
data = pd.read_csv(dataset_file)    # read the CSV into a pandas data frame
data = data.drop('tweets', axis=1)  # raw tweet text is not a usable numeric feature
#print (data.head())
y = data.label                      # response vector: the class label column
X = data.drop('label', axis=1)      # feature matrix: every column except the label
#------------------------------------------------------------------------------------------------------------------------------------------
## split the dataset 80% training / 20% testing; stratify=y keeps the class
## proportions of the label column identical in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)
## Debug helpers — uncomment to inspect the split:
##print(X_train.head()); print(X_train.shape)
##print(X_test.head()); print(X_test.shape)
## and to verify the stratification on disk:
##y_train.to_csv("y_train.csv")
##y_test.to_csv("y_test.csv")
#------------------------------------------------------------------------------------------------------------------------------------------
## instantiate one model per algorithm under comparison
clf1 = LogisticRegression()
#clf2 = MultinomialNB()  # skipped: MultinomialNB rejects the negative feature values present in this dataset
clf3 = RandomForestClassifier()
clf4 = GradientBoostingClassifier()
clf5 = SVC()
#------------------------------------------------------------------------------------------------------------------------------------------
## fit every model on the training split (same order as instantiation)
for model in (clf1, clf3, clf4, clf5):
    model.fit(X_train, y_train)
#clf2.fit(X_train,y_train)
#------------------------------------------------------------------------------------------------------------------------------------------
## optionally persist the fitted classifiers as pickle files — uncomment to enable:
##pickle_targets = [
##    (clf1, "pickled_algos/LogisticRegression5k.pickle"),
##    #(clf2, "pickled_algos/MultinomialNB5k.pickle"),
##    (clf3, "pickled_algos/RandomForestClassifier5k.pickle"),
##    (clf4, "pickled_algos/GradientBoostingClassifier5k.pickle"),
##    (clf5, "pickled_algos/SVC5k.pickle"),
##]
##for model, path in pickle_targets:
##    with open(path, "wb") as save_classifier:
##        pickle.dump(model, save_classifier)
#------------------------------------------------------------------------------------------------------------------------------------------
## predict the labels for the held-out test observations
y_pred1 = clf1.predict(X_test)
##y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)
y_pred4 = clf4.predict(X_test)
y_pred5 = clf5.predict(X_test)
#------------------------------------------------------------------------------------------------------------------------------------------
## compare the actual response values (y_test) with each model's predictions
## on the testing set and report the accuracy of every classifier
from sklearn import metrics
model_predictions = (
    ("Logistic Regression", y_pred1),
    #("MultinomialNBClassifier", y_pred2),
    ("RandomForestClassifier", y_pred3),
    ("GradientBoostingClassifier", y_pred4),
    ("SVCClassifier", y_pred5),
)
for model_name, y_pred in model_predictions:
    print(model_name + " model accuracy:", metrics.accuracy_score(y_test, y_pred))
#print(clf1.score(X_train,y_train))  # training-set accuracy of the logistic regression classifier