-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_models.py
151 lines (123 loc) · 6.04 KB
/
train_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import argparse
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn_evaluation import plot
from gensim.models import Doc2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from lib import vectorize
def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    # NOTE(security): pickle.load can execute arbitrary code stored in the
    # file; only point this at pickles produced by this project.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _build_result_row(model_name, y_true, y_pred, binary):
    """Return one result-CSV row (a dict) of metrics for a model's predictions.

    Args:
        model_name: Value stored in the 'Model Name' column.
        y_true: Ground-truth labels of the devtest set.
        y_pred: Labels predicted by the classifier.
        binary: When true, use sklearn's default (binary) averaging and add
            'roc_auc'; otherwise use weighted averaging and omit 'roc_auc'.
    """
    if binary:
        return {
            'Model Name': model_name,
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred),
            'recall': recall_score(y_true, y_pred),
            'f1': f1_score(y_true, y_pred),
            # NOTE(review): computed from hard predictions, not scores or
            # probabilities -- confirm this is the intended ROC AUC.
            'roc_auc': roc_auc_score(y_true, y_pred),
        }
    return {
        'Model Name': model_name,
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        # ROC AUC is not available on multi class
    }


def main(args):
    """Train every classifier in the model list and record devtest metrics.

    Loads the train/devtest CSVs, vectorizes them with the method named by
    ``args.vectorize``, fits each classifier with ``class_weight='balanced'``,
    appends one metrics row per model to the CSV at ``args.result_path``,
    optionally saves a confusion-matrix image per model, and pickles the
    fitted LogisticRegression under ``model/``.

    Args:
        args: Namespace with the attributes ``train_path``, ``devtest_path``,
            ``vectorize``, ``doc2vec_title_path``, ``doc2vec_review_path``,
            ``pickle_dir``, ``result_path`` and ``confusion_matrix_dir``.

    Raises:
        ValueError: If ``args.vectorize`` is neither 'doc2vec' nor
            'count_tfidf'.
    """
    print('Preparing...')

    # Load Dataset
    train_df = pd.read_csv(args.train_path)
    devtest_df = pd.read_csv(args.devtest_path)

    if args.vectorize == 'doc2vec':
        title_doc2vec = Doc2Vec.load(args.doc2vec_title_path)
        review_doc2vec = Doc2Vec.load(args.doc2vec_review_path)
        train_X, train_y = vectorize.doc2vec_make_dataset(
            train_df, review_doc2vec, title_doc2vec)
        devtest_X, devtest_y = vectorize.doc2vec_make_dataset(
            devtest_df, review_doc2vec, title_doc2vec)
    elif args.vectorize == 'count_tfidf':
        review_count = _load_pickle(
            os.path.join(args.pickle_dir, 'review_CountVectorizer.pickle'))
        review_tfidf = _load_pickle(
            os.path.join(args.pickle_dir, 'review_TfidfTransformer.pickle'))
        title_count = _load_pickle(
            os.path.join(args.pickle_dir, 'title_CountVectorizer.pickle'))
        title_tfidf = _load_pickle(
            os.path.join(args.pickle_dir, 'title_TfidfTransformer.pickle'))
        train_X, train_y = vectorize.count_tfidf_make_dataset(
            train_df, review_count, review_tfidf, title_count, title_tfidf)
        devtest_X, devtest_y = vectorize.count_tfidf_make_dataset(
            devtest_df, review_count, review_tfidf, title_count, title_tfidf)
    else:
        raise ValueError('vectorize method must be doc2vec or count_tfidf')

    # binary or not: exactly two distinct training labels means binary
    binary = len(np.unique(train_y)) == 2

    # Init Result File
    # A bare filename has an empty directory part; os.makedirs('') would
    # raise, so only create the directory when there is one.
    result_dir = os.path.dirname(args.result_path)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    if not os.path.isfile(args.result_path):
        columns = ['Model Name', 'accuracy', 'precision', 'recall', 'f1']
        if binary:
            columns.append('roc_auc')
        pd.DataFrame(columns=columns).to_csv(args.result_path, index=False)

    # Init Confusion Matrix Directory
    if args.confusion_matrix_dir:
        os.makedirs(args.confusion_matrix_dir, exist_ok=True)

    # Model list
    clf_models = [
        DecisionTreeClassifier,
        LogisticRegression,
        Perceptron,
        RandomForestClassifier,
        LinearSVC,
    ]

    # Training
    for model in clf_models:
        clf = model(class_weight='balanced')
        clf.fit(train_X, train_y)
        pred = clf.predict(devtest_X)

        # Save Best Model (LogisticRegression is hard-coded as the one kept)
        if model is LogisticRegression:
            if binary:
                best_path = 'model/binary_best.pickle'
            else:
                best_path = 'model/multi_best.pickle'
            # Make sure the target directory exists before writing into it.
            os.makedirs(os.path.dirname(best_path), exist_ok=True)
            with open(best_path, 'wb') as f:
                pickle.dump(clf, f)

        # Save Confusion Matrix Image
        if args.confusion_matrix_dir:
            plot.confusion_matrix(devtest_y, pred)
            plt.savefig(os.path.join(args.confusion_matrix_dir,
                                     '{}.png'.format(model.__name__)))
            # Clear the current figure so the next model starts clean.
            plt.clf()

        # Save Result
        # The CSV is re-read and rewritten per model so finished results are
        # already on disk if a later model fails.
        result_df = pd.read_csv(args.result_path)
        result_df.loc[len(result_df)] = _build_result_row(
            model.__name__, devtest_y, pred, binary)
        result_df.to_csv(args.result_path, index=False)

        print('{} Done...'.format(model.__name__))
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_path', type=str, default='data/binary/train.csv',
                        help='Path of train dataset')
    parser.add_argument('--devtest_path', type=str, default='data/binary/devtest.csv',
                        help='Path of devtest dataset')
    # Restricting the choices makes argparse reject an unknown method
    # immediately, before any dataset is loaded.
    parser.add_argument('--vectorize', required=True, type=str,
                        choices=['doc2vec', 'count_tfidf'],
                        help='Vectorize method (doc2vec or count_tfidf)')
    parser.add_argument('--doc2vec_title_path', type=str, default='doc2vec/d2v_title_100',
                        help='Path of title Doc2Vec model (ignored when vectorize is not doc2vec)')
    parser.add_argument('--doc2vec_review_path', type=str, default='doc2vec/d2v_review_300',
                        help='Path of review Doc2Vec model (ignored when vectorize is not doc2vec)')
    parser.add_argument('--pickle_dir', type=str, default='pickle/',
                        help='Directory of CountVectorizer and TfidfTransformer pickles '
                             '(ignored when vectorize is not count_tfidf)')
    parser.add_argument('--result_path', type=str, required=True,
                        help='Path of result CSV file')
    # Optional: when omitted (None), no confusion-matrix images are written.
    parser.add_argument('--confusion_matrix_dir', type=str,
                        help='Directory of confusion matrix images')
    args = parser.parse_args()

    main(args)