# Train Classifier with TFIDF

In [19]:
import csv
import jieba
import re
import pickle
import random
import numpy as np
import pprint

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from collections import defaultdict, Counter

## type dict
Grammar = {'完成式': 1, '進行式': 2, '過去式': 3, '未來式': 4, '關係代名詞': 5, '不定詞': 6, '名詞子句': 7, 
           '被動': 8, '介係詞': 9, '連接詞': 10, '假設語氣': 11, '分詞': 12, 'PT': 13, '其它': 0}

In [20]:
Grammar = {'1': '完成式', '2': '進行式', '3': '過去式', '4': '未來式', '5': '關係代名詞', '6': '不定詞', '7': '名詞子句', '8': '被動', '9': '介係詞', \
           '10': '連接詞', '11': '假設語氣', '12': '分詞', '13': 'PT', '0': '其它'}

In [21]:
with open('questions_nondup_dup.csv') as csvfile:
    data_dict = defaultdict()
    for row in csv.DictReader(csvfile):
        data_dict[row['question_id']] = row

## splitting data

In [22]:
from collections import defaultdict

class DataHelper(object):
    def __init__(self, file):
        self.file = file
        self.stopwords = ['什麼', '請問', '這裡', '不是', '意思', '這邊', '謝謝', '這句', '為何', '使用', '怎麼', '要加', '老師', '還是', '如何', '甚麼', '一下', '這個', '這樣', '問為', '因為', '何要', '用過', '是不是', '一個', '應該', '直接', '好像', '如果', '何不', '兩個', '這是', '何用', '需要', '時候', '所以', '您好', '起來', '還有', '加上', '寫成', '你好', '此句', '有點', '問此', '不好意思', '不到', '像是', '這裏', '為什麼']
        
        with open('{0}'.format(self.file)) as data_file:
            self.unamb_data = defaultdict(list)
            self.amb_data = defaultdict(list)
            for row in csv.DictReader(data_file):
                if row['ambiguous'] == '0':
                    # can't directly use row.values() as it doesn't grantee the order
                    self.unamb_data[row['type']].append([row['question_id'], row['member_id'], \
                                                         row['type'], row['question'], row['ambiguous']])
                else:
                    self.amb_data[row['type']].append([row['question_id'], row['member_id'], \
                                                         row['type'], row['question'], row['ambiguous']])
                    
    def get_type_dist(self, outfile):
        type_counter = Counter()
        for key, vals in self.unamb_data.items():
            type_counter[key] += len(vals)
          
        print('Unambiguous data:')
        pprint.pprint(type_counter)
        
        type_counter = Counter()
        for key, vals in self.amb_data.items():
            type_counter[key] += len(vals)
            
        print('Ambiguous data:')
        pprint.pprint(type_counter)
            
#         header = list(Grammar.values())
#         with open(outfile, 'w') as csvfile:
#             spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
#             for key, val in type_counter.items():
#                 spamwriter.writerow([header[header.index(Grammar[key])], val])
        
        
    def get_shuffled_data(self, ratio = 8):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        question_train = []
        question_test = []
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            random.shuffle(questions)
            split_point = len(questions)*ratio//10
            train = questions[:split_point]
            test = questions[split_point:]
            member_train += members[:split_point]
            member_test += members[split_point:]
            question_train += question_idx[:split_point]
            question_test += question_idx[split_point:]
            X_train += train
            X_test += test
            Y_train += [key]*len(train) # repeat len(train) times
            Y_test += [key]*len(test)
            
        X_train_text = self.cut_questions(X_train)
        X_test_text = self.cut_questions(X_test)
        return X_train_text, np.array(Y_train), X_test_text, np.array(Y_test), member_train, member_test, question_train, question_test
    
    # use non-duplications as training and duplications as testing
    # the file should be questions_nondup_dup.csv
    def get_fixed_data(self):
        X_train = []
        X_test = []
        Y_train = []
        Y_test = []
        member_train = []
        member_test = []
        question_train = []
        question_test = []
        
        for key, record in self.unamb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X_train += questions
            Y_train += [key]*len(questions)
            member_train += members
            question_train += question_idx
        for key, record in self.amb_data.items():
            if key == '13':
                continue
            questions = list(list(zip(*record))[3]) # get question list from records
            members = list(list(zip(*record))[1]) # get memberid list from records
            question_idx = list(list(zip(*record))[0])
            X_test += questions
            Y_test += [key]*len(questions)
            member_test += members
            question_test += question_idx
            
        X_train_text = self.cut_questions(X_train)
        X_test_text = self.cut_questions(X_test)
        return X_train_text, np.array(Y_train), X_test_text, np.array(Y_test), member_train, member_test, question_train, question_test
        
    def cut_questions(self, data):
        corpus = []
        for q in data:
            segs = jieba.cut(q, cut_all=False)
            final = [seg for seg in segs if seg not in self.stopwords]
            corpus.append(' '.join(final))
        return corpus

In [23]:
dh = DataHelper('questions_nondup_dup2.csv')

In [24]:
dh.get_type_dist('question_type_dist.csv')

Unambiguous data:
Counter({'13': 2869,
         '9': 1000,
         '10': 528,
         '12': 515,
         '5': 462,
         '3': 448,
         '8': 244,
         '1': 207,
         '2': 193,
         '6': 96,
         '7': 88,
         '11': 57,
         '4': 35,
         '0': 2})
Ambiguous data:
Counter({'5': 491,
         '1': 410,
         '2': 309,
         '8': 306,
         '3': 156,
         '6': 115,
         '9': 91,
         '7': 36,
         '10': 20,
         '11': 15})


### get shuffled data

In [67]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_shuffled_data()
print('X train shape: {}'.format(X_train.shape))
print('y train shape: {}'.format(y_train.shape))

X train shape: (3875, 10760)
y train shape: (3095,)


## extract features of text

In [68]:
class TextFeature(object):
    def __init__(self, training_data, testing_data):
        self.training_text = training_data
        self.testing_text = testing_data
        
    def get_tfidf(self, use_idf = True):
#         texts = self.training_text + self.testing_text
        self.tfidf_vectorizer = TfidfVectorizer(use_idf = True)
        self.tfidf_vectorizer.fit(self.training_text)
        X_train = self.tfidf_vectorizer.transform(self.training_text)
        X_test = None
        if self.testing_text != None:
            X_test = self.tfidf_vectorizer.transform(self.testing_text)
        return X_train, X_test
    
    def dump_vectorizer(self, outfile):
        joblib.dump(self.tfidf_vectorizer, outfile)

## get features

In [69]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()
print(X_train.shape)
print(X_test.shape)

(3095, 9621)
(780, 9621)


## Corss Validation

### Get all data and get features

In [58]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_fixed_data()

In [59]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()

# dump tfidf vectorizer
# tf.dump_vectorizer('tfidf.pkl')

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import multiprocessing as mp

### Naive Bayes corss validation

In [24]:
NB_cv = Pipeline([('cls', MultinomialNB()),])
parameters = {'cls__alpha': (0.5, 0.8, 1.0, 5, 10)}
gs_cls = GridSearchCV(NB_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X_train.todense(), y_train)



In [25]:
print('Best Paras:', gs_cls.best_params_)
y_predict = gs_cls.predict(X_train)
y_predict_prob = gs_cls.predict_proba(X_train)
infile = 'predicted/NB_unamb_predict.csv'
cat = [Grammar[item] for item in gs_cls.best_estimator_.classes_]
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_train)):
        writestring = [question_train[i], data_dict[question_train[i]]['member_id'], data_dict[question_train[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
print(metrics.classification_report(y_train, y_predict))

y_predict_prob = gs_cls.predict_proba(X_test)
infile = 'predicted/NB_amb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

Best Paras: {'cls__alpha': 0.5}
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         2
          1       1.00      0.83      0.90       207
         10       0.89      0.92      0.91       528
         11       0.92      0.19      0.32        57
         12       0.91      0.94      0.93       515
          2       0.96      0.80      0.87       193
          3       0.92      0.85      0.88       448
          4       0.00      0.00      0.00        35
          5       0.92      0.94      0.93       462
          6       1.00      0.09      0.17        96
          7       1.00      0.11      0.20        88
          8       0.98      0.50      0.66       244
          9       0.70      0.99      0.82      1000

avg / total       0.86      0.84      0.82      3875



  'precision', 'predicted', average, warn_for)


### Random Forest cross validation

In [26]:
RF_cv = Pipeline([('cls', RandomForestClassifier()),])
parameters = {'cls__n_estimators': (20, 64, 128, 256),
              'cls__max_features': ['auto', 'sqrt', 'log2']}
gs_cls = GridSearchCV(RF_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X_train.todense(), y_train)



In [27]:
print('Best Paras:', gs_cls.best_params_)
y_predict = gs_cls.predict(X_train)
y_predict_prob = gs_cls.predict_proba(X_train)

infile = 'predicted/RF_unamb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_train)):
        writestring = [question_train[i], data_dict[question_train[i]]['member_id'], data_dict[question_train[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
print(metrics.classification_report(y_train, y_predict))

y_predict_prob = gs_cls.predict_proba(X_test)
infile = 'predicted/RF_amb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

Best Paras: {'cls__n_estimators': 128, 'cls__max_features': 'sqrt'}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         2
          1       1.00      1.00      1.00       207
         10       1.00      1.00      1.00       528
         11       1.00      1.00      1.00        57
         12       1.00      1.00      1.00       515
          2       1.00      1.00      1.00       193
          3       1.00      1.00      1.00       448
          4       1.00      1.00      1.00        35
          5       1.00      1.00      1.00       462
          6       1.00      1.00      1.00        96
          7       1.00      1.00      1.00        88
          8       1.00      1.00      1.00       244
          9       1.00      1.00      1.00      1000

avg / total       1.00      1.00      1.00      3875



### SVM cross validation

In [None]:
SVM_cv = Pipeline([('cls', SVC()),])
parameters = {'cls__kernel': ('linear', 'rbf', 'sigmoid'),
              'cls__C': (0.01, 0.1, 1.0, 5, 10),
              'cls__gamma': ('auto', 0.1, 1, 10)}
gs_cls = GridSearchCV(SVM_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X_train.todense(), y_train)



## Logistic Regression corss validation

In [49]:
LR_cv = Pipeline([('cls', LogisticRegression(max_iter = 200)),])
parameters = {'cls__C': (0.1, 0.5, 1, 5, 10),
              'cls__solver': ['newton-cg', 'sag', 'lbfgs']}
gs_cls = GridSearchCV(LR_cv, param_grid = parameters, cv = 10, n_jobs = mp.cpu_count()-1)
gs_cls = gs_cls.fit(X_train.todense(), y_train)



In [50]:
print('Best Paras:', gs_cls.best_params_)
# y_predict = gs_cls.predict(X_train)
y_predict_prob = gs_cls.predict_proba(X_train)

cat = [Grammar[item] for item in gs_cls.best_estimator_.classes_]

infile = 'predicted/LR_unamb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_train)):
        writestring = [question_train[i], data_dict[question_train[i]]['member_id'], data_dict[question_train[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)
        
# print(metrics.classification_report(y_train, y_predict))

y_predict_prob = gs_cls.predict_proba(X_test)
infile = 'predicted/LR_amb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

Best Paras: {'cls__solver': 'newton-cg', 'cls__C': 10}


In [51]:
joblib.dump(gs_cls.best_estimator_, 'pkl files/LR_classifier.pkl')

['pkl files/LR_classifier.pkl']

## Naive Bayes

In [33]:
NB = MultinomialNB(alpha = 0.5)
NB.fit(X_train.todense(), y_train)

y_predict = NB.predict(X_test.todense())
print(metrics.classification_report(y_test, y_predict))

# # dump classifier
# joblib.dump(NB, 'NB_classifier.pkl')

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      0.55      0.71        42
         10       0.82      0.77      0.80       106
         11       1.00      0.08      0.15        12
         12       0.81      0.84      0.83       103
          2       0.92      0.56      0.70        39
          3       0.86      0.71      0.78        90
          4       0.00      0.00      0.00         7
          5       0.86      0.76      0.81        93
          6       1.00      0.05      0.10        20
          7       1.00      0.11      0.20        18
          8       0.92      0.24      0.39        49
          9       0.56      0.98      0.71       200

avg / total       0.79      0.72      0.69       780



  'precision', 'predicted', average, warn_for)


## Random Forest

In [34]:
RF  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=128)
RF.fit(X_train.todense(), y_train)
y_predicted = RF.predict(X_test)
print(metrics.classification_report(y_test, y_predicted))


             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      0.93      0.96        42
         10       0.96      0.94      0.95       106
         11       0.92      0.92      0.92        12
         12       0.92      0.99      0.95       103
          2       1.00      0.87      0.93        39
          3       0.93      0.93      0.93        90
          4       1.00      0.57      0.73         7
          5       0.94      0.98      0.96        93
          6       1.00      0.75      0.86        20
          7       1.00      0.67      0.80        18
          8       1.00      0.84      0.91        49
          9       0.88      0.97      0.93       200

avg / total       0.94      0.93      0.93       780



  'precision', 'predicted', average, warn_for)


## SVM

In [61]:
svc = LinearSVC(C = 1.0, max_iter = 10000)
svc = svc.fit(X = X_train.todense(), y = y_train)


y_predict = svc.predict(X = X_test)
# print(metrics.classification_report(y_test, y_predict))

cat = [Grammar[item] for item in svc.classes_]

infile = 'predicted/SVM_amb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        y_predict_prob = [0]*len(cat)
        y_predict_prob[cat.index(Grammar[y_predict[i]])] = 1
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += y_predict_prob
        spamwriter.writerow(writestring)

y_predict = svc.predict(X = X_train)
infile = 'predicted/SVM_unamb_predict.csv'
with open(infile, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_train)):
        y_predict_prob = [0]*len(cat)
        y_predict_prob[cat.index(Grammar[y_predict[i]])] = 1
        writestring = [question_train[i], data_dict[question_train[i]]['member_id'], data_dict[question_train[i]]['question']]
        writestring += y_predict_prob
        spamwriter.writerow(writestring)

In [71]:
LR = LogisticRegression(max_iter = 200, solver = 'newton-cg', C = 10)
LR = LR.fit(X = X_train.todense(), y = y_train)
y_predict = LR.predict(X = X_test)
print(metrics.classification_report(y_test, y_predict))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       1.00      0.90      0.95        42
         10       0.91      0.97      0.94       106
         11       1.00      0.75      0.86        12
         12       0.93      0.95      0.94       103
          2       0.97      0.95      0.96        39
          3       0.91      0.90      0.91        90
          4       0.80      0.57      0.67         7
          5       0.91      0.90      0.91        93
          6       1.00      0.80      0.89        20
          7       0.92      0.67      0.77        18
          8       0.98      0.88      0.92        49
          9       0.89      0.96      0.92       200

avg / total       0.92      0.92      0.92       780



  'precision', 'predicted', average, warn_for)


### get fixed data

In [61]:
X_train_text, y_train, X_test_text, y_test, member_train, member_test, question_train, question_test = dh.get_fixed_data()
print('X train shape: {}'.format(len(X_train_text)))
print('y train shape: {}'.format(y_train.shape))

X train shape: 3875
y train shape: (3875,)


In [62]:
tf = TextFeature(X_train_text, X_test_text)
X_train, X_test = tf.get_tfidf()
print(X_train.shape)
print(X_test.shape)

(3875, 10760)
(1949, 10760)


## write predicted results into file

In [65]:
NB = MultinomialNB(alpha = 1.0)
NB.fit(X_train.todense(), y_train)
y_predict_prob = NB.predict_proba(X_test.todense())
cat = [Grammar[item] for item in NB.classes_]
out_NB = 'predicted/NB_predicted.csv'
with open(out_NB, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

In [66]:
RF  = RandomForestClassifier(n_jobs=-1, max_features="sqrt", n_estimators=128)
RF.fit(X_train.todense(), y_train)
y_predict_prob = RF.predict_proba(X_test.todense())
cat = [Grammar[item] for item in RF.classes_]
out_RF = 'predicted/RF_predicted.csv'
with open(out_RF, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += list(y_predict_prob[i])
        spamwriter.writerow(writestring)

In [223]:
SVC = LinearSVC(C=1.0, max_iter=10000)
SVC = SVC.fit(X = X_train.todense(), y = y_train)
y_predict = SVC.predict(X_test.todense())
# cat = [Grammar[item] for item in SVC.classes_]
out_SVC = 'predicted/SVC_predicted.csv'
with open(out_SVC, 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')
    spamwriter.writerow(['question_id', 'member_id', 'question'] + cat)
    for i in range(len(question_test)):
        writestring = [question_test[i], data_dict[question_test[i]]['member_id'], data_dict[question_test[i]]['question']]
        writestring += [Grammar[y_predict[i]]]
        spamwriter.writerow(writestring)