In [1]:
import string
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from imblearn.metrics import classification_report_imbalanced
import csv

In [2]:
df = pd.read_csv('Train.csv', sep=',')
print(df.shape)
df.head()

(751, 8)


Unnamed: 0,comment_content,Giao hang,Dong goi,Gia ban,Thiet ke,Chat luong san pham,Cham soc khach hang,Do nhay
0,sản phẩm giống mô tả và tốt hơn mong đợi chất ...,1.0,1.0,0.0,0.0,1.0,1.0,0.0
1,về tiki mua hàng trên tiki thì luôn an tâm ở ...,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,giao hàng nhanh chóng và dịch vụ tốt thông báo...,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,đóng gói hàng cẩn thận chắc chắn,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,sản phẩm giống mô tả và tốt hơn mong đợi chất ...,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [3]:
df.sum(numeric_only=True)

Giao hang              262.0
Dong goi               208.0
Gia ban                107.0
Thiet ke               240.0
Chat luong san pham    434.0
Cham soc khach hang     87.0
Do nhay                203.0
dtype: float64

In [5]:
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 
from sklearn.svm import SVC as svc 
from sklearn.metrics import make_scorer, roc_auc_score
from scipy import stats
from sklearn.pipeline import Pipeline, FeatureUnion

mdl = svc(probability = True, random_state = 1)
target = df.columns[1:]
vectorizer = TfidfVectorizer(sublinear_tf=True, strip_accents='unicode', analyzer='word', ngram_range=(1,3))
word_vectorizer = FeatureUnion([('word_vect', vectorizer)])

pipeline = Pipeline([
    ('vect', word_vectorizer),
    ('clf', mdl),
     ])

rand_list = {"vect__word_vect__ngram_range":  [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1,6),(1,7)],
             "clf__C": stats.uniform(2, 10),
             "clf__gamma": stats.uniform(0.1, 1)}

rand_search = [RandomizedSearchCV(pipeline, param_distributions = rand_list, n_iter=20, cv=5, 
                                   scoring='roc_auc', n_jobs=-1, verbose=2, random_state = i) for i in range(len(target))]

for i in range(len(rand_search)):
    rand_search[i].fit(df['comment_content'], df[target[i]]) 
    print(rand_search[i].best_params_)
    print(rand_search[i].best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   20.4s finished


{'clf__C': 2.871292997015407, 'clf__gamma': 0.12021839744032572, 'vect__word_vect__ngram_range': (1, 4)}
0.9130965634014689
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.5s finished


{'clf__C': 11.13962024579233, 'clf__gamma': 0.5572048079869882, 'vect__word_vect__ngram_range': (1, 2)}
0.9881532418430337
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.1s finished


{'clf__C': 2.820949222750248, 'clf__gamma': 0.4663424016750204, 'vect__word_vect__ngram_range': (1, 1)}
0.9542314593411677
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.7s finished


{'clf__C': 11.736482669357393, 'clf__gamma': 0.502360643558958, 'vect__word_vect__ngram_range': (1, 2)}
0.8905219622139704
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   20.3s finished


{'clf__C': 10.556209483356476, 'clf__gamma': 0.7090356014499725, 'vect__word_vect__ngram_range': (1, 2)}
0.7673745990864198
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.1s finished


{'clf__C': 2.24306561629487, 'clf__gamma': 0.30455554637995064, 'vect__word_vect__ngram_range': (1, 2)}
0.9771461873612565
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.1s finished


{'clf__C': 9.708809663078798, 'clf__gamma': 0.8647509835004671, 'vect__word_vect__ngram_range': (1, 1)}
0.9167364660811901


In [6]:
import pickle
pickle.dump(rand_search, open('finalized_model.sav', 'wb'))

In [7]:
df = pd.read_csv('Test.csv', sep=',')
print(df.shape)
df.head()

(188, 8)


Unnamed: 0,comment_content,Giao hang,Dong goi,Gia ban,Thiet ke,Chat luong san pham,Cham soc khach hang,Do nhay
0,chuột êm kết nối dể dàng với macbook qua bluet...,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,bộ phím chuột không dây rapoo p sản phẩm đẹp c...,0.0,0.0,1.0,1.0,1.0,0.0,0.0
2,chuột chuẩn hơn bên nova computer khôg bị xước...,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,tiki đóng gói kĩ giao hàng nhanh cực kỳ thông ...,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,sản phẩm giống mô tả và tốt hơn mong đợi chất ...,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [17]:
for i in range(len(rand_search)):
    y_pred = rand_search[i].predict(df['comment_content'])
    print(target[i],": roc_auc =", metrics.roc_auc_score(df[target[i]],y_pred),
         ", f1_score = ", metrics.f1_score(df[target[i]],y_pred))

Giao hang : roc_auc = 0.8605308106687688 , f1_score =  0.8349514563106796
Dong goi : roc_auc = 0.8927055702917772 , f1_score =  0.8761904761904762
Gia ban : roc_auc = 0.7820121951219512 , f1_score =  0.6122448979591836
Thiet ke : roc_auc = 0.828125 , f1_score =  0.7692307692307692
Chat luong san pham : roc_auc = 0.7292035398230088 , f1_score =  0.8083333333333333
Cham soc khach hang : roc_auc = 0.8561253561253562 , f1_score =  0.7916666666666666
Do nhay : roc_auc = 0.7220552454558465 , f1_score =  0.6046511627906976
