In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import modules.eda as Detective
import modules.model as Model
import pandas as pd
import warnings
import numpy as np
import emojis
import time
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("./data/normalize_reviews.csv").fillna("")
data = data[['raw_comment', 'normalize_comment', 'emoji', 'label']]

data.head()

Unnamed: 0,raw_comment,normalize_comment,emoji,label
0,Giao hàng kh đúng cần phê bình hjjjjjhhd...,giao hàng không đúng cần phê bình,,0
1,Chất lượng sản phẩm tạm được. Giao...,chất lượng sản phẩm tạm được giao ...,,0
2,Ko có lắc tay như hình,không có lắc tay như hình,,0
3,Giao hàng lâu. Bảo có lắc tay mà k thâ...,giao hàng lâu bảo có lắc tay mà không ...,,0
4,"Mình mua 2 cái, một dùng ok. Một cái k...",mua cái một dùng ok một cái không chạ...,😢,0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data[['raw_comment', 'normalize_comment', 'emoji']], data['label'], test_size=0.2, random_state=42)

In [5]:
X_train['dash_comment'] = Model.replaceInNGrams(X_train['normalize_comment'], [2, 3, 4], 20, 99999999)

In [6]:
X_train['dash_comment']

3804                                form_không đẹp_lắm
9534     áo_rộng thật_sự nhanh chật lượng v...
318                     màu túi hơi tối do ảnh chụp
3611                  chất_liệu vải_không ổn lắm
3341     mã màu_đen hiện lên áo màu_đen còn mã...
                               ...                        
5191     bán combo ghi đôi nhưng_nhận chỉ đôi_ta...
13418           áo_đẹp from chuẩn sẽ ủng_hộ tiếp
5390     đặt_màu trắng kem giao_màu xanh đen giao...
860      đồ chơi tí hon nên dừng bán sản_phẩ...
7270     son đẹp_lắm mọi đáng đồng_tiền bát m...
Name: dash_comment, Length: 11364, dtype: object

In [7]:
X_train['dash_comment'] = Model.convertToNFC(X_train['dash_comment'])
# X_test['normalize_comment'] = Model.convertToNFC(X_test['normalize_comment'])

In [8]:
bow_vec = CountVectorizer()
bow_comments = bow_vec.fit_transform(X_train['dash_comment'])

print(len(bow_vec.get_feature_names()))
print(bow_vec.get_feature_names())
print(len(bow_comments.toarray()))
print(bow_comments.toarray())

4528
['ac', 'ace', 'ad', 'ah', 'ai', 'am', 'amazing', 'among', 'an', 'and', 'android', 'anh', 'any', 'ao', 'auto', 'av', 'ay', 'ba', 'baby', 'bad', 'badly', 'bai', 'bam', 'ban', 'bang', 'banh', 'bao', 'bao_giờ', 'bao_nhiêu', 'base', 'basic', 'bat', 'bau', 'bay', 'be', 'beautiful', 'become', 'beige', 'ben', 'beo', 'bi', 'bia', 'big', 'binh', 'bit', 'biên', 'biến', 'biếng', 'biết', 'biết_có', 'biết_là', 'biển', 'biểu', 'biệt', 'bk', 'bl', 'black', 'bling', 'blink', 'blue', 'bo', 'boat', 'bode', 'body', 'bog', 'bom', 'bon', 'bong', 'bonus', 'boring', 'bots', 'box', 'boxer', 'boxing', 'bra', 'brand', 'broken', 'brown', 'build', 'bung', 'bung_chỉ', 'but', 'buy', 'buôn', 'buốn', 'buốt', 'buồi', 'buồn', 'buổi', 'buộc', 'buộc_tóc', 'buộn', 'bye', 'bà', 'bài', 'bàn', 'bành', 'bày', 'bá', 'bác', 'bái', 'bám', 'bán', 'bán_hàng', 'bán_đắt', 'bánh', 'báo', 'bát', 'bân', 'bây', 'bã', 'bão', 'bè', 'bèo', 'bé', 'bén', 'béo', 'bét', 'bê', 'bên', 'bên_ngoài', 'bên_trong', 'bên_vận', 'bì', 'bìa', 'bình',

In [9]:
tfidf_vec = TfidfVectorizer()
tfidf_comments = tfidf_vec.fit_transform(X_train['dash_comment'])

print(len(tfidf_vec.get_feature_names()))
print(tfidf_vec.get_feature_names())
print(len(tfidf_comments.toarray()))
print(tfidf_comments.toarray())

4528
['ac', 'ace', 'ad', 'ah', 'ai', 'am', 'amazing', 'among', 'an', 'and', 'android', 'anh', 'any', 'ao', 'auto', 'av', 'ay', 'ba', 'baby', 'bad', 'badly', 'bai', 'bam', 'ban', 'bang', 'banh', 'bao', 'bao_giờ', 'bao_nhiêu', 'base', 'basic', 'bat', 'bau', 'bay', 'be', 'beautiful', 'become', 'beige', 'ben', 'beo', 'bi', 'bia', 'big', 'binh', 'bit', 'biên', 'biến', 'biếng', 'biết', 'biết_có', 'biết_là', 'biển', 'biểu', 'biệt', 'bk', 'bl', 'black', 'bling', 'blink', 'blue', 'bo', 'boat', 'bode', 'body', 'bog', 'bom', 'bon', 'bong', 'bonus', 'boring', 'bots', 'box', 'boxer', 'boxing', 'bra', 'brand', 'broken', 'brown', 'build', 'bung', 'bung_chỉ', 'but', 'buy', 'buôn', 'buốn', 'buốt', 'buồi', 'buồn', 'buổi', 'buộc', 'buộc_tóc', 'buộn', 'bye', 'bà', 'bài', 'bàn', 'bành', 'bày', 'bá', 'bác', 'bái', 'bám', 'bán', 'bán_hàng', 'bán_đắt', 'bánh', 'báo', 'bát', 'bân', 'bây', 'bã', 'bão', 'bè', 'bèo', 'bé', 'bén', 'béo', 'bét', 'bê', 'bên', 'bên_ngoài', 'bên_trong', 'bên_vận', 'bì', 'bìa', 'bình',

In [10]:
X_vectorizers = [
    ('Bag of Words', bow_comments),
    ('TF-IDF', tfidf_comments)
]

In [11]:
lst_models = [
    ('Logistic Regression - [solver: lbfgs]', LogisticRegression(solver='lbfgs')),
    ('Logistic Regression - [solver: liblinear]', LogisticRegression(solver='liblinear')),
    ('Logistic Regression - [solver: newton-cg]', LogisticRegression(solver='newton-cg')),
    ('KNN - [n_neighbors: 2]', KNeighborsClassifier(n_neighbors=2)),
    ('KNN - [n_neighbors: 3]', KNeighborsClassifier(n_neighbors=3)),
    ('SVC - [kernel: linear]', SVC(kernel='linear', random_state=42)),
    ('SVC - [kernel: poly]', SVC(kernel='poly', random_state=42)),
    ('SVC - [kernel: rbf]', SVC(kernel='rbf', random_state=42)),
    ('SVC - [kernel: sigmoid]', SVC(kernel='sigmoid', random_state=42)),
    ('Bernoulli', BernoulliNB()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('AdaBoost', AdaBoostClassifier(base_estimator=RandomForestClassifier(random_state=42), random_state=42)),
    ('XGBoost', XGBClassifier(eval_metric='mlogloss'))
]

In [12]:
def train(lst_models, X_vectorizer, y, cv):
    res_table = []
    for vec_name, X in X_vectorizer:
        for mdl_name, model in lst_models:
            cv_res = cross_validate(model, X, y, cv=cv, return_train_score=True, scoring=['accuracy', 'roc_auc'])
            res_table.append([vec_name, mdl_name,
                              cv_res['train_accuracy'].mean(),
                              cv_res['test_accuracy'].mean(),
                              np.abs(cv_res['train_accuracy'].mean() - cv_res['test_accuracy'].mean()),
                              cv_res['train_accuracy'].std(),
                              cv_res['test_accuracy'].std(),
                              cv_res['train_roc_auc'].mean(),
                              cv_res['test_roc_auc'].mean(),
                              np.abs(cv_res['train_roc_auc'].mean() - cv_res['test_roc_auc'].mean()),
                              cv_res['train_roc_auc'].std(),
                              cv_res['test_roc_auc'].std(),
                              cv_res['fit_time'].mean()
            ])
    
    res_table = pd.DataFrame(res_table, columns=['vectorizer', 'model', 'train_acc', 'test_acc', 'diff_acc',
                                                 'train_acc_std', 'test_acc_std', 'train_roc_auc', 'test_roc_auc',
                                                 'diff_roc_auc', 'train_roc_auc_std', 'test_roc_auc_std', 'fit_time'])
    res_table.sort_values(by=['test_acc', 'test_roc_auc'], ascending=False, inplace=True)
    return res_table.reset_index(drop=True)

In [13]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
res_models = train(lst_models, X_vectorizers, y_train, cv)

res_models

Unnamed: 0,vectorizer,model,train_acc,test_acc,diff_acc,train_acc_std,test_acc_std,train_roc_auc,test_roc_auc,diff_roc_auc,train_roc_auc_std,test_roc_auc_std,fit_time
0,TF-IDF,SVC - [kernel: rbf],0.976554,0.883491,0.093063,0.000511,0.005737,0.995896,0.945423,0.050472,0.0001269072,0.003912,13.232349
1,TF-IDF,SVC - [kernel: linear],0.928106,0.879707,0.048399,0.000982,0.005763,0.972801,0.942599,0.030202,0.0002546106,0.003562,5.712535
2,TF-IDF,SVC - [kernel: sigmoid],0.901238,0.87918,0.022058,0.001233,0.005814,0.955299,0.942341,0.012957,0.000418226,0.003415,9.501146
3,TF-IDF,Logistic Regression - [solver: lbfgs],0.914046,0.878563,0.035483,0.000803,0.007258,0.967489,0.943788,0.0237,0.0003192845,0.004322,0.073368
4,TF-IDF,Logistic Regression - [solver: newton-cg],0.914037,0.878563,0.035473,0.000796,0.007258,0.967488,0.943787,0.023702,0.0003194728,0.004324,0.079191
5,TF-IDF,Logistic Regression - [solver: liblinear],0.914076,0.878563,0.035512,0.000817,0.007258,0.967489,0.943786,0.023703,0.0003192405,0.004323,0.018759
6,Bag of Words,Logistic Regression - [solver: liblinear],0.938969,0.873636,0.065333,0.001201,0.009394,0.982582,0.936166,0.046416,0.0002438185,0.004672,0.092556
7,Bag of Words,Logistic Regression - [solver: lbfgs],0.938998,0.873548,0.065451,0.001227,0.009499,0.982583,0.936162,0.046421,0.0002435316,0.004669,0.216573
8,Bag of Words,Logistic Regression - [solver: newton-cg],0.938998,0.873548,0.065451,0.001227,0.009499,0.982582,0.936161,0.046421,0.0002440348,0.00467,0.203967
9,Bag of Words,SVC - [kernel: sigmoid],0.871798,0.869852,0.001946,0.001178,0.007567,0.926313,0.931164,0.004851,0.00145321,0.004744,7.845911


In [19]:
X_train['dash_comment_50'] = Model.replaceInNGrams(X_train['normalize_comment'], [2, 3, 4], 50, 99999999)

In [21]:
bow_vec_50 = CountVectorizer()
bow_comments_50 = bow_vec_50.fit_transform(X_train['dash_comment_50'])

In [22]:
tfidf_vec_50 = TfidfVectorizer()
tfidf_comments_50 = tfidf_vec_50.fit_transform(X_train['dash_comment_50'])

In [23]:
X_vectorizers_50 = [
    ('Bag of Words', bow_comments_50),
    ('TF-IDF', tfidf_comments_50)
]

In [24]:
res_models_50 = train(lst_models, X_vectorizers_50, y_train, cv)

res_models_50

In [None]:
import os
os.system("vlc ./GAOooooooo.mp3")