In [1]:
import string
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from imblearn.metrics import classification_report_imbalanced
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain

In [2]:
df = pd.read_csv('DataMerged.csv', sep=',',index_col='ID')
print(df.shape)
df.head()

(1690, 23)


Unnamed: 0_level_0,comment_content,Giao hang,Dong goi,Tieng click chuot,Phu kien,Gia ban,Thiet ke,Chat luong san pham,Pin,Cap ket noi,...,Nut power,Con lan,Nut ben trai,Nut ben phai,Do nhay,Phi ship,Ket noi,Tem bao hanh,Den,Che do sac
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,"Chuột nhạy, xài mượt, êm. Giá cũng OK. Sản phẩ...",1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Thấy có vài bạn nói bị vênh này kia nhưng mình...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Khi mới nhận được sản phẩm thì chuột đã có dấu...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,This is the best cheap wireless mouse I've eve...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TIKI cho hỏi chuột bị 1 vạch gờ lên do hoàn th...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df.drop_duplicates(subset ="comment_content", inplace = True)
df['comment_content'].replace('', np.nan, inplace=True)
df.dropna(subset=['comment_content'], inplace=True)
print(df.shape)

(1323, 23)


In [4]:
df.mean()

Giao hang              0.242630
Dong goi               0.201058
Tieng click chuot      0.064248
Phu kien               0.021164
Gia ban                0.099017
Thiet ke               0.226757
Chat luong san pham    0.703704
Pin                    0.067271
Cap ket noi            0.006803
Cham soc khach hang    0.085412
Bao hanh               0.002268
Bluetooth              0.003779
Nut power              0.012094
Con lan                0.030234
Nut ben trai           0.008314
Nut ben phai           0.006047
Do nhay                0.191988
Phi ship               0.005291
Ket noi                0.043084
Tem bao hanh           0.034769
Den                    0.024943
Che do sac             0.000756
dtype: float64

In [5]:

#Convert all to lower case
df.comment_content = df.comment_content.apply(lambda x: x.lower())
#Remove punctuation, number, Emojify
df.comment_content = df.comment_content.str.replace(r"""[^a-z\s
                                    \à\á\ạ\ả\ã\â\ầ\ấ\ậ\ẩ\ẫ\ă\ằ\ắ\ặ\ẳ\ẵ
                                    \è\é\ẹ\ẻ\ẽ\ê\ề\ế\ệ\ể\ễ
                                    \ì\í\ị\ỉ\ĩ
                                    \ò\ó\ọ\ỏ\õ\ô\ồ\ố\ộ\ổ\ỗ\ơ\ờ\ớ\ợ\ở\ỡ
                                    \ù\ú\ụ\ủ\ũ\ư\ừ\ứ\ự\ử\ữ
                                    \ỳ\ý\ỵ\ỷ\ỹ
                                    \đ]""", '')
df.head()

Unnamed: 0_level_0,comment_content,Giao hang,Dong goi,Tieng click chuot,Phu kien,Gia ban,Thiet ke,Chat luong san pham,Pin,Cap ket noi,...,Nut power,Con lan,Nut ben trai,Nut ben phai,Do nhay,Phi ship,Ket noi,Tem bao hanh,Den,Che do sac
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,chuột nhạy xài mượt êm giá cũng ok sản phẩm gi...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,thấy có vài bạn nói bị vênh này kia nhưng mình...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,khi mới nhận được sản phẩm thì chuột đã có dấu...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,this is the best cheap wireless mouse ive ever...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tiki cho hỏi chuột bị vạch gờ lên do hoàn thi...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
patterns = {
    '[àáảãạăắằẵặẳâầấậẫẩ]': 'a',
    '[đ]': 'd',
    '[èéẻẽẹêềếểễệ]': 'e',
    '[ìíỉĩị]': 'i',
    '[òóỏõọôồốổỗộơờớởỡợ]': 'o',
    '[ùúủũụưừứửữự]': 'u',
    '[ỳýỷỹỵ]': 'y'
}

def toASCII(text):
    output = text
    for regex, replace in patterns.items():
        output = re.sub(regex, replace, output)
    return output

df_ascii = df.copy()
df_ascii.comment_content = df_ascii.comment_content.apply(lambda x: toASCII(x))
df_ascii.head()

Unnamed: 0_level_0,comment_content,Giao hang,Dong goi,Tieng click chuot,Phu kien,Gia ban,Thiet ke,Chat luong san pham,Pin,Cap ket noi,...,Nut power,Con lan,Nut ben trai,Nut ben phai,Do nhay,Phi ship,Ket noi,Tem bao hanh,Den,Che do sac
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,chuot nhay xai muot em gia cung ok san pham gi...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,thay co vai ban noi bi venh nay kia nhung minh...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,khi moi nhan duoc san pham thi chuot da co dau...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,this is the best cheap wireless mouse ive ever...,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tiki cho hoi chuot bi vach go len do hoan thi...,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
frames = [df, df_ascii]
df = pd.concat(frames)
print(df.shape)

(2646, 23)


In [8]:
target = ["Giao hang","Dong goi","Tieng click chuot","Phu kien","Gia ban","Thiet ke","Chat luong san pham","Pin",
          "Cap ket noi","Cham soc khach hang","Bao hanh","Bluetooth","Nut power","Con lan","Nut ben trai",
          "Nut ben phai","Do nhay","Phi ship","Ket noi","Tem bao hanh","Den","Che do sac"]
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3))
vectorizer.fit(df['comment_content'])
for i in range(len(target)):
    X_train, X_test, y_train, y_test = train_test_split(df['comment_content'], df[target[i]], test_size=0.25,
                                                        stratify=df[target[i]], random_state=123456)
    vectors_train = vectorizer.transform(X_train)
    vectors_test = vectorizer.transform(X_test)
    #clf = GaussianNB()
    clf = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators=10, random_state=123456)
    clf.fit(vectors_train.toarray(), y_train)
    predicted = clf.predict(vectors_test.toarray())
    accuracy = metrics.f1_score(y_test, predicted)
    print(target[i],': ',accuracy)

Giao hang :  0.8965517241379312
Dong goi :  0.9398496240601504
Tieng click chuot :  0.9285714285714286
Phu kien :  0.6923076923076924
Gia ban :  0.8405797101449276
Thiet ke :  0.8206896551724138
Chat luong san pham :  0.9237288135593221
Pin :  0.9069767441860465
Cap ket noi :  0.5
Cham soc khach hang :  0.9166666666666667
Bao hanh :  1.0
Bluetooth :  0.5
Nut power :  1.0
Con lan :  0.888888888888889
Nut ben trai :  0.5
Nut ben phai :  1.0
Do nhay :  0.9119999999999999
Phi ship :  0.8
Ket noi :  0.8363636363636363
Tem bao hanh :  0.9333333333333332
Den :  0.6428571428571428
Che do sac :  1.0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(df['comment_content'], df[target[:]],
                                                    stratify=df[target[:]], test_size=0.25, random_state=123456)
vectors_train = vectorizer.transform(X_train)
vectors_test = vectorizer.transform(X_test)
clf = BinaryRelevance(AdaBoostClassifier(GaussianNB(),n_estimators=10, random_state=123456))
clf.fit(vectors_train, y_train)
predicted = clf.predict(vectors_test)
print('hamming loss: ',metrics.hamming_loss(y_test, predicted))
print('jaccard similarity_ core: ', metrics.jaccard_similarity_score(y_test, predicted))
print('micro averaging F1-Score: ', metrics.f1_score(y_test, predicted, average="micro"))
print('macro averaging F1-Score: ', metrics.f1_score(y_test, predicted, average="macro"))

hamming loss:  0.04929964295523208
jaccard similarity_ core:  0.5110098067424351
micro averaging F1-Score:  0.7008333333333332
macro averaging F1-Score:  0.775270425763305


  'precision', 'predicted', average, warn_for)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['comment_content'], df[target[:]],
                                                    stratify=df[target[:]], test_size=0.25, random_state=123456)
vectors_train = vectorizer.transform(X_train)
vectors_test = vectorizer.transform(X_test)
clf = BinaryRelevance(AdaBoostClassifier(DecisionTreeClassifier(),n_estimators=10, random_state=123456))
clf.fit(vectors_train, y_train)
predicted = clf.predict(vectors_test)
print('hamming loss: ',metrics.hamming_loss(y_test, predicted))
print('jaccard similarity_ core: ', metrics.jaccard_similarity_score(y_test, predicted))
print('micro averaging F1-Score: ', metrics.f1_score(y_test, predicted, average="micro"))
print('macro averaging F1-Score: ', metrics.f1_score(y_test, predicted, average="macro"))

hamming loss:  0.01249656687723153
jaccard similarity_ core:  0.8838368580060423
micro averaging F1-Score:  0.9334308705193854
macro averaging F1-Score:  0.9275002374953839
