# Import Useful Modules 

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS

import keras
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

from preprocessing_pipeline import preprocessing

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Model save location: every artifact of this run is written under model/<version>/.
# Change the version string to keep artifacts of different runs apart.
version = "version_6"

In [5]:
# Create the output directory for this model version.
# exist_ok=True makes the call idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs("model/{}".format(version), exist_ok=True)

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

In [6]:
# Data for product-category classification.
# The CSV is read with header=None, so its columns are the integers 0, 1, 2.
data_for_classification = pd.read_csv("data/query.csv", header=None)

# Data for the word embedding: keep only the first parsed column, exposed as
# "Product Title", and drop rows where it is missing.
# The frame is built in a single chain instead of calling dropna(inplace=True)
# on a column subset, which can trigger pandas' SettingWithCopyWarning.
# NOTE(review): read_fwf infers fixed-width column boundaries and only column 0
# is kept -- confirm that long titles are not split across columns.
data_for_embedding = (
    pd.read_fwf('data/products2m.txt', header=None)
    .rename(columns={0: "Product Title"})[["Product Title"]]
    .dropna(axis=0)
)

In [12]:
# Preview the first rows of the embedding corpus to sanity-check the load.
data_for_embedding.head()

Unnamed: 0,Product Title
0,"PC LENOVO IC300s-i5(4460)-VGA with LED-19,5"
1,prosessor intel core i5 4440
2,LENOVO All in One aio310-0kid
3,PC Lenovo aio510-crid
4,HP Pavilion 570-P034D Win 10 Home


In [9]:
# Preview the classification data; columns are unnamed (0, 1, 2) because the
# CSV was read with header=None.
data_for_classification.head()

Unnamed: 0,0,1,2
0,hunting,gear+camping,327
1,koleksi,Lightstick+EXO,63
2,kartu-perdana,vivo+v7,183
3,aksesoris-mobil,subwoofer+kolong+aktif,389
4,anak-perempuan,Hotpants+anak,1366


In [15]:
# Number of rows in the classification data.
len(data_for_classification)

2440682

# ---------------------------------------------------------------------------------------------------------------

# Construct Word Embedder (Using fasttext)

In [15]:
# Instantiate the project's preprocessing helper; its methods are used below
# to clean the text.
# NOTE(review): the meaning of the two None arguments is not visible here --
# confirm against preprocessing_pipeline.py.
preprocessor=preprocessing(None,None)

In [16]:
# Preview the classification data again before cleaning.
# NOTE(review): the saved output of this cell shows different rows (and float
# values in column 2) than the earlier preview, which suggests the input file or
# kernel state changed between runs -- re-run the notebook top to bottom.
data_for_classification.head()

Unnamed: 0,0,1,2
0,hardware,KINGSTON+KVR1333D3N9,1510.0
1,musik,power+amplifier+wisdom+,62.0
2,outwear-motor,jas%20hujan%20anak,391.0
3,celana,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


In [19]:
# Strip unneeded characters from the data.
# Earlier variant that cleaned the embedding corpus instead, kept for reference:
# product_title=[preprocessing_pipeline.remove_parentheses(value) for value in data_for_embedding["Product Title"]]

# Drop rows with missing values (in place), then clean every entry of column 1.
# NOTE(review): column 1 holds URL-style queries ("+" / "%20" separators) --
# confirm remove_parentheses handles them.
data_for_classification.dropna(inplace=True, axis=0)
product_title = list(map(preprocessor.remove_parentheses, data_for_classification[1]))

In [20]:
# Hyperparameters of the word-embedding model.
EMBEDDING_DIMENSION = 100  # passed to FastText as `size`
EMBEDDING_EPOCH = 20       # passed to FastText as `iter`
EMBEDDING_WINDOW = 7       # passed to FastText as `window`
SEED = 2918342             # passed to FastText as `seed`

#### Train Model

In [21]:
# Build the word-embedding model, printing wall-clock start and end times.
# NOTE(review): `size` and `iter` look like gensim 3.x keyword names -- confirm
# before upgrading gensim.
print("Start Time : {}\n".format(datetime.datetime.now()))
word_embedder = FastText(
    product_title,
    size=EMBEDDING_DIMENSION,
    window=EMBEDDING_WINDOW,
    min_count=10,
    workers=4,
    sg=1,
    seed=SEED,
    min_n=5,
    iter=EMBEDDING_EPOCH,
)
print("End Time : {}".format(datetime.datetime.now()))

Start Time : 2018-07-04 20:36:27.935689

End Time : 2018-07-04 20:38:01.733099


#### Save Model

In [23]:
# Save the trained word embedder under this version's model directory.
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises.
# NOTE(security): pickle files execute code on load -- only load this artifact
# from trusted locations.
with open("model/{}/word_embedder.pickle".format(version), 'wb') as model_file:
    pickle.dump(word_embedder, model_file)

# ---------------------------------------------------------------------------------------------------------------

# Word Embedding Evaluation

#### Words With Similar Context Will Have Higher Similarity Degree

In [24]:
# Nearest neighbours of "naruto" in the embedding space (word, similarity).
word_embedder.wv.most_similar("naruto")

[('shippuden', 0.7462983727455139),
 ('akatsuki', 0.6295211911201477),
 ('anime', 0.6164800524711609),
 ('twice', 0.600074291229248),
 ('woody', 0.5965604186058044),
 ('jubah', 0.5933154821395874),
 ('brigez', 0.5873425006866455),
 ('tad', 0.5824598670005798),
 ('badge', 0.5809571146965027),
 ('ninjago', 0.5797038078308105)]

#### Product Brands End Up Close To Each Other

In [25]:
# Nearest neighbours of the brand name "honda" (word, similarity).
word_embedder.wv.most_similar("honda")

[('civic', 0.6304067373275757),
 ('cielo', 0.6278772950172424),
 ('mobilio', 0.6134516000747681),
 ('jazz', 0.6122152805328369),
 ('nuvo', 0.6055834293365479),
 ('freed', 0.6030832529067993),
 ('orisinil', 0.5964998006820679),
 ('idsi', 0.5910791754722595),
 ('vtec', 0.5893610119819641),
 ('accord', 0.5868967175483704)]

#### The Embedding Is Not Strongly Affected By Typos

In [26]:
# Typo probe: query the misspelling "rhonda" and inspect its neighbours.
word_embedder.wv.most_similar("rhonda")

[('honda', 0.9430763721466064),
 ('onda', 0.6621943712234497),
 ('ebbro', 0.6589542627334595),
 ('orisinil', 0.6427961587905884),
 ('cielo', 0.637367308139801),
 ('nuvo', 0.6361693143844604),
 ('civic', 0.6337378025054932),
 ('footrest', 0.6335226893424988),
 ('ahm', 0.6318982839584351),
 ('skok', 0.6311133503913879)]

In [27]:
# Typo probe: query the misspelling "hondar" and inspect its neighbours.
word_embedder.wv.most_similar("hondar")

[('honda', 0.9483532905578613),
 ('orisinil', 0.6483314633369446),
 ('cielo', 0.6374216079711914),
 ('opel', 0.6283911466598511),
 ('hrs', 0.6220570802688599),
 ('nuvo', 0.620927095413208),
 ('civic', 0.617662787437439),
 ('skok', 0.6168977618217468),
 ('ahm', 0.6105969548225403),
 ('pdk', 0.6082438230514526)]

In [28]:
# Nearest neighbours of the brand name "samsung" (word, similarity).
word_embedder.wv.most_similar("samsung")

[('samsu', 0.7955980896949768),
 ('duos', 0.7017251253128052),
 ('galaxy', 0.6823732852935791),
 ('galaxi', 0.6497334241867065),
 ('bootloop', 0.6482325196266174),
 ('jace', 0.6337250471115112),
 ('jh', 0.6240614652633667),
 ('otterbox', 0.6198964715003967),
 ('gh', 0.6152838468551636),
 ('uag', 0.6117679476737976)]

# ---------------------------------------------------------------------------------------------------------------

# Embed Product Title To Vector Space

In [None]:
# Embed each query into the vector space and encode the category labels.
# data_for_classification was read with header=None, so it has integer columns:
# 0 holds the category and 1 the query text. It has no "Product Title" or
# "Category Name" columns, so the series are selected by position and renamed
# to the names the rest of the notebook expects.
# NOTE(review): the name `preprocessing_pipeline` is never bound in this notebook
# (only the `preprocessing` class is imported). This assumes `preprocess_data` is
# a method of that class, like `remove_parentheses` -- confirm against
# preprocessing_pipeline.py.
embedded_data, label_encoder = preprocessor.preprocess_data(
    data_for_classification[1].rename("Product Title"),
    data_for_classification[0].rename("Category Name"),
    EMBEDDING_DIMENSION,
    word_embedder,
)

In [None]:
# Preview the first rows of the embedded data.
embedded_data.head()

# ---------------------------------------------------------------------------------------------------------------

# Using Logistic Regression

In [None]:
# Shuffle all rows (frac=1 draws every row once). random_state=SEED makes the
# shuffle reproducible across kernel restarts.
sampled_embedded_data = embedded_data.sample(frac=1, random_state=SEED)

In [None]:
# 75/25 train/test split for logistic regression: features are every column
# except "Category Name", which is the target.
# random_state=SEED keeps the split, and so the reported accuracy, reproducible.
lr_X_train, lr_X_test, lr_y_train, lr_y_test = train_test_split(
    sampled_embedded_data.drop("Category Name", axis=1),
    sampled_embedded_data["Category Name"],
    test_size=0.25,
    random_state=SEED,
)

#### Train Model

In [None]:
# Fit the first-stage multinomial logistic regression and report hold-out accuracy.
print("First Logistic Regression\n")
print("Start Time : {}\n".format(datetime.datetime.now()))
classifier_lr = LR(
    solver="newton-cg",
    multi_class="multinomial",
    n_jobs=-1,
    warm_start=True,
)
classifier_lr.fit(lr_X_train, lr_y_train)
# The per-class report is stored but not displayed in this cell.
report_lr = classification_report(lr_y_test, classifier_lr.predict(lr_X_test))
print("Finish Time : {}\n".format(datetime.datetime.now()))
print("Accuracy : {}".format(classifier_lr.score(lr_X_test, lr_y_test)))

#### Save Model

In [None]:
# Save the first-stage logistic regression under this version's model directory.
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises.
with open("model/{}/classifier_lr.pickle".format(version), 'wb') as model_file:
    pickle.dump(classifier_lr, model_file)

#### Feed Probability Prediction To Another Logistic Regression

In [None]:
# Second-stage features are the first-stage class probabilities.
# Reuse the first-stage train/test split rather than re-splitting probabilities
# computed over all rows: a fresh random split would put rows that classifier_lr
# was trained on into the second-stage test set and inflate its accuracy.
# NOTE(review): the training features are still in-sample predictions; for an
# unbiased stack, out-of-fold predictions would be preferable.
lr_second_X_train = classifier_lr.predict_proba(lr_X_train)
lr_second_X_test = classifier_lr.predict_proba(lr_X_test)
lr_second_y_train = lr_y_train
lr_second_y_test = lr_y_test

In [None]:
# Fit the second-stage logistic regression on the first-stage probabilities and
# report hold-out accuracy.
print("Second Logistic Regression\n")
print("Start Time : {}\n".format(datetime.datetime.now()))
second_classifier_lr = LR(n_jobs=-1)
second_classifier_lr.fit(lr_second_X_train, lr_second_y_train)
# The per-class report is stored but not displayed in this cell.
second_report_lr = classification_report(
    lr_second_y_test,
    second_classifier_lr.predict(lr_second_X_test),
)
print("Finish Time : {}\n".format(datetime.datetime.now()))
print("Accuracy : {}".format(second_classifier_lr.score(lr_second_X_test, lr_second_y_test)))

#### Save Second Model

In [None]:
# Save the second-stage logistic regression under this version's model directory.
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises.
with open("model/{}/second_classifier_lr.pickle".format(version), 'wb') as model_file:
    pickle.dump(second_classifier_lr, model_file)

# ---------------------------------------------------------------------------------------------------------------

# Using Support Vector Machine

In [None]:
# Shuffle all rows for the SVM experiments (frac=1 draws every row once).
# random_state=SEED makes the shuffle reproducible across kernel restarts.
sampled_embedded_data = embedded_data.sample(frac=1, random_state=SEED)

In [None]:
# 75/25 train/test split for the SVM: features are every column except
# "Category Name", which is the target.
# random_state=SEED keeps the split, and so the reported accuracy, reproducible.
svm_X_train, svm_X_test, svm_y_train, svm_y_test = train_test_split(
    sampled_embedded_data.drop("Category Name", axis=1),
    sampled_embedded_data["Category Name"],
    test_size=0.25,
    random_state=SEED,
)

#### Train Model With Optimization

In [None]:
# Grid-search SVC hyperparameters and report hold-out accuracy of the best model.
# NOTE(review): `GS` is imported from `sklearn.grid_search`, which was removed in
# scikit-learn 0.20; `sklearn.model_selection.GridSearchCV` is its replacement.
print("Start Time : {}\n".format(str(datetime.datetime.now())))
# One sub-grid per kernel: `gamma` has no effect on the linear kernel, so crossing
# it with "linear" would only refit identical models (20 candidates, not 32).
param_grid = [
    {"kernel": ["rbf"], "C": [1, 10, 100, 1000], "gamma": [0.01, 0.1, 1, 10]},
    {"kernel": ["linear"], "C": [1, 10, 100, 1000]},
]
classifier_svm = SVC(verbose=0)
optimizer_svm = GS(classifier_svm, param_grid, scoring="accuracy", verbose=4)
optimizer_svm.fit(svm_X_train, svm_y_train)
# The per-class report is stored but not displayed in this cell.
report_svm = classification_report(svm_y_test, optimizer_svm.predict(svm_X_test))
print("Finish Time : {}\n".format(str(datetime.datetime.now())))
print("Accuracy : {}".format(optimizer_svm.score(svm_X_test, svm_y_test)))

In [None]:
# Show the hyperparameter combination selected by the grid search.
print("Best Hyperparameters")
optimizer_svm.best_params_

#### Train Model No Optimization

In [None]:
# Fit a single SVC with fixed hyperparameters (no search) and report hold-out accuracy.
# Note: this rebinds `classifier_svm`.
print("Start Time : {}\n".format(datetime.datetime.now()))

classifier_svm = SVC(
    C=10,
    gamma=0.01,
    kernel='rbf',
)
classifier_svm.fit(svm_X_train, svm_y_train)
# The per-class report is stored but not displayed in this cell.
report_svm = classification_report(svm_y_test, classifier_svm.predict(svm_X_test))
print("Finish Time : {}\n".format(datetime.datetime.now()))
print("Accuracy : {}".format(classifier_svm.score(svm_X_test, svm_y_test)))

#### Save Model

In [None]:
# Save the SVM classifier under this version's model directory.
# The context manager guarantees the file handle is flushed and closed, even if
# pickling raises.
with open("model/{}/classifier_svm.pickle".format(version), 'wb') as model_file:
    pickle.dump(classifier_svm, model_file)

# ---------------------------------------------------------------------------------------------------------------