# Import Useful Modules 

In [36]:
import datetime
import pickle
import time
from collections import Counter
from time import gmtime, strftime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV now lives
# in sklearn.model_selection. The legacy path is kept only as a fallback.
try:
    from sklearn.model_selection import GridSearchCV as GS
except ImportError:
    from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

In [37]:
# Name of the model-artifact directory; later cells format it into paths such
# as "model/<version>/word_embedder.pickle" and "model/<version>/label_encoder.pickle".

version = "version_2"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [38]:
from preprocessing_pipeline import preprocessing

In [39]:
# Restore the pickled word embedder that belongs to the selected model version.
# NOTE: pickle.load can execute arbitrary code, so only load trusted artifacts.
embedder_path = "model/{}/word_embedder.pickle".format(version)
with open(embedder_path, "rb") as embedder_file:
    word_embedder = pickle.load(embedder_file)

## Check

#### Preprocessing

In [40]:
# Build the preprocessing pipeline around the loaded embedder, passing the
# embedder's vector size as the first argument, then display the instance.
embedding_size = word_embedder.vector_size
preprocessor = preprocessing(embedding_size, word_embedder)
preprocessor

<preprocessing_pipeline.preprocessing at 0x71dabdb908>

#### Word Embedding

In [41]:
word_embedder

<gensim.models.fasttext.FastText at 0x71b2a9e8d0>

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

##### 30000 Data

In [42]:
# Data for product-category classification.
product_data_path = "data/product_data.csv"
data_for_classification = pd.read_csv(product_data_path)

In [43]:
data_for_classification.head()

Unnamed: 0,Product Id,Category Id,Category Name,Product Title
0,114628582,2,Desktop,PC HP Pavillion 251VGA-i5(4460) with LED-20
1,640569762,2,Desktop,Unik SanDisk Flashdisk 64GB Ultra USB 3 3 0 Fl...
2,757005547,2,Desktop,Paket Spek PC Agan Bagas 2
3,758393462,2,Desktop,PAket Spek PC Agan JP Wogo
4,757008997,2,Desktop,PC HP All In One AIO 20 C303D


##### 400000 Data

In [44]:
# Read the headerless CSV and discard every row that has a missing value.
# NOTE(review): the preview shows URL-encoded titles ("+" / "%20") — confirm
# that the preprocessing pipeline decodes them before embedding.
large_data_for_classification = pd.read_csv("data/big.csv", header=None).dropna(axis=0)

In [45]:
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hardware,KINGSTON+KVR1333D3N9,1510.0
1,musik,power+amplifier+wisdom+,62.0
2,outwear-motor,jas%20hujan%20anak,391.0
3,celana,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


### Preprocess Data

##### 30000 Data

In [46]:
# Embed every product title as a 100-dimensional vector and encode every
# category name as an integer label.
product_titles = data_for_classification["Product Title"]
category_names = data_for_classification["Category Name"]
embedded_data, label_encoder = preprocessor.preprocess_data(product_titles, category_names)

In [47]:
embedded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Labels
0,-1.353583,-3.1719,1.371736,0.123707,-0.62016,1.464381,1.22238,-4.205549,-0.133768,-2.833227,...,3.284703,2.795318,-0.21433,0.750985,-1.134297,-2.441089,-0.397881,-0.623074,-0.85346,60
1,-1.411664,-3.624061,-0.124707,1.157138,-2.769813,0.722179,1.752806,-6.023485,1.032547,-3.752555,...,5.373015,2.577363,1.723452,0.312116,1.192225,-4.973745,3.088582,0.075678,3.797503,60
2,-1.210998,-1.52154,-0.512093,1.162975,-0.206553,0.776194,1.275206,-1.973089,-0.176616,-2.115182,...,1.901303,1.018535,0.845923,0.409504,-0.487097,-1.452161,0.795725,-1.173649,1.033765,60
3,-0.62024,-1.366821,-0.797583,1.449532,-0.178919,0.559059,1.577674,-2.021882,-0.063093,-1.323352,...,1.7254,0.725442,0.841356,0.316991,-0.575546,-1.296193,0.435206,-0.329237,0.069929,60
4,-0.494332,-4.235569,0.269955,-0.220324,0.461205,1.288682,1.095711,-5.439537,1.180161,-2.108301,...,5.4325,1.697883,-0.479877,-0.55601,-1.109091,-0.888318,0.026072,0.021216,0.143521,60


In [48]:
embedded_data.shape

(30116, 101)

In [49]:
label_encoder

LabelEncoder()

##### 400000 Data

In [50]:
# Same preprocessing for the large data set: column 1 is passed as the product
# titles and column 0 as the category names.
large_titles = large_data_for_classification[1]
large_categories = large_data_for_classification[0]
large_embedded_data, large_label_encoder = preprocessor.preprocess_data(large_titles, large_categories)

In [51]:
large_embedded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Labels
0,-0.580157,-0.902683,-0.200629,0.426924,-0.298552,0.377382,0.334578,-1.341522,0.284654,-0.601798,...,0.714149,0.436149,0.067786,0.280465,-0.069688,0.040827,0.063288,0.042661,-0.402801,45
1,-0.151489,-1.193317,-0.412542,0.074164,-1.0538,0.283322,0.306509,-0.714775,-0.282685,-0.205421,...,1.560397,1.276572,-0.248248,0.154423,-0.181553,-0.448146,-0.126884,-0.248857,1.153092,68
2,-0.206052,-0.15816,-0.194283,0.238795,0.007292,0.196439,0.141539,-0.22354,0.046263,-0.282032,...,-0.081149,0.099055,0.030815,0.09932,-0.081225,0.031153,-0.080204,-0.028578,-0.03781,73
3,-0.053166,0.007892,-1.529572,1.45867,-0.734888,-0.503057,-0.532288,-0.022648,0.070379,-1.529616,...,1.294445,-0.273994,1.71803,0.06049,-1.941472,1.189055,-0.650072,-0.596036,0.285584,24
4,-0.162186,-0.530731,-0.356022,0.209101,-0.282881,0.324258,0.422291,-0.745394,-0.038498,-0.477472,...,0.860707,0.729105,-0.046928,0.239313,0.133619,-0.398247,0.280034,0.195425,0.424736,59


In [55]:
# Per-row magnitude of the embedding; the following cell keeps only rows where
# this value is non-zero. Absolute values are summed so that positive and
# negative components cannot cancel to 0 and get a non-zero row discarded.
# "sum" is excluded too (errors="ignore" when it is absent) so re-running this
# cell never folds a stale helper column into the total.
large_embedded_data["sum"] = (
    large_embedded_data
    .drop(["Labels", "sum"], axis=1, errors="ignore")
    .abs()
    .sum(axis=1)
)

In [56]:
# Keep only the rows whose helper "sum" value is non-zero, then discard the helper column.
has_nonzero_sum = large_embedded_data["sum"] != 0
large_embedded_data = large_embedded_data.loc[has_nonzero_sum].drop("sum", axis=1)

In [57]:
large_embedded_data.shape

(392417, 102)

In [53]:
large_label_encoder

LabelEncoder()

In [54]:
# Persist the label encoder returned for the large data set next to the other
# artifacts of this model version.
encoder_path = "model/{}/label_encoder.pickle".format(version)
with open(encoder_path, "wb") as encoder_file:
    pickle.dump(large_label_encoder, encoder_file)

# ---------------------------------------------------------------------------------------------------------------

### Build Neural Network

In [None]:
# Shuffle every row of the filtered large data set (frac=1 keeps all rows).
# A fixed random_state makes the shuffle reproducible across kernel restarts.
data = large_embedded_data.copy()
sampled_embedded_data = data.sample(frac=1, random_state=42)

In [None]:
# Hold out 20% of the rows for evaluation; the integer labels are one-hot
# encoded to match the categorical cross-entropy loss used by the network.
# A fixed random_state keeps the split reproducible across kernel restarts.
nn_X_train, nn_X_test, nn_y_train, nn_y_test = train_test_split(
    sampled_embedded_data.drop("Labels", axis=1),
    to_categorical(sampled_embedded_data["Labels"]),
    test_size=0.2,
    random_state=42,
)

In [None]:
def accuracy(predicted,truth):
    """Return the fraction of predictions that equal the ground truth.

    Parameters
    ----------
    predicted : array-like
        Predicted labels.
    truth : array-like
        True labels; must have the same shape as ``predicted``.

    Returns
    -------
    float
        Share of positions where both inputs agree, between 0.0 and 1.0.

    Raises
    ------
    ValueError
        If the inputs differ in shape (NumPy would otherwise broadcast the
        comparison and report a misleading score) or if they are empty.
    """
    predicted_arr = np.asarray(predicted)
    truth_arr = np.asarray(truth)

    # Refuse mismatched inputs explicitly: `==` would broadcast e.g. a
    # length-1 truth against every prediction and silently return a wrong score.
    if predicted_arr.shape != truth_arr.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_arr.shape, truth_arr.shape
            )
        )
    if predicted_arr.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    # Mean of the boolean match mask == matches / total; cast to a plain float.
    return float(np.mean(predicted_arr == truth_arr))

In [None]:
# Feed-forward classifier: two ReLU hidden layers, each followed by 10%
# dropout, and a softmax output layer. The input and output widths are read
# from the training arrays instead of being hard-coded (previously 100 and
# 110), so the network keeps matching the data when the embedding size or the
# number of one-hot label columns changes.
n_features = nn_X_train.shape[1]
n_classes = nn_y_train.shape[1]

model = Sequential()
model.add(Dense(2000, input_shape=(n_features,), activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1500, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(n_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [None]:
# Train for 15 epochs in shuffled mini-batches of 100, passing the held-out
# split as validation data.
history = model.fit(
    nn_X_train,
    nn_y_train,
    batch_size=100,
    epochs=15,
    shuffle=True,
    validation_data=(nn_X_test, nn_y_test),
)

In [None]:
# Accuracy on the held-out split: reduce each one-hot row / predicted
# probability row to its arg-max class index, then compare.
nn_y_truth = list(np.argmax(nn_y_test, axis=1))
nn_y_pred = list(np.argmax(model.predict(nn_X_test), axis=1))
held_out_accuracy = accuracy(nn_y_pred, nn_y_truth)
print("Validation Accuracy : {}".format(held_out_accuracy))

In [None]:
# Accuracy on the training split, computed the same way. Note that this
# rebinds nn_y_truth / nn_y_pred to the training values.
nn_y_truth = list(np.argmax(nn_y_train, axis=1))
nn_y_pred = list(np.argmax(model.predict(nn_X_train), axis=1))
training_accuracy = accuracy(nn_y_pred, nn_y_truth)
print("Train Accuracy : {}".format(training_accuracy))

In [None]:
# Write the trained network to the relative path "model.h5".
# NOTE(review): the other artifacts are read from / written to model/<version>/ —
# confirm that saving outside that directory is intended.
model.save(filepath="model.h5")

# ---------------------------------------------------------------------------------------------------------------