# Import Useful Modules 

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical
from keras.optimizers import SGD

In [2]:
# Model version tag. It is interpolated into the "model/{version}/..." paths
# used when loading and saving the pickled artifacts in this notebook.

version = "version_4"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [3]:
from preprocessing_pipeline import preprocessing

In [4]:
# Load the word embedder stored for the selected model version.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts that were produced by a trusted run of this project.
embedder_path = "model/{}/word_embedder.pickle".format(version)
with open(embedder_path, "rb") as embedder_file:
    word_embedder = pickle.load(embedder_file)

## Check

#### Preprocessing

In [5]:
# Build the preprocessing helper from the embedder and its vector size,
# then display it to confirm construction succeeded.
embedding_dim = word_embedder.vector_size
preprocessor = preprocessing(embedding_dim, word_embedder)
preprocessor

<preprocessing_pipeline.preprocessing at 0xee69d3d128>

#### Word Embedding

In [6]:
# Display the unpickled embedder's repr to confirm its type.
word_embedder

<gensim.models.fasttext.FastText at 0xee69d3d2e8>

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

##### Small dataset (about 30,000 rows)

In [7]:
# Data for product-category classification.
data_for_classification=pd.read_csv("data/product_data.csv")

In [8]:
# Preview the first rows to sanity-check the parsed columns.
data_for_classification.head()

Unnamed: 0,Product Id,Category Id,Category Name,Product Title
0,114628582,2,Desktop,PC HP Pavillion 251VGA-i5(4460) with LED-20
1,640569762,2,Desktop,Unik SanDisk Flashdisk 64GB Ultra USB 3 3 0 Fl...
2,757005547,2,Desktop,Paket Spek PC Agan Bagas 2
3,758393462,2,Desktop,PAket Spek PC Agan JP Wogo
4,757008997,2,Desktop,PC HP All In One AIO 20 C303D


##### Large dataset (about 400,000 rows)

In [9]:
# Read the headerless large dataset (columns are therefore named 0, 1, 2)
# and discard every row that contains a missing value.
large_data_for_classification = (
    pd.read_csv("data/big.csv", header=None)
    .dropna(axis=0)
)

In [10]:
# Preview the first rows of the large dataset after dropping incomplete rows.
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hardware,KINGSTON+KVR1333D3N9,1510.0
1,musik,power+amplifier+wisdom+,62.0
2,outwear-motor,jas%20hujan%20anak,391.0
3,celana,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


### Preprocess Data

In [11]:
# Turn each text entry (column 1) into a 100-dimensional embedding vector and
# each category name (column 0) into an integer label. The helper returns the
# embedded frame together with the label encoder it used.
text_column = large_data_for_classification[1]
category_column = large_data_for_classification[0]
large_embedded_data, large_label_encoder = preprocessor.preprocess_data(
    text_column, category_column
)

In [12]:
# Preview the embedded feature columns and the integer "Labels" column.
large_embedded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Labels
0,0.275295,-0.438877,0.347446,0.420617,-0.655826,-0.893736,-0.268864,0.091967,0.253759,-0.407018,...,0.060328,0.121378,-0.236838,1.079665,-0.457356,-0.169778,-0.031531,-0.533224,-0.596936,45
1,0.225148,1.786125,0.616444,-0.040366,-0.316455,0.551794,-0.654612,-0.795263,1.905773,0.121287,...,-2.228765,-0.147218,0.039434,-0.156134,-0.311683,0.178662,0.349768,-0.69628,0.755887,68
2,0.227633,-0.858571,-0.520574,0.164235,-2.518956,-0.054563,1.125197,1.010731,0.801415,1.16337,...,-3.043469,2.140666,-2.564406,2.357174,1.715434,-0.115448,-2.115783,-2.267262,-1.78494,73
3,1.103322,-0.371176,1.602983,0.019839,0.338218,-2.489933,-1.299406,1.60262,1.629507,1.238133,...,-1.477074,-2.45807,-1.160693,1.680575,0.541392,1.17359,-2.529253,-1.546401,1.537661,24
4,-0.096464,-0.166233,0.577785,0.399365,-0.718592,-0.854424,0.374337,-0.318422,-0.018439,0.245501,...,-0.487409,0.230962,-0.333771,0.158394,0.055914,0.179707,0.396101,-0.48504,-0.054236,59


In [13]:
# Helper column: per-row sum of every column except "Labels"; the next cell
# uses it to filter rows.
non_label_columns = large_embedded_data.drop(["Labels"], axis=1)
large_embedded_data["sum"] = non_label_columns.sum(axis=1)

In [14]:
# Keep only rows whose embedding has at least one non-zero component, and
# remove the helper "sum" column when it is present.
# Checking each component (rather than `sum != 0`) avoids discarding a valid
# vector whose positive and negative components happen to cancel out.
# NaNs are treated as zero, matching how DataFrame.sum skips them.
# errors="ignore" keeps the cell re-runnable once "sum" has been dropped.
# NOTE(review): all-zero rows are presumably entries the embedder could not
# represent -- confirm against preprocessing_pipeline.
embedding_values = large_embedded_data.drop(["Labels", "sum"], axis=1, errors="ignore")
has_signal = embedding_values.fillna(0).ne(0).any(axis=1)
large_embedded_data = large_embedded_data.loc[has_signal].drop("sum", axis=1, errors="ignore")

In [15]:
# Row/column counts of the embedded frame after filtering.
large_embedded_data.shape

(392527, 101)

In [16]:
# Display the label encoder returned by preprocess_data.
large_label_encoder

LabelEncoder()

In [54]:
# Persist the label encoder next to the other artifacts of this model version
# so integer predictions can be mapped back to category names later.
encoder_path = "model/{}/label_encoder.pickle".format(version)
with open(encoder_path, "wb") as encoder_file:
    pickle.dump(large_label_encoder, encoder_file)

# ---------------------------------------------------------------------------------------------------------------

### Build Neural Network

In [27]:
# Work on a copy so the filtered frame stays untouched, then shuffle all rows.
# A fixed random_state makes the row order identical on every
# Restart & Run All, so downstream metrics are comparable between runs.
data = large_embedded_data.copy()
sampled_embedded_data = data.sample(n=len(data), random_state=42)

In [28]:
# Hold out 2% of the rows; labels are one-hot encoded for the softmax output.
# random_state pins the split's own shuffle so the same rows are held out on
# every run. Full reproducibility also requires the input frame's row order
# to be deterministic.
nn_X_train, nn_X_test, nn_y_train, nn_y_test = train_test_split(
    sampled_embedded_data.drop("Labels", axis=1),
    to_categorical(sampled_embedded_data["Labels"]),
    test_size=0.02,
    random_state=42,
)

In [29]:
def accuracy(predicted,truth):
    """Return the fraction of positions where ``predicted`` equals ``truth``.

    Parameters
    ----------
    predicted, truth : array-like
        Sequences of labels with identical shape.

    Returns
    -------
    float
        Number of matching positions divided by the total number of positions.

    Raises
    ------
    ValueError
        If the inputs have different shapes (which would otherwise broadcast
        silently and yield a misleading score) or if they are empty.
    """
    predicted_arr = np.asarray(predicted)
    truth_arr = np.asarray(truth)
    # Refuse to broadcast: comparing e.g. one prediction against many truths
    # would otherwise "succeed" and report a meaningless accuracy.
    if predicted_arr.shape != truth_arr.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_arr.shape, truth_arr.shape
            )
        )
    if predicted_arr.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    matches = predicted_arr == truth_arr
    # Integer count / integer size reproduces the original sum/len arithmetic.
    return int(np.count_nonzero(matches)) / matches.size

In [94]:
# Fully connected classifier: four ReLU hidden layers followed by a softmax
# over the categories.
# The input and output widths are read from the training arrays instead of
# being hard-coded, so the network stays consistent if the embedding size or
# the number of categories changes.
n_features = nn_X_train.shape[1]
n_classes = nn_y_train.shape[1]

model = Sequential()
model.add(Dense(2000, input_shape=(n_features,), activation='relu'))
model.add(Dense(1500, activation='relu'))
model.add(Dense(1000, activation='relu'))
model.add(Dense(750, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))

# categorical_crossentropy matches the one-hot label encoding.
model.compile(optimizer="Adagrad", loss='categorical_crossentropy',metrics=['accuracy'])

In [95]:
# Train for up to 30 epochs in mini-batches of 100, reporting loss/accuracy on
# the held-out split after each epoch.
validation_pair = (nn_X_test, nn_y_test)
history = model.fit(
    nn_X_train,
    nn_y_train,
    batch_size=100,
    epochs=30,
    shuffle=True,
    validation_data=validation_pair,
)

Train on 384676 samples, validate on 7851 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
  2800/384676 [..............................] - ETA: 9:27 - loss: 0.3866 - acc: 0.8514

KeyboardInterrupt: 

In [96]:
# Save the trained model to an HDF5 file in the working directory.
# NOTE(review): unlike the pickled artifacts, this path is not under
# "model/{version}/" -- confirm that is intentional. The numbers in the
# filename presumably record train/validation accuracy -- TODO confirm.
model.save("model_84_77.75.h5")

In [97]:
# NOTE(review): exploratory introspection of the model's attributes. The output
# is long and no later visible cell uses it -- consider removing before sharing.
dir(model)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_inbound_node',
 '_built',
 '_check_num_samples',
 '_check_trainable_weights_consistency',
 '_container_nodes',
 '_feed_input_names',
 '_feed_inputs',
 '_fit_loop',
 '_flattened_layers',
 '_gather_list_attr',
 '_get_node_attribute_at_index',
 '_inbound_nodes',
 '_initial_weights',
 '_make_predict_function',
 '_make_test_function',
 '_make_train_function',
 '_node_key',
 '_nodes_by_depth',
 '_outbound_nodes',
 '_output_mask_cache',
 '_output_shape_cache',
 '_output_tensor_cache',
 '_predict_loop',
 '_standardize_user_data',
 '_test_loop',
 '_trainable',
 '_updated_config',
 'add',
 'add_loss',
 'add_updat

# ---------------------------------------------------------------------------------------------------------------

In [99]:
# Accuracy on the held-out split. One-hot rows and predicted probability rows
# are converted to class indices with a single vectorised argmax instead of a
# per-row Python loop.
nn_y_truth = np.argmax(nn_y_test, axis=1)
nn_y_pred = np.argmax(model.predict(nn_X_test), axis=1)
print("Validation Accuracy : {}".format(accuracy(nn_y_pred,nn_y_truth)))

Validation Accuracy : 0.7750605018468985


In [101]:
# Accuracy on the training split, using a single vectorised argmax instead of
# a per-row Python loop over the large training set.
# Note: this rebinds nn_y_truth / nn_y_pred, so any later cell that reads them
# sees training-split values.
nn_y_truth = np.argmax(nn_y_train, axis=1)
nn_y_pred = np.argmax(model.predict(nn_X_train), axis=1)
print("Train Accuracy : {}".format(accuracy(nn_y_pred,nn_y_truth)))

Train Accuracy : 0.8519455333839387


In [102]:
# Per-class precision / recall / F1. nn_y_truth and nn_y_pred hold whichever
# split was evaluated last; in notebook order that is the training split, so
# this report describes training performance, not held-out performance.
print(classification_report(nn_y_truth,nn_y_pred))

             precision    recall  f1-score   support

          0       0.81      0.85      0.83       976
          1       0.89      0.83      0.86     11176
          2       0.93      0.80      0.86     16169
          3       0.80      0.50      0.62      1093
          4       0.46      0.35      0.40        17
          5       0.84      0.83      0.84      9552
          6       0.84      0.84      0.84     13687
          7       0.77      0.73      0.75      1283
          8       0.74      0.83      0.78      1056
          9       0.83      0.84      0.83      1662
         10       0.70      0.68      0.69       983
         11       0.83      0.78      0.80      1466
         12       0.64      0.60      0.62      1116
         13       0.86      0.85      0.85      1810
         14       0.94      0.92      0.93      1404
         15       0.69      0.90      0.78       854
         16       0.81      0.76      0.78      1982
         17       0.86      0.83      0.84   

  'precision', 'predicted', average, warn_for)
