# Import Useful Modules 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Select which saved model version to use. This name is substituted into the
# "model/{}/..." paths when the label encoder, embedder and network are loaded.

version = "version_2"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [4]:
from preprocessing_pipeline import preprocessing

In [5]:
# Resolve the artifact paths for the selected model version up front.
label_encoder_path = "model/{}/label_encoder.pickle".format(version)
word_embedder_path = "model/{}/word_embedder.pickle".format(version)
neural_network_path = "model/{}/neural_network.h5".format(version)

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load artifacts that come from a trusted source.
with open(label_encoder_path, "rb") as file:
    label_encoder = pickle.load(file)

with open(word_embedder_path, "rb") as file:
    word_embedder = pickle.load(file)

# The classifier is restored from its .h5 file via Keras' load_model.
neural_network = load_model(neural_network_path)

## Check the Loaded Artifacts

#### Category Encoder

In [31]:
# Sanity check: display the unpickled label encoder.
label_encoder

LabelEncoder()

#### Preprocessing

In [32]:
# Build the preprocessing pipeline around the loaded embedder; the vector
# size is read from the embedder itself rather than being hard-coded.
preprocessor = preprocessing(
    word_embedder.vector_size,
    word_embedder,
)

#### Word Embedding

In [33]:
# Sanity check: display the unpickled word embedder.
word_embedder

<gensim.models.fasttext.FastText at 0x4e9d2c77f0>

#### Classifier

In [34]:
# Sanity check: display the classifier restored by load_model.
neural_network

<keras.models.Sequential at 0x4e9f6e2ef0>

# ---------------------------------------------------------------------------------------------------------------

# Read Data Into a Pandas DataFrame

In [35]:
# Read the headerless CSV and discard every row that contains a missing value.
# Chaining dropna() yields the same frame without mutating it in place.
large_data_for_classification = pd.read_csv(
    "data/big.csv",
    header=None,
).dropna(axis=0)

In [36]:
# Preview the first rows; columns are numbered because the CSV is read with header=None.
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hardware,KINGSTON+KVR1333D3N9,1510.0
1,musik,power+amplifier+wisdom+,62.0
2,outwear-motor,jas%20hujan%20anak,391.0
3,celana,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


### Preprocess Data

In [37]:
# Column 1 is passed as the product titles and column 0 as the category names.
# Titles are embedded as 100-dimensional vectors and category names are
# encoded as integer labels.
product_titles = large_data_for_classification[1]
category_names = large_data_for_classification[0]
large_embedded_data = preprocessor.preprocess_data(
    product_titles, category_names, label_encoder
)

In [38]:
# Preview the embedded features together with their encoded labels.
large_embedded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Labels
0,-0.580157,-0.902683,-0.200629,0.426924,-0.298552,0.377382,0.334578,-1.341522,0.284654,-0.601798,...,0.714149,0.436149,0.067786,0.280465,-0.069688,0.040827,0.063288,0.042661,-0.402801,45
1,-0.151489,-1.193317,-0.412542,0.074164,-1.0538,0.283322,0.306509,-0.714775,-0.282685,-0.205421,...,1.560397,1.276572,-0.248248,0.154423,-0.181553,-0.448146,-0.126884,-0.248857,1.153092,68
2,-0.206052,-0.15816,-0.194283,0.238795,0.007292,0.196439,0.141539,-0.22354,0.046263,-0.282032,...,-0.081149,0.099055,0.030815,0.09932,-0.081225,0.031153,-0.080204,-0.028578,-0.03781,73
3,-0.053166,0.007892,-1.529572,1.45867,-0.734888,-0.503057,-0.532288,-0.022648,0.070379,-1.529616,...,1.294445,-0.273994,1.71803,0.06049,-1.941472,1.189055,-0.650072,-0.596036,0.285584,24
4,-0.162186,-0.530731,-0.356022,0.209101,-0.282881,0.324258,0.422291,-0.745394,-0.038498,-0.477472,...,0.860707,0.729105,-0.046928,0.239313,0.133619,-0.398247,0.280034,0.195425,0.424736,59


In [39]:
# Show (rows, columns) of the embedded dataset.
large_embedded_data.shape

(396099, 101)

# ---------------------------------------------------------------------------------------------------------------

### Test the Classifier on the Full Dataset

In [40]:
def predict_product_class(query,preprocessor=preprocessor,classifier=neural_network,label_encoder=label_encoder):
    """Predict the category name for a single product query.

    Parameters
    ----------
    query
        Raw product title / search query to classify.
    preprocessor
        Object providing ``remove_parentheses`` and ``vectorize_sentence``.
    classifier
        Model exposing ``predict_classes``.
    label_encoder
        Encoder whose ``inverse_transform`` maps class indices back to
        category names.

    Returns
    -------
    The decoded category predicted for ``query``.
    """
    no_parentheses = preprocessor.remove_parentheses(query)
    # Shape the sentence vector as a batch for the classifier.
    # NOTE(review): the width 100 is hard-coded and presumably has to match
    # the embedder's vector size -- confirm before loading another embedder.
    embedded_query = preprocessor.vectorize_sentence(no_parentheses).reshape(-1, 100)
    prediction = classifier.predict_classes(embedded_query)
    # Decode the whole prediction array and index afterwards:
    # inverse_transform expects an array-like of labels, and recent
    # scikit-learn releases reject a bare scalar such as prediction[0].
    class_prediction = label_encoder.inverse_transform(prediction)[0]
    return class_prediction

In [41]:
def accuracy(predicted,truth):
    """Return the fraction of predictions that equal the ground-truth labels.

    Parameters
    ----------
    predicted
        Array-like of predicted labels.
    truth
        Array-like of true labels, with the same shape as ``predicted``.

    Returns
    -------
    float
        Number of matching positions divided by the number of positions.

    Raises
    ------
    ValueError
        If the inputs have different shapes or are empty (the accuracy is
        undefined in both cases).
    """
    predicted_arr = np.asarray(predicted)
    truth_arr = np.asarray(truth)

    # Fail loudly instead of letting NumPy broadcast mismatched inputs.
    if predicted_arr.shape != truth_arr.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_arr.shape, truth_arr.shape
            )
        )
    if predicted_arr.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    # Vectorised comparison; counting the matches avoids a Python-level loop.
    matches = np.asarray(predicted_arr == truth_arr)
    return int(np.count_nonzero(matches)) / matches.size

In [42]:
# Evaluate on a copy so the embedded dataset itself is left untouched.
data = large_embedded_data.copy()
# Alias the loaded network under the name used by the evaluation cells.
model = neural_network

In [23]:
# Disabled: drawing a full-size sample without replacement only shuffles the rows.
# sampled_embedded_data=data.sample(n=len(data))

In [63]:
# Split the frame into the feature columns and the encoded target column.
features = data.drop("Labels", axis=1)
truth = data["Labels"]

# Predict a class index for every row and report the overall hit rate.
pred = model.predict_classes(features)
overall_accuracy = accuracy(pred, truth)
print("Accuracy : {}".format(overall_accuracy))

Accuracy : 0.8037409839459327


In [64]:
# Per-class precision, recall, F1 and support for the predictions.
report = classification_report(truth, pred)
print(report)

             precision    recall  f1-score   support

          0       0.75      0.81      0.78       997
          1       0.80      0.80      0.80     11550
          2       0.88      0.77      0.82     16564
          3       0.67      0.49      0.57      1147
          4       0.42      0.29      0.34        17
          5       0.75      0.80      0.77      9782
          6       0.76      0.77      0.76     13987
          7       0.70      0.64      0.67      1323
          8       0.72      0.67      0.69      1107
          9       0.76      0.75      0.76      1717
         10       0.65      0.58      0.61      1010
         11       0.77      0.74      0.75      1515
         12       0.53      0.57      0.55      1148
         13       0.83      0.78      0.81      1880
         14       0.85      0.92      0.89      1441
         15       0.74      0.79      0.76       882
         16       0.81      0.72      0.76      2046
         17       0.81      0.86      0.84   

# ---------------------------------------------------------------------------------------------------------------