# Import Useful Modules 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
# GridSearchCV is imported from sklearn.model_selection (already used by this
# notebook) because the old sklearn.grid_search module no longer exists in
# current scikit-learn releases.
from sklearn.model_selection import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Select which saved model version to use: the loading cell reads its
# artifacts from the "model/<version>/" directory.

version = "version_2"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [3]:
from preprocessing_pipeline import preprocessing

In [4]:
# Load the pickled label encoder for the selected model version.
# NOTE: pickle.load can execute arbitrary code, so only load files that come
# from a trusted source.
with open("model/{}/label_encoder.pickle".format(version), "rb") as file:
    label_encoder = pickle.load(file)

# Load the pickled word embedder stored next to the label encoder.
with open("model/{}/word_embedder.pickle".format(version), "rb") as file:
    word_embedder = pickle.load(file)

# Load the saved Keras classifier from its .h5 file.
neural_network = load_model("model/{}/neural_network.h5".format(version))

## Check

#### Category Encoder

In [5]:
# Display the loaded object as a quick check that unpickling succeeded.
label_encoder

LabelEncoder()

#### Preprocessing

In [6]:
# Build the preprocessing helper from the embedder's vector size and the
# embedder itself; it is used below to clean and vectorize product titles.
preprocessor=preprocessing(word_embedder.vector_size,word_embedder)

#### Word Embedding

In [7]:
# Display the loaded embedder as a quick check that unpickling succeeded.
word_embedder

<gensim.models.fasttext.FastText at 0xe47a892f98>

#### Classifier

In [8]:
# Display the loaded classifier as a quick check that load_model succeeded.
neural_network

<keras.models.Sequential at 0xe47ccb07b8>

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

In [9]:
# Read the headerless CSV and keep only rows without missing values, in one
# chained expression so re-running the cell always starts from the file.
large_data_for_classification = (
    pd.read_csv("data/big.csv", header=None)
    .dropna(axis=0)
)

In [10]:
# Preview the first rows: column 0 is the category name, column 1 the raw
# product title / search query, column 2 a numeric value.
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hardware,KINGSTON+KVR1333D3N9,1510.0
1,musik,power+amplifier+wisdom+,62.0
2,outwear-motor,jas%20hujan%20anak,391.0
3,celana,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


### Preprocess Data

In [11]:
# Embed each product title (column 1) as a 100-dimensional vector and encode
# each category name (column 0) as an integer label using the loaded
# label_encoder. The result has one embedding column per dimension plus a
# "Labels" column.
large_embedded_data = preprocessor.preprocess_data(
    large_data_for_classification[1],
    large_data_for_classification[0],
    label_encoder
)

In [12]:
# Preview the embedded features and the integer "Labels" column.
large_embedded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Labels
0,-0.036711,-0.336128,-0.161967,0.099742,-0.092715,0.196797,0.281012,-0.571889,0.054769,-0.252326,...,0.493046,0.427457,-0.021196,0.340428,-0.027172,-0.208955,0.11806,0.153254,0.065467,45
1,-0.151489,-1.193317,-0.412542,0.074164,-1.0538,0.283322,0.306509,-0.714775,-0.282685,-0.205421,...,1.560397,1.276572,-0.248248,0.154423,-0.181553,-0.448146,-0.126884,-0.248857,1.153092,68
2,-0.236827,-0.166503,-0.234478,0.278609,-0.010111,0.248139,0.202832,-0.236111,0.063634,-0.34277,...,-0.090241,0.113574,0.08181,0.114736,-0.101849,-0.024248,-0.114954,-0.040353,-0.044465,73
3,-0.053166,0.007892,-1.529572,1.45867,-0.734888,-0.503057,-0.532288,-0.022648,0.070379,-1.529616,...,1.294445,-0.273994,1.71803,0.06049,-1.941472,1.189055,-0.650072,-0.596036,0.285584,24
4,-0.162186,-0.530731,-0.356022,0.209101,-0.282881,0.324258,0.422291,-0.745394,-0.038498,-0.477472,...,0.860707,0.729105,-0.046928,0.239313,0.133619,-0.398247,0.280034,0.195425,0.424736,59


In [13]:
# (rows, columns): the columns are the embedding dimensions plus "Labels".
large_embedded_data.shape

(396099, 101)

# ---------------------------------------------------------------------------------------------------------------

In [14]:
# Exploratory introspection: list the attributes available on the embedder.
# NOTE(review): looks like leftover debugging; consider removing this cell
# from the final notebook.
dir(word_embedder)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_check_training_sanity',
 '_clear_post_train',
 '_do_train_job',
 '_get_job_params',
 '_get_thread_working_mem',
 '_job_producer',
 '_load_dict',
 '_load_model_params',
 '_load_specials',
 '_load_vectors',
 '_log_epoch_end',
 '_log_epoch_progress',
 '_log_progress',
 '_log_train_end',
 '_raw_word_count',
 '_save_specials',
 '_set_train_params',
 '_smart_save',
 '_train_epoch',
 '_update_job_params',
 '_worker_loop',
 'accuracy',
 'alpha',
 'batch_words',
 'bucket',
 'build_vocab',
 'build_vocab_from_freq',
 'callbacks',
 'cbow_mean',
 'clear_sims',
 'compute_loss',
 '

### Test

In [15]:
def predict_product_class(query,preprocessor=preprocessor,classifier=neural_network,label_encoder=label_encoder):
    """Predict the category name for a single product query.

    Parameters
    ----------
    query
        Raw query text passed to ``preprocessor.remove_parentheses``
        (presumably a product title string; confirm against the
        preprocessing module).
    preprocessor
        Object providing ``remove_parentheses`` and ``vectorize_sentence``.
        Defaults to the notebook-level ``preprocessor`` as it exists when this
        cell is run.
    classifier
        Model providing ``predict_classes``. Defaults to ``neural_network``.
    label_encoder
        Fitted encoder used to map the predicted integer label back to a
        category name. Defaults to the notebook-level ``label_encoder``.

    Returns
    -------
    The decoded category for the first predicted row.
    """
    cleaned_query = preprocessor.remove_parentheses(query)
    sentence_vector = preprocessor.vectorize_sentence(cleaned_query)
    # Use the vector's own last dimension as the row width instead of a
    # hard-coded 100, so an embedder with a different vector size is not
    # silently split into several rows.
    embedded_query = sentence_vector.reshape(-1, sentence_vector.shape[-1])
    prediction = classifier.predict_classes(embedded_query)
    # inverse_transform expects a 1-d array of labels; pass a length-1 slice
    # rather than the bare scalar prediction[0], then unwrap the single result.
    class_prediction = label_encoder.inverse_transform(prediction[:1])[0]
    return class_prediction

In [16]:
def accuracy(predicted,truth):
    """Return the fraction of positions where ``predicted`` equals ``truth``.

    Parameters
    ----------
    predicted, truth : array-like
        Label sequences, converted with ``np.asarray``. They must have the
        same shape.

    Returns
    -------
    float
        Number of matching elements divided by the total number of elements.

    Raises
    ------
    ValueError
        If the two inputs have different shapes, or if they are empty.
    """
    predicted_labels = np.asarray(predicted)
    true_labels = np.asarray(truth)
    # Refuse to compare inputs of different shapes: NumPy would otherwise
    # broadcast (e.g. a length-1 truth against every prediction) and return a
    # score that looks valid but is not.
    if predicted_labels.shape != true_labels.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_labels.shape, true_labels.shape
            )
        )
    if predicted_labels.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")
    # Vectorized comparison and count instead of a per-element Python loop.
    matches = predicted_labels == true_labels
    return float(np.count_nonzero(matches)) / matches.size

In [17]:
# Short names for the evaluation cells below. `data` is a copy, so changes
# made to it do not affect large_embedded_data.
model=neural_network
data=large_embedded_data.copy()

In [18]:
# sampled_embedded_data=data.sample(n=len(data))

In [63]:
# truth=data["Labels"]
# pred=model.predict_classes(data.drop("Labels",axis=1))
# # print("Accuracy : {}".format(accuracy(pred,truth)))
# print(classification_report(truth,pred))

Accuracy : 0.8037409839459327


# ---------------------------------------------------------------------------------------------------------------