# Import Useful Modules 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [33]:
# Model version tag: selects the "model/<version>/" directory that the cells below
# read the word embedder from and write the label encoder to.

version = "version_5"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [34]:
from preprocessing_pipeline import preprocessing

In [36]:
# Load the pickled word embedder that belongs to the selected model version.
# NOTE(review): pickle.load can execute arbitrary code from the file - only load trusted artifacts.
# NOTE(review): the recorded output of this cell is "UnpicklingError: invalid load key, 'v'.",
# i.e. the file did not parse as a pickle (possibly a Git LFS pointer file, or a model that was
# saved through a different mechanism - confirm). The cells below still use `word_embedder`,
# so their recorded outputs depend on kernel state left over from an earlier successful run.
with open("model/{}/word_embedder_20_new.pickle".format(version), "rb") as file:
    word_embedder = pickle.load(file)

UnpicklingError: invalid load key, 'v'.

In [30]:
word_embedder.corpus_count

396099

In [25]:
word_embedder.epochs

10

In [26]:
word_embedder.vector_size

300

## Check

#### Preprocessing

In [5]:
preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
preprocessor

<preprocessing_pipeline.preprocessing at 0xd3a6ac3940>

#### Word Embedding

In [6]:
word_embedder

<gensim.models.fasttext.FastText at 0xd3a6b18710>

# ---------------------------------------------------------------------------------------------------------------

# Read Data Into a Pandas DataFrame

In [7]:
# Read the CSV (it has no header row) and discard every row containing a missing value.
large_data_for_classification = pd.read_csv("data/big.csv", header=None).dropna(axis=0)

In [8]:
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hardware,KINGSTON+KVR1333D3N9,1510.0
1,musik,power+amplifier+wisdom+,62.0
2,outwear-motor,jas%20hujan%20anak,391.0
3,celana,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


### Preprocess Data

In [None]:
def word_count(sentences):
    """Tally how often each word occurs across all tokenized sentences.

    `sentences` is an iterable of iterables of hashable words. The result is a
    plain dict mapping every word to its total number of occurrences.
    """
    print("1/1")
    tally = Counter(word for sentence in sentences for word in sentence)
    # Hand back a plain dict so missing words still raise KeyError for callers.
    return dict(tally)

In [None]:
def getFilteredData(product_title,labels,frequency, N_words, word_length):
    """Drop rare or short words from every title, then drop titles left too short.

    Parameters
    ----------
    product_title : re-iterable sequence of tokenized titles (each a list of words).
        It is traversed twice: once to count words and once to filter them.
    labels : positionally indexable; ``labels[i]`` is the label of ``product_title[i]``.
    frequency : minimum corpus-wide occurrence count a word needs in order to be kept.
    N_words : minimum number of surviving words a title needs in order to be kept.
    word_length : minimum number of characters a word needs in order to be kept.

    Returns
    -------
    (new_features, new_labels) : the filtered titles and their matching labels,
        in the original order.

    NOTE(review): relies on the `tqdm` module being in scope; the visible import
    cell does not import it - confirm.
    """
    print("1/3")
    result=word_count(product_title)
    print("2/3")
    new_product_title=[]
    for sentence in tqdm.tqdm(product_title):
        new_product_title.append(
            [word for word in sentence if result[word]>=frequency and len(word)>=word_length]
        )

    print("3/3")
    new_features=[]
    new_labels=[]
    # Wrap the list itself rather than the enumerate object: enumerate() has no
    # len(), so tqdm could not show a total or a progress bar for this pass.
    for index,title in enumerate(tqdm.tqdm(new_product_title)):
        if(len(title)>=N_words):
            new_features.append(title)
            new_labels.append(labels[index])

    return new_features,new_labels

In [None]:
def getTfIdf(new_product_title):
    """Turn tokenized titles into a TF-IDF matrix.

    Each title (a list of words) is joined with single spaces, counted with a
    CountVectorizer, and re-weighted with a TfidfTransformer (smooth_idf=False).

    Returns the tuple (tf-idf matrix, fitted CountVectorizer, fitted TfidfTransformer).
    """
    print("1/3")
    joined_titles = [" ".join(tokens) for tokens in tqdm.tqdm(new_product_title)]

    print("2/3")
    cv = CountVectorizer()
    term_counts = cv.fit_transform(joined_titles)

    print("3/3")
    tftransformer = TfidfTransformer(smooth_idf=False)
    return tftransformer.fit_transform(term_counts), cv, tftransformer

In [None]:
# Run preprocessor.remove_parentheses over every value in column 0, with a tqdm progress bar.
# NOTE(review): `data_for_embedding` is not defined in any visible cell (the frame loaded above is
# `large_data_for_classification`) and `tqdm` is not imported in the import cell - confirm before running.
product_title=[preprocessor.remove_parentheses(value) for value in tqdm.tqdm(data_for_embedding[0])]

In [None]:
# Filter the titles: a word is kept if it occurs at least 50 times overall and has at least
# 3 characters; a title is kept if at least 2 words survive. new_data is (features, labels).
# NOTE(review): `data_for_embedding` is not defined in any visible cell - confirm.
new_data=getFilteredData(product_title,list(data_for_embedding[1]),50,2,3)

In [None]:
# TF-IDF over the filtered titles; final_data is the tuple
# (tf-idf matrix, fitted CountVectorizer, fitted TfidfTransformer) returned by getTfIdf.
final_data=getTfIdf(new_data[0])

In [9]:
#preprocess product title (column 1) to an embedding vector; the embedder's vector_size
#is 300 here, so the resulting frame has 300 feature columns plus "Labels"
#and preprocess category name (column 0) to integer label
large_embedded_data, large_label_encoder = preprocessor.preprocess_data(
    large_data_for_classification[1],
    large_data_for_classification[0],
)

TOKENIZE DATA


100%|█████████████████████████████████████████████████████████████████████████| 396099/396099 [00:05<00:00, 69972.77it/s]


APPLYING FILTER


100%|████████████████████████████████████████████████████████████████████████| 396099/396099 [00:01<00:00, 358806.76it/s]
396099it [00:00, 1223053.82it/s]
100%|████████████████████████████████████████████████████████████████████████| 233661/233661 [00:00<00:00, 955088.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 2495/2495 [01:18<00:00, 31.98it/s]


ENCODING LABELS
CONVERTING SENTENCE TO VECTOR


100%|██████████████████████████████████████████████████████████████████████████| 233661/233661 [01:20<00:00, 2900.23it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 300/300 [00:29<00:00, 10.27it/s]


In [10]:
large_embedded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Labels
0,3.264496,0.20871,-0.44376,-0.907765,-2.632752,0.158368,-3.586424,-1.78591,3.898135,0.939986,...,1.582202,-0.164722,-0.756719,1.773552,1.83325,-0.583499,0.179634,-3.766484,1.977673,68
1,-0.164715,1.832589,-0.262292,-2.031944,0.842927,-0.571419,-1.047813,-2.035863,1.860049,-0.758529,...,-0.270215,0.475889,-0.13637,1.81812,1.613666,-0.545223,-0.55205,-1.585006,-0.510418,73
2,-0.335104,2.684124,0.85299,-1.026407,-3.19323,-0.30396,0.79244,-2.090835,0.898031,-2.162154,...,-1.366067,-0.246191,-0.430107,0.154076,4.087794,0.552817,-0.495441,-2.110649,0.768014,24
3,1.462855,1.509614,-0.613853,0.205316,-1.6623,1.290029,0.529779,-0.465181,1.233522,-1.301436,...,-0.3011,0.40625,-0.150427,1.064217,2.052432,-0.859622,-0.087222,-0.095722,0.115876,35
4,1.355598,1.588747,-0.781314,1.432406,-4.851846,0.319511,-1.951439,-3.628851,0.650444,0.297381,...,-3.938601,2.840376,-2.844895,0.851221,0.813232,-0.668121,0.996377,-1.334379,-1.376656,58


In [11]:
# Helper column: row-wise sum of every feature column (everything except "Labels").
# It is only used by the next cell to filter rows and is dropped there.
large_embedded_data["sum"]=large_embedded_data.drop(["Labels"],axis=1).sum(axis=1)

In [12]:
# Keep only rows whose feature sum is non-zero, then remove the helper "sum" column.
# NOTE(review): presumably meant to discard all-zero embedding vectors - confirm. A non-zero
# vector whose components cancel to exactly 0 would be discarded as well.
# NOTE(review): not re-runnable on its own - once "sum" has been dropped, running this cell
# again raises KeyError until the previous cell is executed again.
large_embedded_data=large_embedded_data.loc[large_embedded_data["sum"]!=0].drop("sum",axis=1)

In [13]:
large_embedded_data.shape

(233347, 301)

In [14]:
large_label_encoder

LabelEncoder()

In [15]:
# Persist the label encoder returned by preprocess_data into this model version's directory,
# so integer class ids can be mapped back to category names later.
# NOTE(review): assumes the directory "model/<version>/" already exists - open() does not create it.
with open("model/{}/label_encoder.pickle".format(version), "wb") as file:
    pickle.dump(large_label_encoder,file)

# ---------------------------------------------------------------------------------------------------------------

### Build Neural Network

In [16]:
# Work on a copy so the cleaned frame stays untouched, then shuffle all rows
# (sampling n=len(data) rows without replacement is a full permutation).
# A fixed random_state makes the shuffle repeatable across kernel restarts.
data=large_embedded_data.copy()
sampled_embedded_data=data.sample(n=len(data), random_state=42)

In [17]:
# Hold out 20% of the rows. Features are every column except "Labels"; the integer
# labels are one-hot encoded to match the categorical_crossentropy loss used below.
# A fixed random_state makes the split repeatable, so accuracy figures from
# different runs are comparable.
nn_X_train, nn_X_test, nn_y_train, nn_y_test = train_test_split(
    sampled_embedded_data.drop("Labels", axis=1),
    to_categorical(sampled_embedded_data["Labels"]),
    test_size=0.2,
    random_state=42,
)

In [18]:
def accuracy(predicted,truth):
    """Return the fraction of positions at which `predicted` equals `truth`.

    Parameters
    ----------
    predicted, truth : array-likes of the same shape (e.g. lists of class ids).

    Returns
    -------
    float in [0, 1]: number of matching entries divided by the number of entries.

    Raises
    ------
    ValueError
        If the inputs differ in shape (they cannot be compared element by
        element) or are empty (the ratio would be undefined).
    """
    predicted_arr = np.asarray(predicted)
    truth_arr = np.asarray(truth)
    if predicted_arr.shape != truth_arr.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_arr.shape, truth_arr.shape
            )
        )
    if predicted_arr.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")
    matches = predicted_arr == truth_arr
    # Count in C instead of building a Python list of ints; plain int / int
    # keeps the result a built-in float, as before.
    return int(np.count_nonzero(matches)) / matches.size

In [21]:
# Fully-connected classifier: two ReLU hidden layers (2000 and 1500 units), each
# followed by 10% dropout, and a softmax output layer.
# The input width and the number of output units are read from the training arrays
# instead of being hard-coded (previously 300 and 110), so the network keeps matching
# the data if the embedding size or the number of one-hot label columns changes.
model = Sequential()
model.add(Dense(2000, input_shape=(nn_X_train.shape[1],), activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1500, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(nn_y_train.shape[1], activation='softmax'))

# No metrics are requested, so training logs report the loss only.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [22]:
# Train for 15 epochs in mini-batches of 100 samples; the held-out split is passed
# as validation data. The returned object is kept in `history`.
history = model.fit(
    nn_X_train,
    nn_y_train,
    epochs=15,
    batch_size=100,
    validation_data=(nn_X_test, nn_y_test),
    shuffle=True,
)

Train on 186677 samples, validate on 46670 samples
Epoch 1/15












Epoch 2/15












Epoch 3/15












Epoch 4/15












Epoch 5/15












Epoch 6/15












Epoch 7/15












Epoch 8/15












Epoch 9/15


 30200/186677 [===>..........................] - ETA: 4:08 - loss: 0.6357 - ETA: 4:13 - loss: 0.6029 - ETA: 4:07 - loss: 0.5416 - ETA: 4:07 - loss: 0.5918 - ETA: 4:11 - loss: 0.5838 - ETA: 4:11 - loss: 0.5931 - ETA: 4:11 - loss: 0.5974 - ETA: 4:12 - loss: 0.6394 - ETA: 4:10 - loss: 0.6267 - ETA: 4:10 - loss: 0.6092 - ETA: 4:08 - loss: 0.6113 - ETA: 4:07 - loss: 0.6070 - ETA: 4:06 - loss: 0.6108 - ETA: 4:06 - loss: 0.6150 - ETA: 4:08 - loss: 0.6180 - ETA: 4:06 - loss: 0.6170 - ETA: 4:08 - loss: 0.6026 - ETA: 4:07 - loss: 0.6099 - ETA: 4:05 - loss: 0.6093 - ETA: 4:04 - loss: 0.6167 - ETA: 4:04 - loss: 0.6212 - ETA: 4:05 - loss: 0.6204 - ETA: 4:04 - loss: 0.6170 - ETA: 4:05 - loss: 0.6171 - ETA: 4:05 - loss: 0.6320 - ETA: 4:04 - loss: 0.6374 - ETA: 4:03 - loss: 0.6338 - ETA: 4:03 - loss: 0.6334 - ETA: 4:03 - loss: 0.6256 - ETA: 4:03 - loss: 0.6246 - ETA: 4:02 - loss: 0.6273 - ETA: 4:02 - loss: 0.6294 - ETA: 4:01 - loss: 0.6365 - ETA: 4:01 - loss: 0.6425 - ETA: 4:00 - loss: 0.6402 - ETA: 3

KeyboardInterrupt: 

In [None]:
# Collapse each one-hot target row and each predicted probability row to its
# arg-max class index, then score the held-out split.
nn_y_truth = list(map(np.argmax, nn_y_test))
nn_y_pred = list(map(np.argmax, model.predict(nn_X_test)))
print("Validation Accuracy : {}".format(accuracy(nn_y_pred, nn_y_truth)))

In [None]:
# Same scoring as for the held-out split, this time on the training rows.
# Note that nn_y_truth / nn_y_pred are overwritten with the training-set values.
nn_y_truth = list(map(np.argmax, nn_y_train))
nn_y_pred = list(map(np.argmax, model.predict(nn_X_train)))
print("Train Accuracy : {}".format(accuracy(nn_y_pred, nn_y_truth)))

In [None]:
# Write the model to "model.h5" in the current working directory.
# NOTE(review): the label encoder is stored under "model/<version>/" while this file is not -
# confirm whether the model should be saved alongside it. Also note the recorded training
# run above ended with a KeyboardInterrupt during epoch 9.
model.save("model.h5")

# ---------------------------------------------------------------------------------------------------------------