# Import Useful Modules 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.utils import to_categorical
from keras.layers import Conv1D, GlobalMaxPooling1D, Flatten
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Name of the model sub-directory; artifacts are loaded from "model/<version>/".

version = "version_2"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [3]:
from preprocessing_pipeline import preprocessing

In [4]:
# Load the pickled word embedder stored under the selected model version.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at files from a trusted source.
with open("model/{}/word_embedder.pickle".format(version), "rb") as file:
    word_embedder = pickle.load(file)

## Check

#### Preprocessing

In [5]:
# Build the preprocessing helper from the embedder and its vector size, then
# display it to confirm it was constructed.
preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
preprocessor

<preprocessing_pipeline.preprocessing at 0xa816af91d0>

#### Word Embedding

In [6]:
# Display the loaded embedder to confirm which object was unpickled.
word_embedder

<gensim.models.fasttext.FastText at 0xa816af9400>

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

In [7]:
# Read the header-less CSV and keep only the rows without missing values.
large_data_for_classification=pd.read_csv("data/big.csv",header=None).dropna(axis=0)

In [8]:
# Preview the first rows of the frame after the missing-value rows were dropped.
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hardware,KINGSTON+KVR1333D3N9,1510.0
1,musik,power+amplifier+wisdom+,62.0
2,outwear-motor,jas%20hujan%20anak,391.0
3,celana,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


### Preprocess Data

In [9]:
# Preprocess each product title (column 1) into a 100-dimensional vector and
# each category name (column 0) into an integer label. The call returns the
# embedded frame together with the label encoder.
large_embedded_data, large_label_encoder = preprocessor.preprocess_data(
    large_data_for_classification[1],
    large_data_for_classification[0],
)

In [10]:
# Preview the embedded features together with the encoded "Labels" column.
large_embedded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Labels
0,-0.580157,-0.902683,-0.200629,0.426924,-0.298552,0.377382,0.334578,-1.341522,0.284654,-0.601798,...,0.714149,0.436149,0.067786,0.280465,-0.069688,0.040827,0.063288,0.042661,-0.402801,45
1,-0.151489,-1.193317,-0.412542,0.074164,-1.0538,0.283322,0.306509,-0.714775,-0.282685,-0.205421,...,1.560397,1.276572,-0.248248,0.154423,-0.181553,-0.448146,-0.126884,-0.248857,1.153092,68
2,-0.206052,-0.15816,-0.194283,0.238795,0.007292,0.196439,0.141539,-0.22354,0.046263,-0.282032,...,-0.081149,0.099055,0.030815,0.09932,-0.081225,0.031153,-0.080204,-0.028578,-0.03781,73
3,-0.053166,0.007892,-1.529572,1.45867,-0.734888,-0.503057,-0.532288,-0.022648,0.070379,-1.529616,...,1.294445,-0.273994,1.71803,0.06049,-1.941472,1.189055,-0.650072,-0.596036,0.285584,24
4,-0.162186,-0.530731,-0.356022,0.209101,-0.282881,0.324258,0.422291,-0.745394,-0.038498,-0.477472,...,0.860707,0.729105,-0.046928,0.239313,0.133619,-0.398247,0.280034,0.195425,0.424736,59


In [11]:
# Row-wise sum of the embedding columns; the next cell uses it to drop rows
# whose vector sums to zero. "sum" itself is excluded from the total
# (errors="ignore" makes that a no-op on the first run) so re-running this
# cell does not fold the previous total into the new one.
large_embedded_data["sum"]=large_embedded_data.drop(["Labels","sum"],axis=1,errors="ignore").sum(axis=1)

In [12]:
# Keep only the rows whose embedding sum is non-zero.
large_embedded_data=large_embedded_data[large_embedded_data["sum"].ne(0)]

In [13]:
# Row and column count after the zero-sum rows were removed.
large_embedded_data.shape

(392417, 102)

In [14]:
# Display the label encoder returned by preprocess_data.
large_label_encoder

LabelEncoder()

# ---------------------------------------------------------------------------------------------------------------

### Build Neural Network

In [219]:
# Work on a copy so the shuffle never touches large_embedded_data.
data=large_embedded_data.copy()
# Shuffle every row; the fixed seed keeps the order reproducible after a
# kernel restart.
sampled_embedded_data=data.sample(n=len(data),random_state=42)

In [220]:
# Model inputs: every column except the label and the helper "sum" column.
features=sampled_embedded_data.drop(labels=["Labels","sum"],axis="columns")
# Targets: one-hot encoding of the integer class labels.
labels=to_categorical(sampled_embedded_data["Labels"])

In [221]:
# Add a trailing channel axis so each sample is (steps, 1), as the Conv1D
# input requires. The shape is taken from the data instead of the literals
# 100 / 110: with a different embedding size or class count the hard-coded
# reshape could silently scramble rows rather than fail.
features=np.array(features)
features=features.reshape(features.shape[0],features.shape[1],1)
# The one-hot labels are already 2-D (samples, classes); only check that they
# line up with the features.
labels=np.array(labels)
assert labels.ndim==2 and labels.shape[0]==features.shape[0], "labels must be one-hot rows aligned with features"

In [222]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
nn_X_train,nn_X_test,nn_y_train,nn_y_test=train_test_split(features,labels,test_size=0.2,random_state=42)

In [223]:
def accuracy(predicted,truth):
    """Return the fraction of entries where ``predicted`` equals ``truth``.

    Parameters
    ----------
    predicted : array-like
        Predicted class labels.
    truth : array-like
        Ground-truth class labels, with the same shape as ``predicted``.

    Returns
    -------
    float
        Share of matching entries, between 0.0 and 1.0.

    Raises
    ------
    ValueError
        If the inputs differ in shape (NumPy would otherwise broadcast them
        and silently score the wrong pairs) or contain no elements.
    """
    predicted_arr=np.asarray(predicted)
    truth_arr=np.asarray(truth)
    # Compare shapes explicitly: (3,) against (1,) would broadcast and yield a
    # plausible-looking but meaningless score.
    if predicted_arr.shape!=truth_arr.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_arr.shape,truth_arr.shape
            )
        )
    if predicted_arr.size==0:
        raise ValueError("accuracy is undefined for empty input")
    # The mean of the boolean match mask is the fraction of correct predictions.
    return float(np.mean(predicted_arr==truth_arr))

In [None]:
# 1-D convolutional classifier over the embedding vector, which is fed in as
# 100 steps with a single channel.
# NOTE(review): both convolutions use a linear activation, so the stack has no
# non-linearity before the softmax — confirm this is intended.
model = Sequential()
model.add(Conv1D(kernel_size=3, filters=100, activation='linear', input_shape=(100,1)))
model.add(Dropout(0.1))
# Only the first layer of a Sequential model takes input_shape; this layer's
# input is the output of the previous layer, so no shape is passed here.
model.add(Conv1D(kernel_size=3, filters=50, activation='linear'))
model.add(Dropout(0.1))

# Flatten the convolution output and score each of the 110 classes.
model.add(Flatten())
model.add(Dense(110, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy')

In [244]:
# Train for a single epoch over the training split in mini-batches of 100.
model.fit(nn_X_train,nn_y_train,epochs=1,batch_size=100)

Epoch 1/1
 21900/313933 [=>............................] - ETA: 5:12 - loss: 3.9309

KeyboardInterrupt: 

In [234]:
# Turn the one-hot targets and the predicted class scores into class indices
# with one vectorized argmax each instead of a Python loop over every row.
nn_y_truth=np.argmax(nn_y_train,axis=1)
nn_y_pred=np.argmax(model.predict(nn_X_train),axis=1)
print("Train Accuracy : {}".format(accuracy(nn_y_pred,nn_y_truth)))

Train Accuracy : 0.5281031302857616


# ---------------------------------------------------------------------------------------------------------------

In [430]:
# Work on a copy so the shuffle never touches large_embedded_data.
data=large_embedded_data.copy()
# Shuffle every row; the fixed seed keeps the order reproducible after a
# kernel restart.
sampled_embedded_data=data.sample(n=len(data),random_state=42)

In [431]:
# Model inputs: every column except the label and the helper "sum" column.
features=sampled_embedded_data.drop(labels=["Labels","sum"],axis="columns")
# Targets: the integer class labels, kept as a single column.
labels=sampled_embedded_data["Labels"]

In [432]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
X_train,X_test,y_train,y_test=train_test_split(features,labels,test_size=0.2,random_state=42)

In [433]:
# Number of boosting stages requested so far; the training cell increments it
# before each fit.
N=0

In [436]:
# Gradient-boosting classifier created with zero estimators; the training
# cell raises n_estimators before every fit, and warm_start lets each fit
# add stages to the existing ensemble.
model=GBC(
    learning_rate=0.1,
    n_estimators=0,
    max_depth=8,
    min_samples_leaf=3,
    max_features=30,
    warm_start=True,
    verbose=100,
)

In [437]:
# Each execution of this cell requests one more boosting stage (warm_start
# keeps the stages that are already fitted), refits, and prints the score on
# the held-out split followed by the score on the training split.
# NOTE(review): this cell is stateful by design — its result depends on how
# many times it has been run since N and model were last reset.
N+=1
model.set_params(n_estimators=N,warm_start=True)
model.fit(X_train,y_train)
print(model.score(X_test,y_test))
print(model.score(X_train,y_train))

      Iter       Train Loss   Remaining Time 


KeyboardInterrupt: 