# Import Useful Modules 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS

import keras
import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

from preprocessing_pipeline import preprocessing

# ---------------------------------------------------------------------------------------------------------------

# Read Data Into Pandas DataFrames

In [None]:
# Labelled data for product-category classification (file has no header row);
# rows with any missing value are discarded.
large_data_for_classification = pd.read_csv("data/big.csv", header=None).dropna(axis=0)

# Corpus for the word embedding: keep only the first parsed column, exposed
# under the name "Product Title", and discard missing rows.
# NOTE(review): read_fwf splits lines into fixed-width columns and only column 0
# is kept here -- confirm that long titles are not being truncated.
data_for_embedding = (
    pd.read_fwf('data/products2m.txt', header=None)[[0]]
    .rename(columns={0: "Product Title"})
    .dropna(axis=0)
)

In [None]:
# Preview the first rows of the embedding corpus as a quick sanity check.
data_for_embedding.head()

# ---------------------------------------------------------------------------------------------------------------

# Construct Word Embedder (Using FastText)

In [None]:
def remove_parentheses_old(input_string):
    """Tokenise a title: lower-case it, blank out punctuation, split on whitespace.

    Digits are left in place by this (older) variant.

    Args:
        input_string: Raw text to clean.

    Returns:
        List of lower-cased tokens with the listed punctuation removed.
    """
    # Every character here acts as a token separator.
    separators = '-/[]!(),.+\'"|*@#<>:;?'
    to_space = str.maketrans(dict.fromkeys(separators, ' '))
    # split() without arguments already discards leading/trailing whitespace.
    return input_string.lower().translate(to_space).split()

In [None]:
def remove_parentheses(input_string):
    """Tokenise a title after stripping digits and punctuation.

    Digit characters are deleted outright (so text on either side of a digit
    is joined), the remainder is lower-cased, the listed punctuation is
    replaced by spaces, and the result is split on whitespace.

    Args:
        input_string: Raw text to clean.

    Returns:
        List of lower-cased tokens without digits or the listed punctuation.
    """
    # Every character here acts as a token separator.
    separators = '-/[]!(),.+\'"|*@#<>:;?'
    to_space = str.maketrans(dict.fromkeys(separators, ' '))
    without_digits = ''.join(filter(lambda ch: not ch.isdigit(), input_string))
    # split() without arguments already discards leading/trailing whitespace.
    return without_digits.lower().translate(to_space).split()

In [None]:
# Strip unimportant characters from every title and turn it into a token list.
product_title = list(map(remove_parentheses, data_for_embedding["Product Title"]))

# ---------------------------------------------------------------------------------------------------------------

# Finding Best Embedding Window

In [60]:
def accuracy(predicted, truth):
    """Return the fraction of positions where ``predicted`` equals ``truth``.

    Args:
        predicted: Sequence (list or array) of predicted labels.
        truth: Sequence of ground-truth labels with the same shape.

    Returns:
        Share of matching positions as a float in [0, 1]; 0.0 when both
        inputs are empty (instead of dividing by zero).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    predicted_labels = np.asarray(predicted)
    true_labels = np.asarray(truth)
    # Guard against silent broadcasting (e.g. one label vs. many).
    if predicted_labels.shape != true_labels.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_labels.shape, true_labels.shape
            )
        )
    if predicted_labels.size == 0:
        return 0.0
    return float(np.mean(predicted_labels == true_labels))

In [61]:
# Draw a fixed 50,000-row sample of the labelled data.
sampled_data = large_data_for_classification.sample(n=50000, random_state=1387178)
# Split roughly 80/20 into train and validation. The mask comes from its own
# seeded generator: the global np.random state is unseeded, which would give a
# different split on every run and make accuracies incomparable.
split_rng = np.random.RandomState(1387178)
mask = split_rng.rand(len(sampled_data)) < 0.8
train = sampled_data[mask]
validation = sampled_data[~mask]

In [63]:
# Sweep over FastText context-window sizes. For each size: train an embedder,
# embed the train/validation titles, fit a small dense classifier and record
# its validation accuracy in `result`.
# NOTE(review): SEED is not defined in the visible cells -- presumably set
# elsewhere in the notebook; confirm before running.
result=[]
for EMBEDDING_WINDOW in [1,2,3,5,7]:
    print("TESTING ON EMBEDDING WINDOW OF {} | {}".format(EMBEDDING_WINDOW,str(datetime.datetime.now())))
    word_embedder = FastText(product_title, size=100, window=EMBEDDING_WINDOW, min_count=10, workers=4, sg=1, seed=SEED, min_n=5, iter=1)
    print("\tEMBEDDER CONSTRUCTED | {}".format(str(datetime.datetime.now())))
    preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
    # Column 1 is passed as the text and column 0 as the label.
    embedded_data,label_encoder=preprocessor.preprocess_data(train[1],train[0])
    # NOTE(review): a second encoder is returned for the validation split; if it
    # is fitted independently of the training one, label ids may not line up
    # between the two sets -- confirm against preprocessing_pipeline.
    validation_set,validation_label_encoder=preprocessor.preprocess_data(validation[1],validation[0])
    # Drop training rows whose feature columns sum to zero.
    # NOTE(review): the same filter is not applied to validation_set -- confirm intended.
    embedded_data["sum"]=embedded_data.drop(["Labels"],axis=1).sum(axis=1)
    embedded_data=embedded_data.loc[embedded_data["sum"]!=0].drop("sum",axis=1)
    print("\tPREPROCESSING FINISHED | {}".format(str(datetime.datetime.now())))

    print("\n\tTRAINING CLASSIFIER | {}".format(str(datetime.datetime.now())))
    model = Sequential()
    # Input width follows the embedder instead of a hard-coded 100, so the
    # network stays consistent if the embedding size is changed.
    model.add(Dense(2000, input_shape=(word_embedder.vector_size,), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1500, activation='relu'))
    model.add(Dropout(0.1))
    # 107 output units: assumes 107 product categories -- TODO confirm.
    model.add(Dense(107, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy')

    history = model.fit(embedded_data.drop("Labels",axis=1),to_categorical(embedded_data["Labels"]), epochs=4, batch_size=32, shuffle=True)

    truth=[np.argmax(value) for value in to_categorical(validation_set["Labels"])]
    pred=[np.argmax(value) for value in model.predict(validation_set.drop("Labels",axis=1))]
    # Score once and reuse the value for both the log line and the results list.
    validation_accuracy=accuracy(pred,truth)
    result.append(validation_accuracy)
    print("\tVALIDATION ACCURACY : {}\n\n".format(validation_accuracy))

TESTING ON EMBEDDING WINDOW OF 1 | 2018-06-28 14:00:30.220999
	EMBEDDER CONSTRUCTED | 2018-06-28 14:01:17.235800
	PREPROCESSING FINISHED | 2018-06-28 14:01:21.980418

	TRAINING CLASSIFIER | 2018-06-28 14:01:21.981420
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
	VALIDATION ACCURACY : 0.1553686293913904

TESTING ON EMBEDDING WINDOW OF 2 | 2018-06-28 14:07:22.443479
	EMBEDDER CONSTRUCTED | 2018-06-28 14:08:24.282982
	PREPROCESSING FINISHED | 2018-06-28 14:08:31.842052

	TRAINING CLASSIFIER | 2018-06-28 14:08:31.842052
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
	VALIDATION ACCURACY : 0.15804057397328056

TESTING ON EMBEDDING WINDOW OF 3 | 2018-06-28 14:14:31.538178
	EMBEDDER CONSTRUCTED | 2018-06-28 14:15:23.935534
	PREPROCESSING FINISHED | 2018-06-28 14:15:28.571863

	TRAINING CLASSIFIER | 2018-06-28 14:15:28.571863
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
	VALIDATION ACCURACY : 0.16665017318159328

TESTING ON EMBEDDING WINDOW OF 5 | 2018-06-28 14:21:34.401014
	EMBEDDER CONSTRUCTED | 2018-06-28 1

In [67]:
# Same experiment as the window sweep, run for a single larger window (11).
# `result` is reset here, so it only holds the accuracy from this run.
# NOTE(review): SEED is not defined in the visible cells -- presumably set
# elsewhere in the notebook; confirm before running.
result=[]
for EMBEDDING_WINDOW in [11]:
    print("TESTING ON EMBEDDING WINDOW OF {} | {}".format(EMBEDDING_WINDOW,str(datetime.datetime.now())))
    word_embedder = FastText(product_title, size=100, window=EMBEDDING_WINDOW, min_count=10, workers=4, sg=1, seed=SEED, min_n=5, iter=1)
    print("\tEMBEDDER CONSTRUCTED | {}".format(str(datetime.datetime.now())))
    preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
    # Column 1 is passed as the text and column 0 as the label.
    embedded_data,label_encoder=preprocessor.preprocess_data(train[1],train[0])
    # NOTE(review): a second encoder is returned for the validation split; if it
    # is fitted independently of the training one, label ids may not line up
    # between the two sets -- confirm against preprocessing_pipeline.
    validation_set,validation_label_encoder=preprocessor.preprocess_data(validation[1],validation[0])
    # Drop training rows whose feature columns sum to zero.
    # NOTE(review): the same filter is not applied to validation_set -- confirm intended.
    embedded_data["sum"]=embedded_data.drop(["Labels"],axis=1).sum(axis=1)
    embedded_data=embedded_data.loc[embedded_data["sum"]!=0].drop("sum",axis=1)
    print("\tPREPROCESSING FINISHED | {}".format(str(datetime.datetime.now())))

    print("\n\tTRAINING CLASSIFIER | {}".format(str(datetime.datetime.now())))
    model = Sequential()
    # Input width follows the embedder instead of a hard-coded 100, so the
    # network stays consistent if the embedding size is changed.
    model.add(Dense(2000, input_shape=(word_embedder.vector_size,), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1500, activation='relu'))
    model.add(Dropout(0.1))
    # 107 output units: assumes 107 product categories -- TODO confirm.
    model.add(Dense(107, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy')

    history = model.fit(embedded_data.drop("Labels",axis=1),to_categorical(embedded_data["Labels"]), epochs=4, batch_size=32, shuffle=True)

    truth=[np.argmax(value) for value in to_categorical(validation_set["Labels"])]
    pred=[np.argmax(value) for value in model.predict(validation_set.drop("Labels",axis=1))]
    # Score once and reuse the value for both the log line and the results list.
    validation_accuracy=accuracy(pred,truth)
    result.append(validation_accuracy)
    print("\tVALIDATION ACCURACY : {}\n\n".format(validation_accuracy))

TESTING ON EMBEDDING WINDOW OF 11 | 2018-06-28 14:37:24.287958
	EMBEDDER CONSTRUCTED | 2018-06-28 14:38:29.156665
	PREPROCESSING FINISHED | 2018-06-28 14:38:33.982530

	TRAINING CLASSIFIER | 2018-06-28 14:38:33.983514
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
	VALIDATION ACCURACY : 0.17219198416625434




# ---------------------------------------------------------------------------------------------------------------

# Finding Best Embedding Dimension

In [71]:
def accuracy(predicted, truth):
    """Return the fraction of positions where ``predicted`` equals ``truth``.

    Args:
        predicted: Sequence (list or array) of predicted labels.
        truth: Sequence of ground-truth labels with the same shape.

    Returns:
        Share of matching positions as a float in [0, 1]; 0.0 when both
        inputs are empty (instead of dividing by zero).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    predicted_labels = np.asarray(predicted)
    true_labels = np.asarray(truth)
    # Guard against silent broadcasting (e.g. one label vs. many).
    if predicted_labels.shape != true_labels.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_labels.shape, true_labels.shape
            )
        )
    if predicted_labels.size == 0:
        return 0.0
    return float(np.mean(predicted_labels == true_labels))

In [72]:
# Draw a fixed 50,000-row sample of the labelled data.
sampled_data = large_data_for_classification.sample(n=50000, random_state=1387178)
# Split roughly 80/20 into train and validation. The mask comes from its own
# seeded generator: the global np.random state is unseeded, which would give a
# different split on every run and make accuracies incomparable.
split_rng = np.random.RandomState(1387178)
mask = split_rng.rand(len(sampled_data)) < 0.8
train = sampled_data[mask]
validation = sampled_data[~mask]

In [None]:
# Sweep over FastText embedding dimensions (window fixed at 7). For each size:
# train an embedder, embed the train/validation titles, fit a small dense
# classifier and record its validation accuracy in `result`.
# NOTE(review): SEED is not defined in the visible cells -- presumably set
# elsewhere in the notebook; confirm before running.
result=[]
for EMBEDDING_DIMENSION in [200]:
    print("TESTING ON EMBEDDING DIMENSION OF {} | {}".format(EMBEDDING_DIMENSION,str(datetime.datetime.now())))
    word_embedder = FastText(product_title, size=EMBEDDING_DIMENSION, window=7, min_count=10, workers=4, sg=1, seed=SEED, min_n=5, iter=1)
    print("\tEMBEDDER CONSTRUCTED | {}".format(str(datetime.datetime.now())))
    preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
    # Column 1 is passed as the text and column 0 as the label.
    embedded_data,label_encoder=preprocessor.preprocess_data(train[1],train[0])
    # NOTE(review): a second encoder is returned for the validation split; if it
    # is fitted independently of the training one, label ids may not line up
    # between the two sets -- confirm against preprocessing_pipeline.
    validation_set,validation_label_encoder=preprocessor.preprocess_data(validation[1],validation[0])
    # Drop training rows whose feature columns sum to zero.
    # NOTE(review): the same filter is not applied to validation_set -- confirm intended.
    embedded_data["sum"]=embedded_data.drop(["Labels"],axis=1).sum(axis=1)
    embedded_data=embedded_data.loc[embedded_data["sum"]!=0].drop("sum",axis=1)
    print("\tPREPROCESSING FINISHED | {}".format(str(datetime.datetime.now())))

    print("\n\tTRAINING CLASSIFIER | {}".format(str(datetime.datetime.now())))
    model = Sequential()
    # Input width tracks the embedding dimension under test.
    model.add(Dense(2000, input_shape=(EMBEDDING_DIMENSION,), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1500, activation='relu'))
    model.add(Dropout(0.1))
    # 107 output units: assumes 107 product categories -- TODO confirm.
    model.add(Dense(107, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy')

    history = model.fit(embedded_data.drop("Labels",axis=1),to_categorical(embedded_data["Labels"]), epochs=2, batch_size=32, shuffle=True)

    truth=[np.argmax(value) for value in to_categorical(validation_set["Labels"])]
    pred=[np.argmax(value) for value in model.predict(validation_set.drop("Labels",axis=1))]
    # Score once and reuse the value for both the log line and the results list.
    validation_accuracy=accuracy(pred,truth)
    result.append(validation_accuracy)
    print("\tVALIDATION ACCURACY : {}\n\n".format(validation_accuracy))

# ---------------------------------------------------------------------------------------------------------------

# Finding Best Embedding Epoch

In [None]:
def accuracy(predicted, truth):
    """Return the fraction of positions where ``predicted`` equals ``truth``.

    Args:
        predicted: Sequence (list or array) of predicted labels.
        truth: Sequence of ground-truth labels with the same shape.

    Returns:
        Share of matching positions as a float in [0, 1]; 0.0 when both
        inputs are empty (instead of dividing by zero).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    predicted_labels = np.asarray(predicted)
    true_labels = np.asarray(truth)
    # Guard against silent broadcasting (e.g. one label vs. many).
    if predicted_labels.shape != true_labels.shape:
        raise ValueError(
            "predicted and truth must have the same shape, got {} and {}".format(
                predicted_labels.shape, true_labels.shape
            )
        )
    if predicted_labels.size == 0:
        return 0.0
    return float(np.mean(predicted_labels == true_labels))

In [None]:
# Draw a fixed 50,000-row sample of the labelled data.
sampled_data = large_data_for_classification.sample(n=50000, random_state=1387178)
# Split roughly 80/20 into train and validation. The mask comes from its own
# seeded generator: the global np.random state is unseeded, which would give a
# different split on every run and make accuracies incomparable.
split_rng = np.random.RandomState(1387178)
mask = split_rng.rand(len(sampled_data)) < 0.8
train = sampled_data[mask]
validation = sampled_data[~mask]

In [None]:
# Sweep over the number of FastText training epochs (window fixed at 5). For
# each value: train an embedder, embed the train/validation titles, fit a small
# dense classifier and record its validation accuracy in `result`.
# NOTE(review): SEED is not defined in the visible cells -- presumably set
# elsewhere in the notebook; confirm before running.
result=[]
for EMBEDDING_EPOCH in [1,5,10,20,40]:
    print("TESTING ON EMBEDDING EPOCH OF {} | {}".format(EMBEDDING_EPOCH,str(datetime.datetime.now())))
    word_embedder = FastText(product_title, size=100, window=5, min_count=10, workers=4, sg=1, seed=SEED, min_n=5, iter=EMBEDDING_EPOCH)
    print("\tEMBEDDER CONSTRUCTED | {}".format(str(datetime.datetime.now())))
    preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
    # Column 1 is passed as the text and column 0 as the label.
    embedded_data,label_encoder=preprocessor.preprocess_data(train[1],train[0])
    # NOTE(review): a second encoder is returned for the validation split; if it
    # is fitted independently of the training one, label ids may not line up
    # between the two sets -- confirm against preprocessing_pipeline.
    validation_set,validation_label_encoder=preprocessor.preprocess_data(validation[1],validation[0])
    # Drop training rows whose feature columns sum to zero.
    # NOTE(review): the same filter is not applied to validation_set -- confirm intended.
    embedded_data["sum"]=embedded_data.drop(["Labels"],axis=1).sum(axis=1)
    embedded_data=embedded_data.loc[embedded_data["sum"]!=0].drop("sum",axis=1)
    print("\tPREPROCESSING FINISHED | {}".format(str(datetime.datetime.now())))

    print("\n\tTRAINING CLASSIFIER | {}".format(str(datetime.datetime.now())))
    model = Sequential()
    # Input width follows the embedder instead of a hard-coded 100, so the
    # network stays consistent if the embedding size is changed.
    model.add(Dense(2000, input_shape=(word_embedder.vector_size,), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1500, activation='relu'))
    model.add(Dropout(0.1))
    # 107 output units: assumes 107 product categories -- TODO confirm.
    model.add(Dense(107, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy')

    history = model.fit(embedded_data.drop("Labels",axis=1),to_categorical(embedded_data["Labels"]), epochs=2, batch_size=32, shuffle=True)

    truth=[np.argmax(value) for value in to_categorical(validation_set["Labels"])]
    pred=[np.argmax(value) for value in model.predict(validation_set.drop("Labels",axis=1))]
    # Score once and reuse the value for both the log line and the results list.
    validation_accuracy=accuracy(pred,truth)
    result.append(validation_accuracy)
    print("\tVALIDATION ACCURACY : {}\n\n".format(validation_accuracy))

# ---------------------------------------------------------------------------------------------------------------