# Import Useful Modules 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

In [None]:
# Model version. It selects the "model/<version>/" directory that the pickled
# artifacts in this notebook are read from and written to.

version = "version_2"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [None]:
from preprocessing_pipeline import preprocessing

In [None]:
# Load the pickled word embedder stored for the selected model version.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts that come from a trusted source.
with open("model/{}/word_embedder.pickle".format(version), "rb") as file:
    word_embedder = pickle.load(file)

## Check

#### Preprocessing

In [None]:
# Build the preprocessing object from the embedder's vector size and the
# embedder itself. The bare expression on the last line displays the object
# in the notebook output.
preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
preprocessor

#### Word Embedding

In [None]:
# Display the loaded embedder as a quick sanity check.
word_embedder

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

##### 30000 Data

In [None]:
# Data for product-category classification.
data_for_classification=pd.read_csv("data/product_data.csv")

In [None]:
# Preview the first rows of the smaller dataset.
data_for_classification.head()

##### 400000 Data

In [None]:
# The large file is read with header=None, so its columns are addressed by
# integer position. Rows with missing values are then dropped in place.
large_data_for_classification=pd.read_csv("data/big.csv",header=None)
large_data_for_classification.dropna(axis=0,inplace=True)

In [None]:
# Preview the first rows of the larger dataset.
large_data_for_classification.head()

### Preprocess Data

##### 30000 Data

In [None]:
# Preprocess each product title into an embedding vector and each category
# name into an integer label. The call returns the embedded data together with
# a label encoder.
# NOTE(review): the vector length presumably equals word_embedder.vector_size
# (the original note said 100) — confirm against preprocessing_pipeline.
embedded_data, label_encoder = preprocessor.preprocess_data(
    data_for_classification["Product Title"],
    data_for_classification["Category Name"],
)

In [None]:
# Preview the first rows of the embedded small dataset.
embedded_data.head()

In [None]:
# Show the dimensions of the embedded small dataset.
embedded_data.shape

In [None]:
# Display the label encoder returned for the small dataset.
label_encoder

##### 400000 Data

In [None]:
# Same preprocessing as for the small dataset, applied to the large one.
# Column 1 is passed in the product-title position and column 0 in the
# category-name position of preprocess_data.
# NOTE(review): the vector length presumably equals word_embedder.vector_size
# (the original note said 100) — confirm against preprocessing_pipeline.
large_embedded_data, large_label_encoder = preprocessor.preprocess_data(
    large_data_for_classification[1],
    large_data_for_classification[0],
)

In [None]:
# Preview the first rows of the embedded large dataset.
large_embedded_data.head()

In [None]:
# Show the dimensions of the embedded large dataset.
large_embedded_data.shape

In [None]:
# Display the label encoder returned for the large dataset.
large_label_encoder

In [None]:
# Persist the label encoder obtained from the large dataset next to the other
# artifacts of the selected model version.
with open("model/{}/label_encoder.pickle".format(version), "wb") as file:
    pickle.dump(large_label_encoder,file)

# ---------------------------------------------------------------------------------------------------------------

# Learning & Validation Curve

In [None]:
def plot_learning_curve(estimator, X, y, title="Learning Curve",
                        n_jobs=4,cv=5, train_sizes=np.linspace(.1, 1.0, 5),scoring="accuracy",ylim=None):
    """Plot training and cross-validation scores against training-set size.

    The figure is created and labelled first, then ``learning_curve`` is run
    and the mean score of each series is drawn as a line with a shaded band
    of one standard deviation on either side.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        X: Feature data handed to ``learning_curve``.
        y: Target labels handed to ``learning_curve``.
        title: Figure title.
        n_jobs: Number of parallel jobs used by ``learning_curve``.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.
        scoring: Scoring name; also used as the y-axis label.
        ylim: Optional ``(low, high)`` pair for the y-axis limits.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Prepare the canvas before the (potentially long) computation starts.
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel(scoring)

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        verbose=40,
        scoring=scoring)

    # One entry per curve: (mean, std, colour, legend label). Statistics are
    # taken along axis 1 of the score arrays.
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1),
         "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1,
                         color=colour)
    for mean, _, colour, legend_label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=legend_label)

    plt.legend(loc="best")
    plt.show()

In [None]:
def plot_validation_curve(estimator, X, y, hyperparameter, title="Validation Curve",cv=5, 
                          param_range=np.logspace(-5, 5, 2),scoring="accuracy", n_jobs=4):
    """Plot training and cross-validation scores against a hyperparameter.

    Runs ``validation_curve`` for ``hyperparameter`` over ``param_range`` and
    draws the mean score of each series on a log-scaled x axis, with a shaded
    band of one standard deviation on either side.

    Args:
        estimator: Estimator handed to ``validation_curve``.
        X: Feature data handed to ``validation_curve``.
        y: Target labels handed to ``validation_curve``.
        hyperparameter: Name of the estimator parameter to vary; also used as
            the x-axis label.
        title: Figure title.
        cv: Cross-validation setting forwarded to ``validation_curve``.
        param_range: Values of ``hyperparameter`` to evaluate.
        scoring: Scoring name; also used as the y-axis label.
        n_jobs: Number of parallel jobs used by ``validation_curve``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Vary the requested hyperparameter (previously "C" was hard-coded, so the
    # argument was ignored). Keyword arguments are used for param_name and
    # param_range so the call works whether or not they are keyword-only.
    train_scores, valid_scores = validation_curve(
        estimator,
        X,
        y,
        param_name=hyperparameter,
        param_range=param_range,
        cv=cv,
        verbose=40,
        n_jobs=n_jobs,
        scoring=scoring)

    # Statistics are taken along axis 1 of the score arrays.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    plt.title(title)
    plt.xlabel(hyperparameter)
    plt.ylabel(scoring)
    # Fixed y range with a little headroom above a score of 1.0.
    plt.ylim(0.0, 1.1)
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, valid_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    plt.show()

### Logistic Regression

In [None]:
# Validation curve for logistic regression over C on the large dataset.
data=large_embedded_data.copy()

# Sampling every row without replacement shuffles the data.
sampled_embedded_data=data.sample(n=len(data))

# The curve is computed on the full feature set. An earlier PCA reduction
# (n_components=10) was disabled, which left `reduced_features` undefined and
# made this cell raise NameError; the features are now taken directly from
# the shuffled frame, as in the other curve cells.

print("Start Time : {}\n".format(str(datetime.datetime.now())))

plot_validation_curve(
    estimator = LR(n_jobs=-1), 
    X = sampled_embedded_data.drop("Labels",axis=1), 
    y = sampled_embedded_data["Labels"], 
    hyperparameter = "C", 
    title = "Validation Curve",
    cv = 5,
    param_range = np.logspace(-3, 3, 5),
    scoring = "accuracy")

print("\nFinish Time : {}".format(str(datetime.datetime.now())))

In [None]:
# Learning curve for logistic regression (C=0.1) on the large dataset.
# Sampling the whole frame without replacement shuffles its rows.
data = large_embedded_data.copy()
sampled_embedded_data = data.sample(frac=1)

print("Start Time : {}\n".format(datetime.datetime.now()))

plot_learning_curve(
    LR(C=0.1, n_jobs=-1),
    sampled_embedded_data.drop("Labels", axis=1),
    sampled_embedded_data["Labels"],
    title="Learning Curve (C=0.1)",
    n_jobs=4,
    cv=2,
    train_sizes=np.linspace(.1, 1, 5),
    scoring="accuracy",
)

print("\nFinish Time : {}".format(datetime.datetime.now()))

### Support Vector Machine

In [None]:
# Validation curve for an SVC over C, computed on a random tenth of the
# small embedded dataset.
sampled_embedded_data = embedded_data.sample(n=len(embedded_data) // 10)

print("Start Time : {}\n".format(datetime.datetime.now()))

plot_validation_curve(
    SVC(),
    sampled_embedded_data.drop("Labels", axis=1),
    sampled_embedded_data["Labels"],
    "C",
    title="Validation Curve",
    cv=5,
    param_range=np.logspace(-1, 3, 5),
    scoring="accuracy",
)

print("\nFinish Time : {}".format(datetime.datetime.now()))

In [None]:
# Learning curve for an SVC (C=10) on the small embedded dataset.
# Sampling the whole frame without replacement shuffles its rows.
sampled_embedded_data = embedded_data.sample(frac=1)

print("Start Time : {}\n".format(datetime.datetime.now()))

plot_learning_curve(
    SVC(C=10),
    sampled_embedded_data.drop("Labels", axis=1),
    sampled_embedded_data["Labels"],
    title="Learning Curve (C=10)",
    n_jobs=4,
    cv=5,
    train_sizes=np.linspace(.1, 1, 5),
    scoring="accuracy",
)

print("\nFinish Time : {}".format(datetime.datetime.now()))