# Import Useful Modules 

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical
from keras.optimizers import SGD

import tqdm

import sys

In [2]:
#Determine Model's File Location

version = "version_5"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [3]:
from preprocessing_pipeline import preprocessing

In [12]:
preprocessor=preprocessing(word_embedder.vector_size,word_embedder)

In [4]:
with open("model/{}/word_embedder_20_new.pickle".format(version), "rb") as file:
    word_embedder = pickle.load(file)

In [7]:
model_1 = load_model("model/{}/level1_semsup_1.h5".format(version))
model_2 = load_model("model/{}/level1_semsup_2.h5".format(version))
model_3 = load_model("model/{}/level1_semsup_3.h5".format(version))
model_4 = load_model("model/{}/level1_semsup_4.h5".format(version))
model_5 = load_model("model/{}/level1_semsup_5.h5".format(version))
model_6 = load_model("model/{}/level1_semsup_6.h5".format(version))
model_7 = load_model("model/{}/level1_semsup_7.h5".format(version))
model_8 = load_model("model/{}/level1_semsup_8.h5".format(version))
model_9 = load_model("model/{}/level1_semsup_9.h5".format(version))
model_10 = load_model("model/{}/level1_semsup_10.h5".format(version))

models=[model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9,model_10]

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

In [13]:
def read_classification_data():
    large_data_for_classification=pd.read_csv("data/big.csv",header=None)
    large_data_for_classification.dropna(axis=0,inplace=True)
    raw_category_mapper=pd.read_csv("category_mapping.csv",index_col=0)

    category_mapper={}
    for i in raw_category_mapper.index:
        category_mapper[raw_category_mapper["l2"][i]]=raw_category_mapper["l1"][i]
   
    new_category=[category_mapper[value] for value in large_data_for_classification[0]]
    large_data_for_classification[0]=new_category
    
    large_embedded_data, large_label_encoder = preprocessor.preprocess_data(
        large_data_for_classification[1],
        large_data_for_classification[0],
    )
    
    large_embedded_data["sum"]=large_embedded_data.drop(["Labels"],axis=1).sum(axis=1)
    large_embedded_data=large_embedded_data.loc[large_embedded_data["sum"]!=0].drop("sum",axis=1)
    
    return large_embedded_data

In [21]:
def read_product_title_data():
    product_title_only=pd.read_fwf('data/products2m.txt',header=None)
    product_title_only["Product Title"]=product_title_only[0]
    product_title_only=product_title_only[["Product Title"]]
    product_title_only.dropna(inplace=True,axis=0)
    
    return (product_title_only[:300000],
            product_title_only[300000:600000],
            product_title_only[600000:900000],
            product_title_only[900000:1200000],
            product_title_only[1200000:1500000],
            product_title_only[1500000:1800000],
            product_title_only[1800000:])
    
    return product_title_only

In [15]:
large_embedded_data=read_classification_data()

REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 396099/396099 [00:05<00:00, 79183.25it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 396099/396099 [00:15<00:00, 25078.88it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.37it/s]


In [22]:
product_titles=read_product_title_data()

# ---------------------------------------------------------------------------------------------------------------

In [175]:
def accuracy(predicted,truth):
    result=[int(value) for value in np.array(predicted)==np.array(truth)]
    return sum(result)/len(result)

In [193]:
def evaluate_semi_supervised(dataset):
    accepted_predictions=[]
    for index in tqdm.tqdm(dataset.index):
        predictions=[dataset["Prediction 1"][index],
                            dataset["Prediction 2"][index],
                            dataset["Prediction 3"][index],
                            dataset["Prediction 4"][index],
                            dataset["Prediction 5"][index],
                            dataset["Prediction 6"][index],
                            dataset["Prediction 7"][index],
                            dataset["Prediction 8"][index],
                            dataset["Prediction 9"][index],
                            dataset["Prediction 10"][index]]
        if(len(set(predictions))==1):
            accepted_predictions.append(dataset["Prediction 1"][index])
        else:
            accepted_predictions.append(-1)
    dataset["Success"]=accepted_predictions
    
    base_truth=dataset["Base Truth"]
    prediction=dataset["Success"]
    
    count=0
    total=0
    for index,pred in tqdm.tqdm(enumerate(prediction)):
        if(pred!=-1):
            total+=1
            if(pred==base_truth[index]):
                count+=1
    
    return count, total, len(dataset)    

In [211]:
def semi_supervised_classification_report(dataset):
    accepted_predictions=[]
    for index in tqdm.tqdm(dataset.index):
        predictions=[dataset["Prediction 1"][index],
                            dataset["Prediction 2"][index],
                            dataset["Prediction 3"][index],
                            dataset["Prediction 4"][index],
                            dataset["Prediction 5"][index],
                            dataset["Prediction 6"][index],
                            dataset["Prediction 7"][index],
                            dataset["Prediction 8"][index],
                            dataset["Prediction 9"][index],
                            dataset["Prediction 10"][index]]
        if(len(set(predictions))==1):
            accepted_predictions.append(dataset["Prediction 1"][index])
        else:
            accepted_predictions.append(-1)
    dataset["Success"]=accepted_predictions
    
    base_truth=dataset["Base Truth"]
    prediction=dataset["Success"]
    
    taken_base_truth=[]
    taken_prediction=[]
    for index,pred in tqdm.tqdm(enumerate(prediction)):
        if(pred!=-1):
            taken_base_truth.append(base_truth[index])
            taken_prediction.append(pred)
            
    
    print(classification_report(taken_base_truth,taken_prediction))

In [28]:
def semi_supervised_dataset(model,features):
    
    dummy_category=[0 for element in tqdm.tqdm(features["Product Title"])]

    embedded_product_title, dummy_encoder = preprocessor.preprocess_data(
        features["Product Title"],
        dummy_category,
    )
    
    embedded_product_title["sum"]=embedded_product_title.drop(["Labels"],axis=1).sum(axis=1)
    embedded_product_title=embedded_product_title.loc[embedded_product_title["sum"]!=0].drop("sum",axis=1)
    embedded_product_title.drop("Labels",axis=1,inplace=True)
    
    features=embedded_product_title
    
    semi_supervised_prediction=pd.DataFrame()
    print("PREDICTION 1")
    semi_supervised_prediction["Prediction 1"]=[np.argmax(value) for value in model[0].predict(features[[*range(0,10)]])]
    print("PREDICTION 2")
    semi_supervised_prediction["Prediction 2"]=[np.argmax(value) for value in model[1].predict(features[[*range(10,20)]])]
    print("PREDICTION 3")
    semi_supervised_prediction["Prediction 3"]=[np.argmax(value) for value in model[2].predict(features[[*range(20,30)]])]
    print("PREDICTION 4")
    semi_supervised_prediction["Prediction 4"]=[np.argmax(value) for value in model[3].predict(features[[*range(30,40)]])]
    print("PREDICTION 5")
    semi_supervised_prediction["Prediction 5"]=[np.argmax(value) for value in model[4].predict(features[[*range(40,50)]])]
    print("PREDICTION 6")
    semi_supervised_prediction["Prediction 6"]=[np.argmax(value) for value in model[5].predict(features[[*range(50,60)]])]
    print("PREDICTION 7")
    semi_supervised_prediction["Prediction 7"]=[np.argmax(value) for value in model[6].predict(features[[*range(60,70)]])]
    print("PREDICTION 8")
    semi_supervised_prediction["Prediction 8"]=[np.argmax(value) for value in model[7].predict(features[[*range(70,80)]])]
    print("PREDICTION 9")
    semi_supervised_prediction["Prediction 9"]=[np.argmax(value) for value in model[8].predict(features[[*range(80,90)]])]
    print("PREDICTION 10")
    semi_supervised_prediction["Prediction 10"]=[np.argmax(value) for value in model[9].predict(features[[*range(90,100)]])]
    
    result_dataset=features.copy()
    
    accepted_predictions=[]
    
    print("ENSEMBLING")
    for index in tqdm.tqdm(semi_supervised_prediction.index):
        predictions=[semi_supervised_prediction["Prediction {}".format(i)][index] for i in range(1,10+1)]
        
        if(len(set(predictions))==1):
            accepted_predictions.append(semi_supervised_prediction["Prediction 1"][index])
        else:
            accepted_predictions.append(-1)
            
    result_dataset["Labels"]=accepted_predictions
    
    result_dataset=result_dataset.loc[result_dataset["Labels"]!=-1]
    
    return result_dataset
    

In [30]:
additional_dataset_1 = semi_supervised_dataset(models,product_titles[0])

100%|███████████████████████████████████████████████████████████████████████| 300000/300000 [00:00<00:00, 1503534.16it/s]


REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:05<00:00, 54293.03it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:24<00:00, 12305.15it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  8.23it/s]


PREDICTION 1
PREDICTION 2
PREDICTION 3
PREDICTION 4
PREDICTION 5
PREDICTION 6
PREDICTION 7
PREDICTION 8
PREDICTION 9
PREDICTION 10
ENSEMBLING


100%|██████████████████████████████████████████████████████████████████████████| 299848/299848 [00:47<00:00, 6268.48it/s]


In [31]:
additional_dataset_2 = semi_supervised_dataset(models,product_titles[1])

100%|███████████████████████████████████████████████████████████████████████| 300000/300000 [00:00<00:00, 1942538.86it/s]


REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:05<00:00, 50810.41it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:26<00:00, 11325.60it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  8.23it/s]


PREDICTION 1
PREDICTION 2
PREDICTION 3
PREDICTION 4
PREDICTION 5
PREDICTION 6
PREDICTION 7
PREDICTION 8
PREDICTION 9
PREDICTION 10
ENSEMBLING


100%|██████████████████████████████████████████████████████████████████████████| 299971/299971 [00:48<00:00, 6203.36it/s]


In [32]:
additional_dataset_3 = semi_supervised_dataset(models,product_titles[2])

100%|███████████████████████████████████████████████████████████████████████| 300000/300000 [00:00<00:00, 1719571.38it/s]


REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:05<00:00, 53774.61it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:23<00:00, 12749.36it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  8.10it/s]


PREDICTION 1
PREDICTION 2
PREDICTION 3
PREDICTION 4
PREDICTION 5
PREDICTION 6
PREDICTION 7
PREDICTION 8
PREDICTION 9
PREDICTION 10
ENSEMBLING


100%|██████████████████████████████████████████████████████████████████████████| 299907/299907 [00:48<00:00, 6230.85it/s]


In [33]:
additional_dataset_4 = semi_supervised_dataset(models,product_titles[3])

100%|███████████████████████████████████████████████████████████████████████| 300000/300000 [00:00<00:00, 1634987.39it/s]


REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:05<00:00, 58921.22it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:21<00:00, 13647.42it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.48it/s]


PREDICTION 1
PREDICTION 2
PREDICTION 3
PREDICTION 4
PREDICTION 5
PREDICTION 6
PREDICTION 7
PREDICTION 8
PREDICTION 9
PREDICTION 10
ENSEMBLING


100%|██████████████████████████████████████████████████████████████████████████| 299969/299969 [00:47<00:00, 6254.26it/s]


In [34]:
additional_dataset_5 = semi_supervised_dataset(models,product_titles[4])

100%|███████████████████████████████████████████████████████████████████████| 300000/300000 [00:00<00:00, 1286902.91it/s]


REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:05<00:00, 55329.50it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:22<00:00, 13607.51it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  8.31it/s]


PREDICTION 1
PREDICTION 2
PREDICTION 3
PREDICTION 4
PREDICTION 5
PREDICTION 6
PREDICTION 7
PREDICTION 8
PREDICTION 9
PREDICTION 10
ENSEMBLING


100%|██████████████████████████████████████████████████████████████████████████| 299969/299969 [00:53<00:00, 5649.24it/s]


In [35]:
additional_dataset_6 = semi_supervised_dataset(models,product_titles[5])

100%|███████████████████████████████████████████████████████████████████████| 300000/300000 [00:00<00:00, 1201614.64it/s]


REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:05<00:00, 56271.54it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 300000/300000 [00:22<00:00, 13179.96it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:12<00:00,  8.04it/s]


PREDICTION 1
PREDICTION 2
PREDICTION 3
PREDICTION 4
PREDICTION 5
PREDICTION 6
PREDICTION 7
PREDICTION 8
PREDICTION 9
PREDICTION 10
ENSEMBLING


100%|██████████████████████████████████████████████████████████████████████████| 299989/299989 [00:48<00:00, 6227.69it/s]


In [36]:
additional_dataset_7 = semi_supervised_dataset(models,product_titles[6])

100%|███████████████████████████████████████████████████████████████████████| 138532/138532 [00:00<00:00, 1222697.30it/s]


REMOVING PUNCTUATIONS


100%|█████████████████████████████████████████████████████████████████████████| 138532/138532 [00:02<00:00, 58543.84it/s]


CONVERTING SENTENCE TO VECTOR


100%|█████████████████████████████████████████████████████████████████████████| 138532/138532 [00:10<00:00, 13178.50it/s]


SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 18.30it/s]


PREDICTION 1
PREDICTION 2
PREDICTION 3
PREDICTION 4
PREDICTION 5
PREDICTION 6
PREDICTION 7
PREDICTION 8
PREDICTION 9
PREDICTION 10
ENSEMBLING


100%|██████████████████████████████████████████████████████████████████████████| 138529/138529 [00:21<00:00, 6335.89it/s]


In [58]:
additional_dataset=pd.concat([additional_dataset_1,additional_dataset_2,additional_dataset_3,additional_dataset_4,additional_dataset_5,additional_dataset_6,additional_dataset_7],axis=0,ignore_index=True)

In [59]:
additional_dataset.to_csv("data/additional_data.csv")

# ---------------------------------------------------------------------------------------------------------------

In [97]:
new_dataset=pd.concat([additional_data_set,large_embedded_data],axis=0,ignore_index=True)

In [98]:
new_dataset.shape

(433207, 101)

In [99]:
data=new_dataset.copy()
sampled_embedded_data=data.sample(n=len(data))

In [100]:
nn_X_train,nn_X_test,nn_y_train,nn_y_test=train_test_split(sampled_embedded_data.drop("Labels",axis=1),to_categorical(sampled_embedded_data["Labels"]),test_size=0.02)

In [108]:
my_callback = MyCallback(nn_X_test, nn_y_test)

In [112]:
model = Sequential()
model.add(Dense(2000, input_shape=(100,), activation='relu'))
model.add(Dense(1500, activation='relu'))
model.add(Dense(1000, activation='relu'))
model.add(Dense(20, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=["accuracy"])

In [113]:
history = model.fit(nn_X_train, nn_y_train, epochs=5, batch_size=100, shuffle=True,verbose=1)

Epoch 1/5


 20400/424542 [>.............................] - ETA: 2:07:19 - loss: 3.0351 - acc: 0.03 - ETA: 1:09:04 - loss: 2.8415 - acc: 0.11 - ETA: 50:08 - loss: 2.6761 - acc: 0.1767 - ETA: 40:15 - loss: 2.6169 - acc: 0.20 - ETA: 34:24 - loss: 2.5070 - acc: 0.25 - ETA: 30:24 - loss: 2.4223 - acc: 0.28 - ETA: 27:38 - loss: 2.3545 - acc: 0.31 - ETA: 25:33 - loss: 2.2549 - acc: 0.33 - ETA: 23:55 - loss: 2.2110 - acc: 0.34 - ETA: 22:45 - loss: 2.1750 - acc: 0.36 - ETA: 21:42 - loss: 2.0848 - acc: 0.38 - ETA: 20:53 - loss: 2.0599 - acc: 0.39 - ETA: 20:12 - loss: 2.0129 - acc: 0.40 - ETA: 19:32 - loss: 1.9778 - acc: 0.41 - ETA: 18:59 - loss: 1.9635 - acc: 0.42 - ETA: 18:33 - loss: 1.9370 - acc: 0.43 - ETA: 18:05 - loss: 1.9151 - acc: 0.43 - ETA: 17:40 - loss: 1.8926 - acc: 0.44 - ETA: 17:19 - loss: 1.8667 - acc: 0.45 - ETA: 16:59 - loss: 1.8439 - acc: 0.45 - ETA: 16:42 - loss: 1.8112 - acc: 0.46 - ETA: 16:32 - loss: 1.7874 - acc: 0.47 - ETA: 16:21 - loss: 1.7745 - acc: 0.47 - ETA: 16:12 - loss: 1.7574

 33700/424542 [=>............................] - ETA: 12:21 - loss: 1.2518 - acc: 0.63 - ETA: 12:20 - loss: 1.2506 - acc: 0.63 - ETA: 12:20 - loss: 1.2492 - acc: 0.63 - ETA: 12:20 - loss: 1.2492 - acc: 0.63 - ETA: 12:19 - loss: 1.2474 - acc: 0.63 - ETA: 12:18 - loss: 1.2463 - acc: 0.63 - ETA: 12:17 - loss: 1.2460 - acc: 0.63 - ETA: 12:16 - loss: 1.2453 - acc: 0.63 - ETA: 12:16 - loss: 1.2455 - acc: 0.63 - ETA: 12:15 - loss: 1.2443 - acc: 0.63 - ETA: 12:14 - loss: 1.2430 - acc: 0.63 - ETA: 12:14 - loss: 1.2422 - acc: 0.63 - ETA: 12:13 - loss: 1.2419 - acc: 0.63 - ETA: 12:13 - loss: 1.2412 - acc: 0.63 - ETA: 12:12 - loss: 1.2416 - acc: 0.63 - ETA: 12:11 - loss: 1.2411 - acc: 0.63 - ETA: 12:11 - loss: 1.2404 - acc: 0.63 - ETA: 12:10 - loss: 1.2411 - acc: 0.63 - ETA: 12:09 - loss: 1.2403 - acc: 0.63 - ETA: 12:09 - loss: 1.2404 - acc: 0.63 - ETA: 12:08 - loss: 1.2395 - acc: 0.63 - ETA: 12:07 - loss: 1.2399 - acc: 0.63 - ETA: 12:06 - loss: 1.2397 - acc: 0.63 - ETA: 12:06 - loss: 1.2389 - acc

KeyboardInterrupt: 