# Import Useful Modules 

In [2]:
import tensorflow
config = tensorflow.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5 #(misalnya kalo pengen 0.4 dari GPU memory)
session = tensorflow.Session(config=config)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText


# # Classifier
from xgboost import XGBClassifier as XGB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical
from keras.optimizers import SGD

import tqdm

In [4]:
#Determine Model's File Location

version = "version_x"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [5]:
from preprocessing_pipeline import preprocessing

In [6]:
with open("model/{}/word_embedder.pickle".format(version), "rb") as file:
    word_embedder = pickle.load(file)

## Check

#### Preprocessing

In [7]:
preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
preprocessor

<preprocessing_pipeline.preprocessing at 0x7f4156b8a748>

#### Word Embedding

In [8]:
word_embedder

<gensim.models.fasttext.FastText at 0x7f4156bc67f0>

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

In [9]:
large_data_for_classification=pd.read_csv("data/query.csv",header=None)
large_data_for_classification.dropna(axis=0,inplace=True)

In [10]:
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hunting,gear+camping,327
1,koleksi,Lightstick+EXO,63
2,kartu-perdana,vivo+v7,183
3,aksesoris-mobil,subwoofer+kolong+aktif,389
4,anak-perempuan,Hotpants+anak,1366


### Preprocess Data

In [10]:
raw_category_mapper=pd.read_csv("raw_leaf.csv",index_col=0)

category_mapper={}
for i in raw_category_mapper.index:
    category_mapper[raw_category_mapper["l2"][i]]=raw_category_mapper["l1"][i]

In [11]:
new_category=[category_mapper[value] for value in large_data_for_classification[0]]
large_data_for_classification[0]=new_category

In [15]:
product_title=[preprocessor.remove_parentheses(value) for value in tqdm.tqdm(large_data_for_classification[1])]

100%|██████████| 2440682/2440682 [06:09<00:00, 6605.80it/s]


In [26]:
def word_count(sentences):
    counts = dict()
    print("1/1")
    for sentence in sentences:
        for word in sentence:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
    return counts

In [32]:
def getFilteredData(product_title,frequency, N_words, word_length):
    print("1/3")
    result=word_count(product_title)
    print("2/3")
    new_product_title=[]
    for sentence in tqdm.tqdm(product_title):
        new_product_title.append([word for word in sentence if result[word]>=frequency and len(word)>=word_length])
    print("3/3")
    new_product_title=[title for title in tqdm.tqdm(new_product_title) if len(title)>=N_words]
    
    return new_product_title

In [None]:
def getTfIdf(new_product_title):
    print("1/3")
    concatenated_product_title=[]
    for sentence in tqdm.tqdm(new_product_title):
        concatenated_product_title.append(" ".join(sentence))
    print("2/3")
    cv=CountVectorizer()
    result=cv.fit_transform(concatenated_product_title).toarray()
    print("3/3")
    tftransformer = TfidfTransformer(smooth_idf=False)
    final_result=tftransformer.fit_transform(result).toarray()
    
    return final_result,cv,tftransformer

In [29]:
new_data=getFilteredData(product_title,50,2,2)

100%|██████████| 2440682/2440682 [01:10<00:00, 34736.18it/s]
100%|██████████| 2440682/2440682 [00:13<00:00, 186470.39it/s]


In [31]:
final_data,count_vectorizer,tf_idf=getTfIdf(new_data)

100%|██████████| 1757951/1757951 [00:13<00:00, 129835.47it/s]


KeyboardInterrupt: 

# ---------------------------------------------------------------------------------------------------------------

# Neural Network

In [None]:
data=large_embedded_data
sampled_embedded_data=data.sample(n=len(data))

In [None]:
nn_X_train,nn_X_test,nn_y_train,nn_y_test=train_test_split(sampled_embedded_data.drop("Labels",axis=1),to_categorical(sampled_embedded_data["Labels"]),test_size=0.02)

In [None]:
del large_embedded_data
del large_data_for_classification
del word_embedder
del preprocessor

In [None]:
def accuracy(predicted,truth):
    result=[int(value) for value in np.array(predicted)==np.array(truth)]
    return sum(result)/len(result)

In [None]:
model = Sequential()
model.add(Dense(4000, input_shape=(500,), activation='relu'))
model.add(Dense(3000, activation='relu'))
model.add(Dense(2000, activation='relu'))
model.add(Dense(20, activation='softmax'))


model.compile(optimizer="Adagrad", loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
history = model.fit(nn_X_train, nn_y_train, epochs=10, batch_size=100, validation_data=(nn_X_test,nn_y_test), shuffle=True)

In [None]:
history = model.fit(nn_X_train, nn_y_train,initial_epoch=10,epochs=20, batch_size=100, validation_data=(nn_X_test,nn_y_test), shuffle=True)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.show()

In [None]:
model.save("nn_experiment_90.h5")

# ---------------------------------------------------------------------------------------------------------------

# Splitting Data

In [18]:
data=large_embedded_data
sampled_embedded_data=data.sample(n=len(data))

In [19]:
X_train,X_test,y_train,y_test=train_test_split(sampled_embedded_data.drop("Labels",axis=1), sampled_embedded_data["Labels"],test_size=0.2)

# ---------------------------------------------------------------------------------------------------------------

# Logistic Regression

Optimized

In [None]:
lr_parameters = {'C':[1,10,100],'class_weight':["balanced",None],'solver':["newton-cg","saga","lbfgs"]}

classifier_lr=LR(n_jobs=-1)

clf_lr = GS(classifier_lr, lr_parameters,verbose=4,cv=5)

clf_lr.fit(X_train,y_train)

In [None]:
pickle.dump(clf_lr, open("lr_experiment.pickle", 'wb'))

No Optimization

In [28]:
print("Start Time : {}\n".format(str(datetime.datetime.now())))
baseline_lr=LR(n_jobs=-1,solver="saga")
baseline_lr.fit(X_train,y_train)
print("Finish Time : {}\n".format(str(datetime.datetime.now())))

Start Time : 2018-07-19 07:29:58.367029

max_iter reached after 98 seconds




max_iter reached after 98 seconds
max_iter reached after 98 seconds
max_iter reached after 98 seconds
max_iter reached after 99 seconds
max_iter reached after 99 seconds


[Parallel(n_jobs=-1)]: Done   5 out of  20 | elapsed:  1.7min remaining:  5.0min


max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds


[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  1.7min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:  1.7min remaining:   17.6s


max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
max_iter reached after 100 seconds
Finish Time : 2018-07-19 07:31:38.464525



[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.7min finished


# ---------------------------------------------------------------------------------------------------------------

# Support Vector Machine

Optimized

In [None]:
svm_parameters = {'gamma':[0.1,1,10,100],'C':[1,10,100]}

classifier_svm=SVC()

clf_svm = GS(classifier_svm, svm_parameters,verbose=4,cv=5)

clf_svm.fit(X_train,y_train)

In [None]:
pickle.dump(clf_svm, open("svm_experiment.pickle", 'wb'))


No Optimization

In [None]:
print("Start Time : {}\n".format(str(datetime.datetime.now())))
baseline_svm = SVC(verbose=4)
baseline_svm.fit(X_train,y_train)
svm_pred=baseline_svm.predict(X_test)
svm_truth=y_test
print("Finish Time : {}\n".format(str(datetime.datetime.now())))
print(classification_report(svm_pred,svm_truth))

Start Time : 2018-07-19 07:41:39.726667

[LibSVM]

In [None]:
with open('svm_83','wb') as f:
    pickle.dump(baseline_svm,f)

1


In [None]:
print(1)

In [None]:
baseline_svm.score(X_train,y_train)

In [24]:
print(classification_report(svm_pred,svm_truth))

             precision    recall  f1-score   support

          0       0.67      0.70      0.68      3729
          1       0.64      0.80      0.71       653
          2       0.87      0.79      0.83     10278
          3       0.75      0.85      0.80      4162
          4       0.61      0.80      0.69       617
          5       0.94      0.93      0.94     10595
          6       0.87      0.74      0.80     10952
          7       0.47      0.59      0.52      1119
          8       0.88      0.90      0.89      2344
          9       0.64      0.76      0.69      1026
         10       0.88      0.90      0.89      5360
         11       0.81      0.87      0.84      3237
         12       0.93      0.93      0.93      8801
         13       0.79      0.84      0.81      5023
         14       0.76      0.83      0.79      1209
         15       0.65      0.86      0.74       587
         16       0.56      0.77      0.65       455
         17       0.78      0.74      0.76   

# ---------------------------------------------------------------------------------------------------------------

# Gradient Boosting

Optimized

In [None]:
gbc_parameters = {'max_depth':[25,50,100,200],"min_samples_split":[2,5,10],"min_samples_leaf":[1,4,8],"learning_rate":[0.01,0.05,0.1,0.2],"max_depth":[3,6,12]}

classifier_gbc=GBC()

clf_gbc = GS(classifier_gbc, gbc_parameters,verbose=4,cv=5)

clf_gbc.fit(X_train,y_train)

In [None]:
pickle.dump(clf_gbc, open("gbc_experiment.pickle", 'wb'))

No Optimization

In [None]:
baseline_gbc=GBC(verbose=4)
baseline_gbc.fit(X_train,y_train)
baseline_gbc.score(X_test,y_test)

# ---------------------------------------------------------------------------------------------------------------

# XGBoost

In [33]:
print("Start Time : {}\n".format(str(datetime.datetime.now())))
baseline_xgb=XGB(n_jobs=-1,silent=True)
baseline_xgb.fit(X_train,y_train)
base_truth_xgb=y_test
pred_xgb=baseline_xgb.predict(X_test)
print("Finish Time : {}\n".format(str(datetime.datetime.now())))

Start Time : 2018-07-18 10:52:27.461172

Finish Time : 2018-07-18 11:06:38.915193



In [35]:
print(classification_report(base_truth_xgb,pred_xgb))

             precision    recall  f1-score   support

          0       0.57      0.51      0.54      3884
          1       0.77      0.46      0.57       862
          2       0.67      0.82      0.74      9564
          3       0.77      0.58      0.66      4850
          4       0.71      0.34      0.46       748
          5       0.84      0.90      0.87     10524
          6       0.58      0.75      0.66      9230
          7       0.51      0.25      0.34      1423
          8       0.84      0.74      0.79      2475
          9       0.68      0.36      0.47      1201
         10       0.79      0.77      0.78      5408
         11       0.77      0.63      0.69      3388
         12       0.82      0.87      0.85      8504
         13       0.76      0.60      0.67      5555
         14       0.74      0.60      0.66      1388
         15       0.84      0.47      0.60       749
         16       0.70      0.20      0.31       636
         17       0.58      0.68      0.62   

# ---------------------------------------------------------------------------------------------------------------

In [7]:
large_data_for_classification=pd.read_csv("data/big.csv",header=None)
large_data_for_classification.dropna(axis=0,inplace=True)

In [8]:
#preprocess product title to 100-dimensional vector
#and preprocess category name to integer label
large_embedded_data, large_label_encoder = preprocessor.preprocess_data(
    large_data_for_classification[1],
    large_data_for_classification[0],
)

  1%|          | 4785/396099 [00:00<00:16, 23916.95it/s]

REMOVING PUNCTUATIONS


100%|██████████| 396099/396099 [00:03<00:00, 106769.54it/s]
  1%|          | 3285/396099 [00:00<00:23, 16418.64it/s]

CONVERTING SENTENCE TO VECTOR


100%|██████████| 396099/396099 [00:22<00:00, 17495.56it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

SAVE VECTOR TO PANDAS DATAFRAME


100%|██████████| 100/100 [00:21<00:00,  4.61it/s]


In [9]:
large_embedded_data["sum"]=large_embedded_data.drop(["Labels"],axis=1).sum(axis=1)

In [10]:
large_embedded_data=large_embedded_data.loc[large_embedded_data["sum"]!=0].drop("sum",axis=1)

In [11]:
data=large_embedded_data
sampled_embedded_data=data.sample(n=len(data))

In [12]:
X_train,X_test,y_train,y_test=train_test_split(sampled_embedded_data.drop("Labels",axis=1), sampled_embedded_data["Labels"],test_size=0.2)

In [15]:
print(classification_report(base_truth_xgb_big,pred_xgb))

             precision    recall  f1-score   support

          0       0.72      0.53      0.61       191
          1       0.64      0.54      0.59      2294
          2       0.72      0.57      0.64      3385
          3       0.70      0.09      0.15       242
          4       0.00      0.00      0.00         3
          5       0.59      0.61      0.60      2032
          6       0.61      0.51      0.56      2806
          7       0.56      0.22      0.31       291
          8       0.49      0.30      0.37       213
          9       0.64      0.37      0.47       355
         10       0.52      0.23      0.32       203
         11       0.63      0.56      0.59       290
         12       0.40      0.18      0.25       237
         13       0.69      0.51      0.59       363
         14       0.86      0.80      0.83       274
         15       0.74      0.60      0.66       176
         16       0.68      0.52      0.59       384
         17       0.78      0.64      0.70   

  'precision', 'predicted', average, warn_for)


In [None]:
BARU vv

In [20]:
print("Start Time : {}\n".format(str(datetime.datetime.now())))
baseline_xgb_big=XGB(n_jobs=-1,silent=True,objective="multi:softmax")
baseline_xgb_big.fit(X_train,y_train)
base_truth_xgb_big=y_test
pred_xgb_big=baseline_xgb_big.predict(X_test)
print("Finish Time : {}\n".format(str(datetime.datetime.now())))

Start Time : 2018-07-20 10:12:36.294887

Finish Time : 2018-07-20 10:37:36.131524



  if diff:


In [None]:
print("Start Time : {}\n".format(str(datetime.datetime.now())))
baseline_xgb_big_2=XGB(n_jobs=-1,silent=True,max_depth=6,objective="multi:softmax")
baseline_xgb_big_2.fit(X_train,y_train)
base_truth_xgb_big_2=y_test
pred_xgb_big_2=baseline_xgb_big_2.predict(X_test)
print("Finish Time : {}\n".format(str(datetime.datetime.now())))

Start Time : 2018-07-20 10:52:19.287046



In [None]:
print("Start Time : {}\n".format(str(datetime.datetime.now())))
baseline_lr=LR(n_jobs=-1,solver="saga")
baseline_lr.fit(X_train,y_train)
print("Finish Time : {}\n".format(str(datetime.datetime.now())))

In [22]:
print(2)

2


In [24]:
print(classification_report(base_truth_xgb_big_2,pred_xgb_big_2))

             precision    recall  f1-score   support

          0       0.67      0.63      0.65      3821
          1       0.79      0.63      0.70       823
          2       0.76      0.84      0.80      9548
          3       0.82      0.71      0.76      4739
          4       0.79      0.64      0.71       767
          5       0.91      0.93      0.92     10552
          6       0.72      0.82      0.77      9319
          7       0.61      0.40      0.49      1491
          8       0.88      0.85      0.86      2323
          9       0.76      0.59      0.66      1264
         10       0.86      0.86      0.86      5420
         11       0.85      0.77      0.81      3464
         12       0.89      0.91      0.90      8696
         13       0.81      0.74      0.77      5409
         14       0.82      0.75      0.79      1396
         15       0.87      0.64      0.74       777
         16       0.71      0.47      0.57       646
         17       0.68      0.76      0.72   