# Import Useful Modules 

In [1]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize



In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText, word2vec, doc2vec

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

import string

from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from preprocessing_pipeline import preprocessing

In [4]:
preprocessor=preprocessing(None,None)
preprocessor

<preprocessing_pipeline.preprocessing at 0xc1b9e100f0>

In [5]:
large_data_for_classification=pd.read_csv("data/big.csv",header=None)
large_data_for_classification.dropna(axis=0,inplace=True)

In [7]:
raw_category_mapper=pd.read_csv("category_mapping.csv",index_col=0)

category_mapper={}
for i in raw_category_mapper.index:
    category_mapper[raw_category_mapper["l2"][i]]=raw_category_mapper["l1"][i]

In [8]:
new_category=[category_mapper[value] for value in large_data_for_classification[0]]
large_data_for_classification[0]=new_category

In [9]:
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,komputer,KINGSTON+KVR1333D3N9,1510.0
1,hobi_dan_koleksi,power+amplifier+wisdom+,62.0
2,motor,jas%20hujan%20anak,391.0
3,fashion_wanita,Celana+bahan+formal,288.0
4,komputer,Preset+lightroom,1.0


In [112]:
tagged_data = [TaggedDocument(words=preprocessor.remove_parentheses(large_data_for_classification[1][index]), tags=[large_data_for_classification[0][index]]) for index in large_data_for_classification.index]

In [113]:
tagged_data[:5]

[TaggedDocument(words=['kingston', 'kvrdn'], tags=['komputer']),
 TaggedDocument(words=['power', 'amplifier', 'wisdom'], tags=['hobi_dan_koleksi']),
 TaggedDocument(words=['jas', 'hujan', 'anak'], tags=['motor']),
 TaggedDocument(words=['celana', 'bahan', 'formal'], tags=['fashion_wanita']),
 TaggedDocument(words=['preset', 'lightroom'], tags=['komputer'])]

In [None]:
max_epochs = 15
vec_size = 50

embedder = Doc2Vec(vector_size=vec_size,
                min_count=10)
  
embedder.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    embedder.train(tagged_data,
                total_examples=embedder.corpus_count,
                epochs=embedder.epochs)



iteration 0




iteration 1




iteration 2




iteration 3




iteration 4




iteration 5




iteration 6


In [70]:
def preprocess_data(features,labels,dimension,encoder=None):
        
        embedded_data=pd.DataFrame()

        if(encoder==None):
            label_encoder=LabelEncoder()
            embedded_data["Labels"]=label_encoder.fit_transform(labels)
        else:
            label_encoder=encoder
            embedded_data["Labels"]=label_encoder.transform(labels)
            
        embedded_data["Features"]=[preprocessor.remove_parentheses(title) for title in features]
        embedded_data["Features Vector"]=[embedder.infer_vector(title) for title in embedded_data["Features"]]
    
        for i in range(dimension):
            embedded_data[i]=[value[i] for value in embedded_data["Features Vector"]]
    
        embedded_data = embedded_data[[*range(dimension),"Labels"]]
        
        if(encoder==None):
            return embedded_data, label_encoder
        else:
            return embedded_data

In [71]:
embedded_data,label_encoder=preprocess_data(large_data_for_classification[1],large_data_for_classification[0],200)

In [72]:
def accuracy(predicted,truth):
    result=[int(value) for value in np.array(predicted)==np.array(truth)]
    return sum(result)/len(result)

In [105]:
data=embedded_data.copy()
sampled_embedded_data=data.sample(n=len(data)//10)

In [106]:
nn_X_train,nn_X_test,nn_y_train,nn_y_test=train_test_split(sampled_embedded_data.drop("Labels",axis=1),to_categorical(sampled_embedded_data["Labels"]),test_size=0.2)

In [107]:
classifier = Sequential()
classifier.add(Dense(1500, input_shape=(200,), activation='relu'))
classifier.add(Dropout(0.1))
classifier.add(Dense(1000, activation='relu'))
classifier.add(Dropout(0.1))
classifier.add(Dense(20, activation='softmax'))

classifier.compile(optimizer='adam', loss='categorical_crossentropy')

In [108]:
for i in range(5):
    history = classifier.fit(nn_X_train, nn_y_train, epochs=1, batch_size=32, validation_data=(nn_X_test,nn_y_test), shuffle=True,verbose=1)
    nn_y_truth=[np.argmax(value) for value in nn_y_test]
    nn_y_pred=[np.argmax(value) for value in classifier.predict(nn_X_test)]
    print("Validation Accuracy : {}".format(accuracy(nn_y_pred,nn_y_truth)))

Train on 31687 samples, validate on 7922 samples
Epoch 1/1

KeyboardInterrupt: 

In [None]:
nn_y_truth=[np.argmax(value) for value in nn_y_test]
nn_y_pred=[np.argmax(value) for value in classifier.predict(nn_X_test)]
print("Validation Accuracy : {}".format(accuracy(nn_y_pred,nn_y_truth)))

In [102]:
sentences={}
sentences[0]="gitar listrik"
sentences[1]="gitar akustik"
sentences[2]="mobil balap"


sentence_vectors={}
for i in range(len(sentences)):
    sentence_vectors[i]=[embedder.infer_vector(preprocessor.remove_parentheses(sentences[i]))]

In [103]:
cosine_similarity(sentence_vectors[0],sentence_vectors[1])

array([[0.05705025]], dtype=float32)

In [110]:
dir(embedder)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adapt_by_suffix',
 '_check_training_sanity',
 '_clear_post_train',
 '_do_train_job',
 '_get_job_params',
 '_get_thread_working_mem',
 '_job_producer',
 '_load_specials',
 '_log_epoch_end',
 '_log_epoch_progress',
 '_log_progress',
 '_log_train_end',
 '_raw_word_count',
 '_save_specials',
 '_set_train_params',
 '_smart_save',
 '_train_epoch',
 '_update_job_params',
 '_worker_loop',
 'alpha',
 'batch_words',
 'build_vocab',
 'build_vocab_from_freq',
 'callbacks',
 'cbow_mean',
 'clear_sims',
 'comment',
 'compute_loss',
 'corpus_count',
 'cum_table',
 'dbow',
 'dbow_words',
 'delete_temporary_training_data