# Import Useful Modules 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS

# import keras
# import tensorflow
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.utils import to_categorical

from preprocessing_pipeline import preprocessing

import tqdm

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Determine the model's save location: `version` names the sub-directory
# that is created under "model/" a few lines below.

version="Fixed"

In [5]:
# Create the output directory for this model version.
# exist_ok=True keeps the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs("model/{}".format(version), exist_ok=True)

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

In [6]:
def read_product_title_data(path='data/query.csv'):
    """Load the product titles used to train the word embedder.

    The CSV is read without a header row; its second column (position 1)
    is exposed as the single column "Product Title" and rows where that
    value is missing are dropped. The original row index is preserved.

    Parameters
    ----------
    path : str, optional
        Location of the header-less CSV file. Defaults to 'data/query.csv'.

    Returns
    -------
    pandas.DataFrame
        A one-column frame named "Product Title" with no missing values.
    """
    raw_queries = pd.read_csv(path, header=None)
    # Build a new frame instead of calling dropna(inplace=True) on a column
    # slice, which can trigger pandas' SettingWithCopyWarning.
    return (
        raw_queries[[1]]
        .rename(columns={1: "Product Title"})
        .dropna(axis=0)
    )

In [7]:
# Load the product-title frame that feeds tokenisation and embedding training.
data_for_embedding=read_product_title_data()

In [8]:
# Preview the first rows to confirm the "Product Title" column loaded as expected.
data_for_embedding.head()

Unnamed: 0,Product Title
0,gear+camping
1,Lightstick+EXO
2,vivo+v7
3,subwoofer+kolong+aktif
4,Hotpants+anak


# ---------------------------------------------------------------------------------------------------------------

# Construct Word Embedder (Using fasttext)

In [9]:
# Instantiate the project's preprocessing helper; it is used below for
# tokenize() and word_count().
# NOTE(review): the meaning of the two None arguments is not visible here —
# confirm against preprocessing_pipeline.preprocessing.
preprocessor=preprocessing(None,None)

In [11]:
# Tokenize every product title; tqdm wraps the column to report progress.
product_title=list(map(preprocessor.tokenize, tqdm.tqdm(data_for_embedding["Product Title"])))


  0%|                                                                                        | 0/2440682 [00:00<?, ?it/s]
  0%|                                                                          | 1547/2440682 [00:00<02:39, 15276.07it/s]
  0%|▎                                                                         | 8928/2440682 [00:00<00:54, 44521.26it/s]
  1%|▌                                                                        | 18127/2440682 [00:00<00:40, 59863.43it/s]
  1%|▊                                                                        | 26823/2440682 [00:00<00:36, 66879.38it/s]
  1%|█                                                                        | 34979/2440682 [00:00<00:34, 69632.47it/s]
  2%|█▏                                                                       | 41632/2440682 [00:00<00:34, 69202.22it/s]
  2%|█▍                                                                       | 48336/2440682 [00:00<00:34, 68769.44it/s]
  2%|█▋                

 18%|████████████▊                                                           | 433054/2440682 [00:07<00:34, 58570.17it/s]
 18%|████████████▉                                                           | 439962/2440682 [00:07<00:34, 58676.92it/s]
 18%|█████████████▏                                                          | 446805/2440682 [00:07<00:33, 58810.97it/s]
 19%|█████████████▍                                                          | 453643/2440682 [00:07<00:33, 58933.25it/s]
 19%|█████████████▌                                                          | 460475/2440682 [00:07<00:34, 57752.09it/s]
 19%|█████████████▊                                                          | 467310/2440682 [00:08<00:34, 57881.43it/s]
 19%|█████████████▉                                                          | 473994/2440682 [00:08<00:33, 57989.16it/s]
 20%|██████████████▏                                                         | 480413/2440682 [00:08<00:33, 58062.22it/s]
 20%|██████████████▍    

 37%|██████████████████████████▌                                             | 898913/2440682 [00:14<00:25, 61020.30it/s]
 37%|██████████████████████████▋                                             | 905646/2440682 [00:14<00:25, 61061.73it/s]
 37%|██████████████████████████▉                                             | 912468/2440682 [00:14<00:25, 61108.58it/s]
 38%|███████████████████████████                                             | 919385/2440682 [00:15<00:24, 61161.13it/s]
 38%|███████████████████████████▎                                            | 926143/2440682 [00:15<00:24, 61202.47it/s]
 38%|███████████████████████████▌                                            | 933109/2440682 [00:15<00:24, 61256.92it/s]
 39%|███████████████████████████▋                                            | 939922/2440682 [00:15<00:24, 61300.69it/s]
 39%|███████████████████████████▉                                            | 946936/2440682 [00:15<00:24, 61358.05it/s]
 39%|███████████████████

 56%|███████████████████████████████████████▌                               | 1361406/2440682 [00:21<00:17, 62382.81it/s]
 56%|███████████████████████████████████████▊                               | 1368292/2440682 [00:21<00:17, 62405.89it/s]
 56%|████████████████████████████████████████                               | 1375132/2440682 [00:22<00:17, 62426.66it/s]
 57%|████████████████████████████████████████▏                              | 1381927/2440682 [00:22<00:16, 62445.22it/s]
 57%|████████████████████████████████████████▍                              | 1388677/2440682 [00:22<00:16, 62455.94it/s]
 57%|████████████████████████████████████████▌                              | 1395607/2440682 [00:22<00:16, 62485.84it/s]
 57%|████████████████████████████████████████▊                              | 1402571/2440682 [00:22<00:16, 62516.98it/s]
 58%|█████████████████████████████████████████                              | 1409433/2440682 [00:22<00:16, 62543.32it/s]
 58%|███████████████████

 75%|█████████████████████████████████████████████████████▎                 | 1831950/2440682 [00:28<00:09, 63305.86it/s]
 75%|█████████████████████████████████████████████████████▍                 | 1838927/2440682 [00:29<00:09, 63327.54it/s]
 76%|█████████████████████████████████████████████████████▋                 | 1845932/2440682 [00:29<00:09, 63350.03it/s]
 76%|█████████████████████████████████████████████████████▉                 | 1852908/2440682 [00:29<00:09, 63364.86it/s]
 76%|██████████████████████████████████████████████████████                 | 1859818/2440682 [00:29<00:09, 63381.59it/s]
 76%|██████████████████████████████████████████████████████▎                | 1866826/2440682 [00:29<00:09, 63405.99it/s]
 77%|██████████████████████████████████████████████████████▌                | 1873964/2440682 [00:29<00:08, 63432.41it/s]
 77%|██████████████████████████████████████████████████████▋                | 1880951/2440682 [00:29<00:08, 63447.12it/s]
 77%|███████████████████

 94%|███████████████████████████████████████████████████████████████████    | 2304033/2440682 [00:36<00:02, 63726.05it/s]
 95%|███████████████████████████████████████████████████████████████████▏   | 2310891/2440682 [00:36<00:02, 63717.82it/s]
 95%|███████████████████████████████████████████████████████████████████▍   | 2317507/2440682 [00:36<00:01, 63706.51it/s]
 95%|███████████████████████████████████████████████████████████████████▌   | 2324256/2440682 [00:36<00:01, 63716.65it/s]
 96%|███████████████████████████████████████████████████████████████████▊   | 2330854/2440682 [00:36<00:01, 63722.38it/s]
 96%|████████████████████████████████████████████████████████████████████   | 2337691/2440682 [00:36<00:01, 63734.59it/s]
 96%|████████████████████████████████████████████████████████████████████▏  | 2344670/2440682 [00:36<00:01, 63750.59it/s]
 96%|████████████████████████████████████████████████████████████████████▍  | 2352269/2440682 [00:36<00:01, 63782.72it/s]
 97%|███████████████████

In [16]:
# Word statistics for the tokenised corpus. Computed here so that this cell
# does not depend on `result` being defined by a later cell (which fails on
# a fresh kernel with Restart & Run All).
# `result` is indexed by token and compared against a threshold below —
# presumably a per-token occurrence count; confirm in preprocessing_pipeline.
result=preprocessor.word_count(product_title)

# Filtering thresholds (previously inline magic numbers).
MIN_WORD_COUNT=50      # keep tokens whose `result` value is at least this
MIN_WORD_LENGTH=3      # keep tokens with at least this many characters
MIN_TITLE_TOKENS=2     # keep titles that still have at least this many tokens

new_product_title=[]
for sentence in tqdm.tqdm(product_title):
    kept_words=[
        word for word in sentence
        if result[word]>=MIN_WORD_COUNT and len(word)>=MIN_WORD_LENGTH
    ]
    # Drop titles that became too short after filtering.
    if len(kept_words)>=MIN_TITLE_TOKENS:
        new_product_title.append(kept_words)


  0%|                                                                                        | 0/2440682 [00:00<?, ?it/s]
  2%|█▎                                                                      | 44800/2440682 [00:00<00:05, 442380.88it/s]
  4%|██▉                                                                    | 100509/2440682 [00:00<00:04, 501213.67it/s]
  6%|████▌                                                                  | 156680/2440682 [00:00<00:04, 520879.16it/s]
  8%|█████▌                                                                 | 191723/2440682 [00:00<00:08, 271996.67it/s]
 10%|███████▎                                                               | 249614/2440682 [00:00<00:07, 310412.01it/s]
 13%|█████████                                                              | 310195/2440682 [00:00<00:06, 342983.08it/s]
 15%|██████████▋                                                            | 368927/2440682 [00:01<00:05, 367211.02it/s]
 17%|████████████▎     

In [17]:
# Word statistics before (`result`) and after (`result2`) the frequency/length
# filter. `result` is also consumed by the frequency filter in the preceding cell.
result=preprocessor.word_count(product_title)
result2=preprocessor.word_count(new_product_title)

1/1
1/1


In [20]:
# Size of the filtered word_count result relative to the unfiltered one
# (presumably the share of the vocabulary that survives — confirm what
# word_count returns).
len(result2)/len(result)

0.12089641095424392

In [22]:
# Fraction of tokenised titles that remain after filtering.
len(new_product_title)/len(product_title)

0.7202703998308669

#### Train Embedding Model

In [38]:
# Hyper-parameters handed to the FastText constructor in the training cell below.
EMBEDDING_DIMENSION=100  # passed as `size`
EMBEDDING_EPOCH=100  # passed as `iter`
EMBEDDING_WINDOW=7  # passed as `window`
MIN_COUNT=10  # passed as `min_count`
SEED=2918342  # passed as `seed`

In [39]:
# Build the word-embedding model.
# Only the first EMBEDDING_TRAIN_SIZE filtered titles are used for training;
# set it to None to train on the whole filtered corpus.
EMBEDDING_TRAIN_SIZE=100000

# NOTE(review): gensim documents that a fixed `seed` is only fully
# reproducible with a single worker thread; with workers=16 the vectors may
# differ between runs — confirm whether exact reproducibility is required.
print("Start Time : {}\n".format(str(datetime.datetime.now())))
word_embedder = FastText(new_product_title[:EMBEDDING_TRAIN_SIZE], size=EMBEDDING_DIMENSION, window=EMBEDDING_WINDOW, min_count=MIN_COUNT, workers=16, sg=1, seed=SEED, min_n=5, iter=EMBEDDING_EPOCH)
print("End Time : {}".format(str(datetime.datetime.now())))

Start Time : 2018-08-08 17:50:29.146899

End Time : 2018-08-08 17:52:32.605620


#### Save Model

In [32]:
# #save model
# pickle.dump(word_embedder, open("model/{}/word_embedder_{}.pickle".format(version,EMBEDDING_DIMENSION), 'wb'))

# ---------------------------------------------------------------------------------------------------------------

# Word Embedding Evaluation

#### Generic Noun ("mobil" = "car")

In [40]:
# Sanity check: words the model ranks as most similar to "mobil" (Indonesian for "car").
word_embedder.wv.most_similar("mobil")

[('mobilan', 0.6665498614311218),
 ('brooks', 0.6144686937332153),
 ('mobilio', 0.5463486909866333),
 ('serep', 0.5144010186195374),
 ('maxx', 0.5107284784317017),
 ('freed', 0.48475661873817444),
 ('plafon', 0.4757026135921478),
 ('rubicon', 0.4727756083011627),
 ('organizer', 0.46525830030441284),
 ('tukang', 0.4617008566856384)]

#### Computer Brand

In [41]:
# Sanity check: words the model ranks as most similar to "intel".
word_embedder.wv.most_similar("intel")

[('quad', 0.7961403131484985),
 ('procesor', 0.7942219972610474),
 ('amd', 0.7509782314300537),
 ('built', 0.7214125394821167),
 ('prosesor', 0.7135754227638245),
 ('processor', 0.7010242938995361),
 ('ryzen', 0.684984564781189),
 ('intex', 0.6610298156738281),
 ('vaio', 0.6570157408714294),
 ('lga', 0.6483675837516785)]

In [42]:
# Sanity check: words the model ranks as most similar to "acer".
word_embedder.wv.most_similar("acer")

[('aspire', 0.6963398456573486),
 ('netbook', 0.6534676551818848),
 ('inspiron', 0.6520154476165771),
 ('toshiba', 0.6191145777702332),
 ('himax', 0.616962730884552),
 ('neca', 0.6091619729995728),
 ('ideapad', 0.596017599105835),
 ('ryzen', 0.5898139476776123),
 ('axioo', 0.5845321416854858),
 ('nitro', 0.5814634561538696)]

#### Automotive Brand

In [43]:
# Sanity check: words the model ranks as most similar to "honda".
word_embedder.wv.most_similar("honda")

[('jerigen', 0.5491578578948975),
 ('primus', 0.5002319812774658),
 ('juken', 0.4936077296733856),
 ('ferio', 0.4866533875465393),
 ('fiesta', 0.4853607714176178),
 ('camry', 0.48482197523117065),
 ('tengkorak', 0.4839434027671814),
 ('vega', 0.4784013032913208),
 ('repairkit', 0.4767449200153351),
 ('shad', 0.4748554825782776)]

In [44]:
# Sanity check: words the model ranks as most similar to "yamaha".
word_embedder.wv.most_similar("yamaha")

[('midi', 0.5278173685073853),
 ('arai', 0.5064706802368164),
 ('parker', 0.49939867854118347),
 ('xabre', 0.49812889099121094),
 ('kumpulan', 0.4963878393173218),
 ('lagu', 0.4847766160964966),
 ('esp', 0.4830368459224701),
 ('psr', 0.4811265468597412),
 ('maru', 0.47903478145599365),
 ('cld', 0.47867780923843384)]

In [45]:
# Sanity check: words the model ranks as most similar to "samsung".
# NOTE(review): this probe sits under the "Automotive Brand" heading, but
# "samsung" looks like an electronics brand — consider giving it its own heading.
word_embedder.wv.most_similar("samsung")

[('galaxy', 0.6051572561264038),
 ('mito', 0.552219033241272),
 ('gerigi', 0.5494296550750732),
 ('note', 0.5469122529029846),
 ('lenovo', 0.5417517423629761),
 ('cumi', 0.5330456495285034),
 ('meizu', 0.5315901041030884),
 ('stylus', 0.5241528749465942),
 ('sony', 0.5186532735824585),
 ('himax', 0.5179975628852844)]

# ---------------------------------------------------------------------------------------------------------------