In [1]:
import tensorflow
# Create a TensorFlow session whose GPU options set the per-process memory fraction.
config = tensorflow.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5 # fraction of GPU memory to use (e.g. set 0.4 for 40%)
session = tensorflow.Session(config=config)

# Import Useful Modules 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from time import gmtime, strftime
import time
import datetime
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Embedder
from gensim.models import FastText

# Classifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.grid_search import GridSearchCV as GS
from sklearn.model_selection import validation_curve, learning_curve
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures as Poly

from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Dropout, LSTM, Embedding, Input
from keras.utils import to_categorical
import tqdm

Using TensorFlow backend.


In [3]:
# Model version; it is substituted into the "model/<version>/..." path used when loading artifacts.

version = "version_4"

# ---------------------------------------------------------------------------------------------------------------

# Import Model

In [4]:
from preprocessing_pipeline import preprocessing

In [5]:
# Load the pickled word embedder that belongs to the selected model version.
embedder_path = "model/{}/word_embedder.pickle".format(version)
with open(embedder_path, "rb") as embedder_file:
    word_embedder = pickle.load(embedder_file)

## Check

#### Preprocessing

In [6]:
# Build the preprocessing object from the embedder's vector size and the embedder itself,
# then display it to confirm construction succeeded.
preprocessor=preprocessing(word_embedder.vector_size,word_embedder)
preprocessor

<preprocessing_pipeline.preprocessing at 0x7f6784adfd30>

#### Word Embedding

In [7]:
# Display the loaded embedder object as a quick sanity check.
word_embedder

<gensim.models.fasttext.FastText at 0x7f662db49cc0>

# ---------------------------------------------------------------------------------------------------------------

# Read Data To Pandas Dataframe

In [8]:
# Read the header-less query CSV and discard every row that contains a missing value.
large_data_for_classification = pd.read_csv("data/query.csv", header=None).dropna(axis=0)

In [9]:
# Preview the first rows to check how the columns were parsed.
large_data_for_classification.head()

Unnamed: 0,0,1,2
0,hunting,gear+camping,327
1,koleksi,Lightstick+EXO,63
2,kartu-perdana,vivo+v7,183
3,aksesoris-mobil,subwoofer+kolong+aktif,389
4,anak-perempuan,Hotpants+anak,1366


In [None]:
# Run preprocess_data_for_lstm on column 1 (query text) and column 0 (category label).
# NOTE(review): f, l and e are not referenced by any later cell visible in this notebook —
# confirm whether this long-running step is still needed.
f,l,e=preprocessor.preprocess_data_for_lstm(large_data_for_classification[1],large_data_for_classification[0])

  0%|          | 0/2440682 [00:00<?, ?it/s]

REMOVING UNIMPORTANT CHARACTERS


 10%|█         | 245614/2440682 [00:45<06:43, 5442.71it/s]

# ---------------------------------------------------------------------------------------------------------------

In [None]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
# (this call seeds NumPy's global random generator only)
numpy.random.seed(12365716)

In [None]:
def word_count(sentences):
    """Count how often each word occurs across all tokenised sentences.

    Args:
        sentences: iterable of sentences, each an iterable of hashable words.

    Returns:
        dict mapping every word to its total number of occurrences.
    """
    print("1/1")
    tally = {}
    for tokens in sentences:
        for token in tokens:
            # Missing words start from zero, so no membership test is needed.
            tally[token] = tally.get(token, 0) + 1
    return tally

def getFilteredData(product_title,labels,frequency, N_words, word_length):
    """Drop rare or short words from every title, then drop titles left with too few words.

    Args:
        product_title: re-iterable sequence of tokenised titles (each a sequence of words).
        labels: labels aligned by position with ``product_title``; a list, array or
            pandas Series.
        frequency: minimum corpus-wide occurrence count a word needs to be kept.
        N_words: minimum number of surviving words a title needs to be kept.
        word_length: minimum number of characters a word needs to be kept.

    Returns:
        Tuple ``(new_features, new_labels)``: the surviving filtered titles and
        their labels, in the original order.
    """
    print("1/3")
    result=word_count(product_title)

    print("2/3")
    new_product_title=[
        [word for word in sentence if result[word]>=frequency and len(word)>=word_length]
        for sentence in tqdm.tqdm(product_title)
    ]

    print("3/3")
    # A pandas Series resolves ``labels[index]`` by index label, which no longer matches
    # the row position once rows were dropped; ``.iloc`` makes the lookup positional.
    # Lists and arrays have no ``.iloc`` and are indexed directly, as before.
    positional_labels=getattr(labels,"iloc",labels)
    new_features=[]
    new_labels=[]
    # ``enumerate`` has no length, so pass ``total`` to let tqdm show real progress.
    for index,title in tqdm.tqdm(enumerate(new_product_title),total=len(new_product_title)):
        if len(title)>=N_words:
            new_features.append(title)
            new_labels.append(positional_labels[index])

    return new_features,new_labels

def getTfIdf(new_product_title):
    """Compute a TF-IDF matrix for tokenised titles.

    Args:
        new_product_title: iterable of titles, each an iterable of word strings.

    Returns:
        Tuple ``(final_result, cv, tftransformer)``: the TF-IDF matrix, the fitted
        ``CountVectorizer`` and the fitted ``TfidfTransformer``.
    """
    # Imported locally: none of the visible notebook cells bring these two names into
    # scope, so without this the function fails with NameError when called.
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    print("1/3")
    # CountVectorizer expects one string per document, so re-join the tokens.
    concatenated_product_title=[" ".join(sentence) for sentence in tqdm.tqdm(new_product_title)]

    print("2/3")
    cv=CountVectorizer()
    result=cv.fit_transform(concatenated_product_title)

    print("3/3")
    tftransformer = TfidfTransformer(smooth_idf=False)
    final_result=tftransformer.fit_transform(result)

    return final_result,cv,tftransformer

In [None]:
# Distinct category labels found in column 0.
unique_labels=set(large_data_for_classification[0])

# Assign ids 1..N in sorted order. Iterating the set directly would hand out ids in an
# order that changes between kernel sessions (string hashing is randomised), making the
# label ids irreproducible. Id 0 is intentionally left unused, as before.
# key=str keeps the sort working even if the column holds a mix of value types.
labels_mapper={
    value:index
    for index,value in enumerate(sorted(unique_labels,key=str),start=1)
}

# Encode every row's category as its integer id.
labels =[labels_mapper[label] for label in tqdm.tqdm(large_data_for_classification[0])]

In [None]:
# Rebuild both inputs from the raw frame so this cell can be re-run safely: previously a
# second run paired the freshly recomputed titles with the already-filtered `labels`.
raw_titles=[preprocessor.remove_parentheses(value) for value in tqdm.tqdm(large_data_for_classification[1])]
raw_labels=[labels_mapper[label] for label in large_data_for_classification[0]]

# Keep words seen at least 50 times and at least 3 characters long, then keep only
# titles that still have at least 2 words.
product_title,labels=getFilteredData(raw_titles,raw_labels,frequency=50,N_words=2,word_length=3)

In [None]:
# Every distinct word that survived filtering.
unique_words={word for title in tqdm.tqdm(product_title) for word in title}

# Assign ids 1..V in sorted order so the word -> id mapping is identical on every run;
# plain set iteration order changes between kernel sessions (string hash randomisation).
# Id 0 is left free because it is the value the sequences are padded with later.
vocab={word:index for index,word in enumerate(sorted(unique_words),start=1)}

# Encode each title as the list of its word ids.
features=[[vocab[word] for word in title] for title in tqdm.tqdm(product_title)]

In [None]:
# X_train,X_test,y_train,y_test=train_test_split(features,to_categorical(labels),test_size=0.1)

In [None]:
max_review_length = 8
# Number of leading samples used for training; the remainder is the validation set.
train_size = 1500000
# One-hot width shared by both splits. Letting to_categorical infer it per slice can
# yield different widths when the largest label id is missing from one of the slices.
num_classes = max(labels) + 1

X_train = sequence.pad_sequences(features[:train_size], maxlen=max_review_length)
X_test = sequence.pad_sequences(features[train_size:], maxlen=max_review_length)
y_train = to_categorical(labels[:train_size], num_classes=num_classes)
y_test = to_categorical(labels[train_size:], num_classes=num_classes)

In [24]:
# create the model
embedding_vector_length = 100
# Word ids run from 1 to len(vocab) and 0 is the padding value, so the embedding table
# needs len(vocab) + 1 rows. Deriving it avoids out-of-range ids when the vocabulary changes.
vocabulary_size = len(vocab) + 1
# The softmax width must equal the one-hot width of the targets.
output_classes = y_train.shape[1]

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_vector_length, input_length=max_review_length,mask_zero=True))
# NOTE(review): ReLU is used for both the cell and the recurrent (gate) activation instead
# of the usual tanh / sigmoid-style choices — confirm this is intentional.
model.add(LSTM(500,activation="relu",recurrent_activation="relu"))
model.add(Dense(750, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(output_classes, activation='softmax'))
model.compile(optimizer="Adagrad", loss='categorical_crossentropy',metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

history=model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, batch_size=100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 8, 100)            852900    
_________________________________________________________________
lstm_2 (LSTM)                (None, 500)               1202000   
_________________________________________________________________
dense_4 (Dense)              (None, 750)               375750    
_________________________________________________________________
dense_5 (Dense)              (None, 500)               375500    
_________________________________________________________________
dense_6 (Dense)              (None, 235)               117735    
Total params: 2,923,885
Trainable params: 2,923,885
Non-trainable params: 0
_________________________________________________________________
None
Train on 1500000 samples, validate on 257951 samples
Epoch 1/15


InternalError: GPU sync failed

In [22]:
# Print the layer-by-layer summary of whatever model is currently bound to `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 8, 100)            852900    
_________________________________________________________________
lstm_1 (LSTM)                (None, 500)               1202000   
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              501000    
_________________________________________________________________
dense_2 (Dense)              (None, 750)               750750    
_________________________________________________________________
dense_3 (Dense)              (None, 235)               176485    
Total params: 3,483,135
Trainable params: 3,483,135
Non-trainable params: 0
_________________________________________________________________
