In [None]:
import json
import numpy as np
import pandas as pd
import keras
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Activation, Dropout, Input, Flatten, Embedding, LSTM
from keras.utils import np_utils
from keras.utils import to_categorical
from sklearn import preprocessing
import sklearn
import time
from random import shuffle
import pickle

In [None]:
# Show the working directory: the relative data, model and pickle paths used
# further down in this notebook are resolved against it.
os.getcwd()

In [None]:
# Read the whole labelled training file into memory as parsed JSON.
label_path = "../../synthetic_data/data/final_train_labels.json"
with open(label_path) as label_file:
    data = json.loads(label_file.read())

In [None]:
# Work on the first 50,000 records only
# (presumably to keep training time and memory manageable -- TODO confirm).
SUBSET_SIZE = 50000
data = data[:SUBSET_SIZE]

In [None]:
# Index 0 of every record is used as the label (Y), index 1 as the model input (X).
y = [record[0] for record in data]
x = [record[1] for record in data]
df = pd.DataFrame({"X": x, "Y": y})
print (df.shape[0])
df.head()

In [None]:
# Y Vocab: map every distinct label to an integer class id.
labels = y
le = preprocessing.LabelEncoder()
le.fit(y)
df['encoded_y'] = le.transform(y)
# Class ids start at 0, so the category count is the highest id plus one.
num_categories = int(df['encoded_y'].max()) + 1
print ("# categories = ", num_categories)
df.head(n=5)

In [None]:
# Spot check: decode one hard-coded class id back to its label.
# Assumes the fitted encoder knows at least 8682 classes.
spot_check_id = 8681
le.inverse_transform([spot_check_id])

In [None]:
# X Vocab: tokenizer / sequence hyper-parameters.
MAX_SEQ_LEN = 25        # length passed to pad_sequences as maxlen
MAX_NUM_WORDS = 20000   # vocabulary cap passed to the Tokenizer
EMBEDDING_DIM = 256     # embedding width used when the model is built

# Flatten the inputs into a 1-D array and fit the tokenizer vocabulary on it.
flattened = np.ravel(np.asarray(x))
t = Tokenizer(num_words=MAX_NUM_WORDS)
t.fit_on_texts(flattened)

## see encoded X --> do batch encoding, cannot load into memory
# Turn every text into a sequence of word ids, then pad to a common length.
encoded_x = t.texts_to_sequences(flattened)
train_x = pad_sequences(encoded_x, maxlen=MAX_SEQ_LEN)
train_x

In [None]:
# One-hot encode the integer class ids; the width is the highest class id plus one.
num_classes = max(df.encoded_y.tolist()) + 1
train_y = to_categorical(df['encoded_y'].values, num_classes=num_classes)
train_y

In [None]:
# Classifier: Embedding -> LSTM -> softmax over all label classes.
model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQ_LEN))
model.add(LSTM(100))
# One output unit per class: the width of the one-hot encoded targets.
model.add(Dense(train_y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Sanity check: show the shapes of the inputs and the one-hot targets.
for split in (train_x, train_y):
    print (split.shape)

In [None]:
# Create the output directory up front, so model.save() cannot fail on a
# missing folder after the whole training run has already completed.
if not os.path.isdir("models"):
    os.makedirs("models")

start = time.time()
model.fit(train_x, train_y, epochs=5)
model.save("models/lstm_50k_epochs_5.h5")
# The reported time covers both training and writing the model file.
print ("Took: ", time.time()-start)

# Evaluation

In [None]:
# Reload the saved model and predict on the first `test_num` samples.
# NOTE(review): these rows come from train_x, i.e. data the model was fitted
# on, so this is a sanity check rather than a held-out evaluation.
model = load_model("models/lstm_50k_epochs_5.h5")
test_num = 300
preds = model.predict(train_x[:test_num])
# Most probable class id per row, as a flat 1-D array. The label encoder
# expects 1-D input, and this avoids a comprehension variable named `x`,
# which is also the name of the global list of input texts.
pred_labels = np.argmax(preds, axis=1)

In [None]:
# Translate the predicted class ids back into label strings with the fitted LabelEncoder.
pred_titles = le.inverse_transform(pred_labels)
pred_titles

In [None]:
# Side-by-side table of input, true label and predicted label
# (the 'pred_x' column holds the decoded prediction).
analysis_df = pd.DataFrame(
    {'x': x[:test_num], 'y': y[:test_num], 'pred_x': pred_titles},
    columns=['x', 'y', 'pred_x'],
)

In [None]:
# Display the comparison table of inputs, true labels and predicted labels.
analysis_df

In [None]:
# Persist the fitted label classes so class ids can be decoded outside this
# notebook. np.save does not create missing folders, so make sure it exists.
if not os.path.isdir('pickled'):
    os.makedirs('pickled')
np.save('pickled/labelencoder_classes.npy', le.classes_)

In [None]:
# Persist the fitted tokenizer so the same word-to-id mapping can be reused
# later. open() does not create missing folders, so make sure it exists.
if not os.path.isdir('pickled'):
    os.makedirs('pickled')
with open('pickled/tokenizer.pickle', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)