In [1]:
%load_ext autoreload
%autoreload 2

from toolbox.data_prep_helpers import *
from toolbox.evaluation import *
import pandas as pd
import numpy as np

from models.bagofwords_classifier import create_model

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Input, Flatten, Activation
from tensorflow.keras.optimizers import SGD, Adam
from keras.preprocessing.text import text
from sklearn.preprocessing import MultiLabelBinarizer


Using TensorFlow backend.


In [2]:
# Experiment settings shared by the cells below.
sample_size = 100_000          # number of rows requested from total_data.sample
n_top_labels = 100             # label budget reported to output_evaluation
normalize_embeddings = False   # only forwarded to output_evaluation in this notebook
learning_rate = 1e-7           # passed to the SGD and Adam optimizer constructors
vocab_size = 1000              # Tokenizer num_words and width of the Dense input layer

## Load Data

In [3]:
# Relative path of the dataset folder handed to the loader.
data_path = "../data/pythonquestions/"
# load_data is pulled in by the toolbox star-imports at the top of the notebook;
# its result is the pool that the next cell samples from.
total_data = load_data(data_path)

In [None]:
# Draw the working subset used by the rest of the notebook.
# - random_state pins the draw so that Restart & Run All reproduces the same rows.
# - min(...) keeps the cell from raising when fewer than sample_size rows are available.
# NOTE(review): assumes total_data is a pandas DataFrame (it is only used via .sample here).
data = total_data.sample(n=min(sample_size, len(total_data)), random_state=42)

## Prepare Data

In [None]:
# Clean the sampled posts and restrict the label space; shapes are printed
# before and after the row filter to show how many rows were dropped.
print(data.shape)
# The return value is ignored, so this presumably strips HTML from "Body_q"
# in place -- TODO confirm against toolbox.data_prep_helpers.
remove_html_tags(data, ["Body_q"])
# Keep only rows whose tag collection consists purely of strings.
has_only_str_tags = data["tags"].apply(
    lambda tags: all(isinstance(t, str) for t in tags)
)
data = data[has_only_str_tags]
print(data.shape)
# Use the configured label budget instead of a second hard-coded 100, so the
# value reported to output_evaluation is the one actually applied here.
data = reduce_number_of_tags(data, n_top_labels)

In [None]:
# Positional 80/20 split: the first 80% of rows train the model, the rest test it.
train_size = int(0.8 * len(data))

post_column = data['Body_q']
tag_column = data['tags']

train_posts, test_posts = post_column[:train_size], post_column[train_size:]
train_tags, test_tags = tag_column[:train_size], tag_column[train_size:]

In [None]:
# Bag-of-words features. The vocabulary (capped at vocab_size words) is learned
# from the training posts only and then applied to both splits.
tokenize = text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_posts)

# Train matrix is built first, then test, matching the original evaluation order.
x_train, x_test = (
    tokenize.texts_to_matrix(posts) for posts in (train_posts, test_posts)
)

In [None]:
# Multi-hot encode the tag collections; the label set is learned from the
# training tags and reused unchanged for the test tags.
encoder = MultiLabelBinarizer()
y_train = encoder.fit_transform(train_tags)
y_test = encoder.transform(test_tags)

# Number of label columns, i.e. the size of the model's output layer.
n_col = y_train.shape[1]

## Create and Train Model

In [None]:
# create_model hands back an uncompiled network on purpose, so that different
# optimizers can be tried in the cells below.
# NOTE(review): a later cell rebinds `model` to a hand-built Sequential network,
# so this instance is replaced before any training happens.
model = create_model(
    input_layer_size=256,
    vocab_size=vocab_size,
    output_dim=n_col,
)
model.summary()

In [None]:
# Candidate optimizers, both driven by the shared learning_rate setting.
# `opt` (SGD with momentum) is the one passed to a later compile call;
# `opt_Adam` is built here but not referenced by any compile call in this notebook.
opt_Adam = Adam(lr=learning_rate)
opt = SGD(lr=learning_rate, momentum=0.9)

In [None]:
# Compile the create_model network with Keras' stock 'adam' optimizer
# (the string form, so the learning_rate setting is not applied here).
# NOTE(review): targets are multi-hot (MultiLabelBinarizer); if create_model ends
# in a sigmoid layer, 'binary_crossentropy' would be the matching loss -- confirm.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=["accuracy"],
)


In [None]:
# Hand-built bag-of-words classifier: one hidden ReLU layer followed by a
# sigmoid output with one unit per label column.
model = Sequential([
    Dense(256, input_shape=(vocab_size,)),
    Activation('relu'),
    Dense(n_col),
    Activation('sigmoid'),
])

# The targets are multi-hot vectors (several tags can be active per post) and the
# output layer is a per-label sigmoid, so each label is an independent binary
# decision: binary cross-entropy is the matching loss, not categorical
# cross-entropy (which assumes exactly one active class per sample).
# The string 'adam' selects Adam with Keras' default settings; the previously
# rebuilt-but-unused `opt_Adam` instance was dropped from this cell.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])


In [None]:
# First training run, using whatever optimizer the preceding compile call set.
# A tenth of the training data is held out for validation.
adam_fit_kwargs = dict(
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_split=0.1,
)
history = model.fit(x_train, y_train, **adam_fit_kwargs)

In [None]:
# Re-compile the same model with the SGD-with-momentum optimizer (`opt`).
# The labels are multi-hot and the model built above ends in a sigmoid layer,
# so binary cross-entropy is used instead of categorical cross-entropy.
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])

In [None]:
# Second training run after the re-compile: larger batches, more epochs.
# `history` is overwritten, so only this run's metrics remain available afterwards.
sgd_fit_kwargs = dict(
    batch_size=128,
    epochs=30,
    verbose=1,
    validation_split=0.1,
)
history = model.fit(x_train, y_train, **sgd_fit_kwargs)

## Evaluate Model

In [None]:
# Score the first n_predictions test posts and print a few for a visual check.
n_predictions = 300

predictions = model.predict(x_test[:n_predictions])

# Turn the model scores into a 0/1 label mask, then map masks back to tag names.
predicted_mask = binarize_model_output(predictions, threshold=0.1)
l_pred = encoder.inverse_transform(predicted_mask)
l_true = encoder.inverse_transform(y_test[:n_predictions])
raw_texts = test_posts[:n_predictions]

# range(5) caps the preview at the first five examples.
for _shown, pred, act, txt in zip(range(5), l_pred, l_true, raw_texts):
    print(f"TRUE: {act}\nPREDICTION: {pred}\n")
    print(txt)

In [None]:
# Binary prediction / ground-truth matrices for the same n_predictions test rows.
l_true_binary = y_test[:n_predictions]
l_pred_binary = binarize_model_output(predictions, 0.1)

# Hand the results and the run's settings to the toolbox evaluation helper.
# NOTE(review): the meaning of the third positional argument (None) is defined
# in toolbox.evaluation and is not visible from this notebook.
output_evaluation(
    model,
    sample_size,
    None,
    n_top_labels,
    l_true_binary,
    l_pred_binary,
    normalize_embeddings,
    learning_rate,
    vocab_size,
)