# Training Pipeline using Title and Body Model

In [None]:
# IPython magics: load the autoreload extension and re-import all modules
# before each cell runs, so edits to the project's own packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *
from toolbox.evaluation import *

from models.lstm_classifier import create_model
from models.title_body_lstm import create_model as tb_create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Where the data lives.
data_path = "../data/pythonquestions/"
# Pre-trained fastText vectors to load; set this to None if you want to train
# your own fasttext embeddings on the training split instead.
ft_path = "alldata.ft"

# Dataframe columns: raw question text and its tokenized counterpart.
content_field = "Body_q"
tokenized_field = "q_all_body_tokenized"

# Dataset size limits.
sample_size = 10000  # set to -1 to use entire data
max_question_words = 100  # questions with more tokens than this are dropped
n_top_labels = 100  # passed to reduce_number_of_tags

# Training settings.
n_epochs = 20
normalize_embeddings = True  # forwarded as use_norm when looking up word vectors
use_titles = False  # NOTE(review): not referenced by any cell visible in this notebook

In [None]:
import nltk

# Fetch the 'punkt' tokenizer data used by NLTK's tokenizers (needs network
# access the first time it runs).
nltk.download('punkt')

### Load Data

In [None]:
# Load the questions through the project helper, using its cache
# (ignore_cache=False), then display the frame's dimensions.
df = load_data(
    data_path,
    tokenized_field=tokenized_field,
    content_field=content_field,
    ignore_cache=False,
)
df.shape

In [None]:
# Work on a random subset of the data, or on everything when sample_size is
# not positive. The requested size is clamped to the number of available rows
# because DataFrame.sample raises ValueError when asked for more rows than
# exist (sampling is without replacement).
sample = df.sample(min(sample_size, len(df))) if sample_size > 0 else df
# Drop the reference to the full frame so it can be garbage-collected.
del df
sample.head()

In [None]:
# Tokenize every question title with the project helper and keep the result
# in its own column, next to the already-tokenized body.
sample["q_title_tokenized"] = sample["Title"].apply(
    generate_question_level_tokens
)

### Clean Data

In [None]:
# Display the sample's (rows, columns) before any cleaning is applied.
sample.shape

In [None]:
def _has_only_string_tags(tags):
    """Return True when every entry of *tags* is a str (i.e. no NaN entries)."""
    return all(isinstance(tag, str) for tag in tags)


# Some tag lists contain NaNs, which break target encoding: drop those rows
# and print the shape before and after to show how many were removed.
print(sample.shape)
string_tag_mask = sample["tags"].apply(_has_only_string_tags)
sample = sample[string_tag_mask]
print(sample.shape)


# Reduce the number of tags (n_top_labels is handed to the project helper)
# and adjust the dataframe accordingly.
sample = reduce_number_of_tags(sample, n_top_labels)
sample.shape

In [None]:
# Peek at the first five entries of the tags column after the tag reduction.
sample["tags"].head(5)

### Prepare Training and Test data

In [None]:
# Tokenize text into words on question level
data = sample[sample[tokenized_field].apply(len) <= max_question_words]
train_data, test_data = train_test_split(data, test_size = 0.2)
print(train_data.shape)
print(test_data.shape)

In [None]:
# FastText is used (rather than Word2Vec) so that out-of-vocabulary words
# still get a vector. Without a pre-trained file the embeddings are trained
# ONLY on the training split; otherwise they are loaded from ft_path.
if ft_path is None:
    wv = create_FastText_embedding(train_data, content_field)
else:
    wv = load_fasttext_embeddings(ft_path)
# Prepare the normalized vectors that word_vec(..., use_norm=True) reads
# (presumably gensim's KeyedVectors API -- confirm against the helpers).
wv.init_sims()

## Train with Title and Body

In [None]:
def _embed_tokens(tokens):
    """Return an array holding the embedding vector of each token in *tokens*."""
    return np.array(
        [wv.word_vec(token, use_norm=normalize_embeddings) for token in tokens]
    )


# Turn every title / body token list into a sequence of word vectors. The
# body column comes from the configured tokenized_field instead of a
# hard-coded name, so changing the configuration changes the features too.
X_train_t = train_data["q_title_tokenized"].apply(_embed_tokens)
X_train_b = train_data[tokenized_field].apply(_embed_tokens)

# All-zero vector used for padding. Its width comes from the embedding model
# rather than from the first title, which could have no tokens at all.
# Assumes wv exposes gensim's KeyedVectors.vector_size -- confirm against the
# embedding helpers.
padding_element = np.zeros(wv.vector_size)

# Pad at the end of each sequence, up to the longest sequence in the split.
X_train_t_padded = pad_sequences(X_train_t, padding="post", dtype='float32', value=padding_element)
X_train_b_padded = pad_sequences(X_train_b, padding="post", dtype='float32', value=padding_element)
print(X_train_t_padded.shape)
print(X_train_b_padded.shape)

In [None]:
def _embed_tokens(tokens):
    """Return an array holding the embedding vector of each token in *tokens*."""
    return np.array(
        [wv.word_vec(token, use_norm=normalize_embeddings) for token in tokens]
    )


# Same featurisation as for the training split. The body column comes from
# the configured tokenized_field instead of a hard-coded name.
X_test_t = test_data["q_title_tokenized"].apply(_embed_tokens)
X_test_b = test_data[tokenized_field].apply(_embed_tokens)

# NOTE(review): sequences are padded to the longest one in *this* split, so
# the time dimension may differ from the training arrays. That is only fine
# if the model accepts variable-length sequences -- confirm.
X_test_t_padded = pad_sequences(X_test_t, padding="post", dtype='float32', value=padding_element)
X_test_b_padded = pad_sequences(X_test_b, padding="post", dtype='float32', value=padding_element)
print(X_test_t_padded.shape)
print(X_test_b_padded.shape)

In [None]:
# Multi-hot encode the tag lists. The set of classes is learned from the
# training split only and then reused to encode the test split.
label_encoder = MultiLabelBinarizer()
y_train = label_encoder.fit_transform(train_data["tags"])
y_test = label_encoder.transform(test_data["tags"])

In [None]:
# Build the title+body model. Both dimensions are derived from the prepared
# data instead of being hard-coded to 100: the embedding width is the last
# axis of the padded inputs, and the output width is the number of label
# columns the binarizer produced (this can differ from n_top_labels when a
# tag is absent from the training split).
model = tb_create_model(
    embedding_dim=X_train_t_padded.shape[-1],
    output_dim=y_train.shape[1],
    mask_value=0.,  # matches the all-zero padding vectors
)
model.summary()

In [None]:
# Cast the targets to float32 before handing them to Keras.
y_train = y_train.astype("float32")
y_test = y_test.astype("float32")

# Train on (title, body) input pairs. validation_data is passed as an
# (inputs, targets) tuple, the form documented by Keras; a list is not
# unpacked into inputs/targets by every TensorFlow version.
# NOTE(review): the test split doubles as the validation set here.
model.fit(
    x=[X_train_t_padded, X_train_b_padded],
    y=y_train,
    batch_size=32,
    epochs=n_epochs,
    validation_data=([X_test_t_padded, X_test_b_padded], y_test),
)

In [None]:
from itertools import islice

# Number of test questions to print below.
n_predictions = 100

predictions = model.predict([X_test_t_padded, X_test_b_padded])

# Binarize the model scores with the project helper (threshold 0.10) and map
# both the predicted and the true label matrices back to tag names.
l_pred = label_encoder.inverse_transform(binarize_model_output(predictions, threshold=0.10))
l_true = label_encoder.inverse_transform(y_test)
texts = test_data[tokenized_field]
raw_texts = test_data[content_field]
titles = test_data["Title"]

# Print only the first n_predictions examples; previously n_predictions was
# defined but unused and the entire test set was printed.
examples = zip(l_pred, l_true, raw_texts, titles)
for pred, act, raw_txt, title in islice(examples, n_predictions):
    print(f"TRUE: {act}\nPREDICTION: {pred}\n")
    print(f"{title}\n----------")
    print(raw_txt)

In [None]:
# Score the model on the test split.
predictions = model.predict([X_test_t_padded, X_test_b_padded], batch_size=64)
# The predictions are for the test inputs, so they are paired with y_test;
# the previous y_train has a different number of rows.
# NOTE(review): output_evaluation is a project helper whose signature is not
# visible here -- the fifth argument is assumed to be the ground-truth labels,
# and the meaning of the positional `1` and `None` should be confirmed.
output_evaluation(model, sample_size, max_question_words, n_top_labels, y_test, predictions, normalize_embeddings, 1, None, n_epochs)