# Training Pipeline using Title and Body Model

In [69]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *
from toolbox.evaluation import *

from models.lstm_classifier import create_model
from models.title_body_lstm import create_model as tb_create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
data_path = "../data/pythonquestions/"
ft_path = "alldata_sg.ft"  # set this to None if you want to train your own fasttext embeddings
n_top_labels = 100
n_epochs = 30
max_question_words = 100
sample_size = -1  # set to -1 to use entire data
normalize_embeddings = True
use_titles = False

tokenized_field = "q_all_body_tokenized"
content_field = "Body_q"

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dschr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [72]:
df = load_data(data_path, ignore_cache=False, tokenized_field=tokenized_field, content_field=content_field)
df.shape

loading data from cached pickle
reading chunk 0


(607282, 5)

In [71]:
sample = df.sample(sample_size) if sample_size > 0 else df
del df
sample.head()

NameError: name 'df' is not defined

In [6]:
# remove_html_tags(chunk, ["Body_q"])
#print(f"{i}: generating question level tokens")
sample["q_title_tokenized"] = sample["Title"].apply(generate_question_level_tokens)

### Clean Data

In [None]:
sample.shape

In [7]:
# we have some nans in our tags which break target encoding
print(sample.shape)
sample = sample[sample["tags"].apply(lambda tags: all([isinstance(t, str) for t in tags]))]
print(sample.shape)


# Reduce the number of tags and adjust dataframe accordingly
sample = reduce_number_of_tags(sample, n_top_labels)
sample.shape

(607282, 6)
(606841, 6)
deleting element python from top_tags


(425658, 6)

In [10]:
sample["tags"].head(5)

0               [osx]
1    [windows, image]
3     [sql, database]
4            [arrays]
5       [django, oop]
Name: tags, dtype: object

### Prepare Training and Test data

In [40]:
# Tokenize text into words on question level
data = sample[sample[tokenized_field].apply(len) <= max_question_words]
data = data[data["q_all_body_tokenized"].apply(len) > 0]

# train_data, test_data = train_test_split(data, test_size = 0.2)
# print(train_data.shape)
# print(test_data.shape)

(78101, 6)
(19526, 6)


In [41]:
train_data.head()

Unnamed: 0,Id,Title,Body_q,tags,q_all_body_tokenized,q_title_tokenized
384010,29941228,Evaluation of sympy.function containing a NumP...,My problem is that the following code doesn't ...,[numpy],"[my, problem, is, that, the, following, code, ...","[evaluation, of, sympy.function, containing, a..."
458459,33716181,ca_certs_locater/__init__.py import error,I was trying to get authentication of my api.H...,[api],"[i, was, trying, to, get, authentication, of, ...","[ca_certs_locater/__init__.py, import, error]"
48052,5299199,Python - Importing a global/site-packages modu...,I'm using python and virtualenv/pip. I have a ...,"[django, import]","[i, 'm, using, python, and, virtualenv/pip, i,...","[python, -, importing, a, global/site-packages..."
186264,17527949,Open windows photo gallery from python,I want the end of a python script to open wind...,[windows],"[i, want, the, end, of, a, python, script, to,...","[open, windows, photo, gallery, from, python]"
282620,24044330,SQLAlchemy query filter behavior confusing in ...,I am a little confused at how to use multiple ...,"[mysql, sqlalchemy]","[i, am, a, little, confused, at, how, to, use,...","[sqlalchemy, query, filter, behavior, confusin..."


In [74]:
# train word embeddings ONLY with training data
# wv = create_Word2Vec_embeddings(train_data, "Body_q")
# Use FastText to include solution for out-of-vocab words
if ft_path is not None:
    wv = load_fasttext_embeddings(ft_path)
else:
    wv = create_FastText_embeddings(df, content_field)   
wv.init_sims()

["I am using the Photoshop's javascript API to find the fonts in a given PSD", 'Given a font name returned by the API I want to find the actual physical font file that that font name corresponds to on the disc']
['i', 'am', 'using', 'the', 'photoshop', "'s", 'javascript', 'api', 'to', 'find', 'the', 'fonts', 'in', 'a', 'given', 'psd']


UnicodeEncodeError: 'utf-8' codec can't encode characters in position 1-14: surrogates not allowed

In [None]:
import pickle
with open("embedding_sg.ft", "wb") with out_file:
    pickle.dump(wv, out_file)


In [70]:
train_data.shape

(78101, 6)

## Train with Title and Body

In [1]:
X_t = data["q_title_tokenized"].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))
X_b = data["q_all_body_tokenized"].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))

padding_element = np.array([0.0] * X_train_t.iloc[0].shape[-1])

X_t = pad_sequences(X_t, padding="post", dtype='float32', value=padding_element)
X_b = pad_sequences(X_b, padding="post", dtype='float32', value=padding_element)
print(X_t.shape)
print(X_b.shape)

label_encoder = MultiLabelBinarizer()
label_encoder.fit(df["tags"])
y = label_encoder.transform(df["tags"])
print(y.shape)

NameError: name 'data' is not defined

In [42]:
X_train_t = train_data["q_title_tokenized"].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))
X_train_b = train_data["q_all_body_tokenized"].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))


padding_element = np.array([0.0] * X_train_t.iloc[0].shape[-1])

X_train_t_padded = pad_sequences(X_train_t, padding="post", dtype='float32', value=padding_element)
X_train_b_padded = pad_sequences(X_train_b, padding="post", dtype='float32', value=padding_element)
print(X_train_t_padded.shape)
print(X_train_b_padded.shape)

(78101, 39, 100)
(78101, 100, 100)


In [43]:
X_test_t = test_data["q_title_tokenized"].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))
X_test_b = test_data["q_all_body_tokenized"].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))

X_test_t_padded = pad_sequences(X_test_t, padding="post", dtype='float32', value=padding_element)
X_test_b_padded = pad_sequences(X_test_b, padding="post", dtype='float32', value=padding_element)
print(X_test_t_padded.shape)
print(X_test_b_padded.shape)

(19526, 40, 100)
(19526, 100, 100)


In [45]:
label_encoder = MultiLabelBinarizer()
label_encoder.fit(train_data["tags"])
y_train = label_encoder.transform(train_data["tags"])
y_test = label_encoder.transform(test_data["tags"])

In [None]:
from toolbox.training import grid_search_es

search_params = {
    # conduct big grid search with these params
    # "lstm_layer_size": [256, 128],
    # "lstm_dropout": [0.0, 0.2, 0.4],
    # "num_mid_dense": [1, 0],
    
    # test grid search with these params (comment out for actual run)
    "lstm_layer_size": [16],
    "lstm_dropout": [0.0],
    "num_mid_dense": [1, 0]
    
    # don't change these:
    "output_dim": [y.shape[-1]]
}

all_hists = grid_search_es(create_model, search_params)

best_params, best_hist, best_loss = min(all_hists, key=lambda x: x[2])

epoch_lengths = [len(h["val_loss"]) for h in best_hist]
print(f"best combindation: {best_params}")
print(f"avg min val_loss: {best_loss} -- epoch counts: {epoch_lengths}")

In [46]:
model = tb_create_model(**best_params)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
masking (Masking)               (None, None, 100)    0           input_1[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 100)    0           input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
model_name = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir="logs/fit/" + model_name

callbacks = [
    EarlyStopping(monitor="val_loss", patience=10, verbose=0),
    TensorBoard(log_dir=log_dir, histogram_freq=1),
    ModelCheckpoint(filepath=f"checkpoints/{model_name}", monitor="val_loss", restore_best_weights=True, verbose=0)
]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(x=X_train, y=y_train, batch_size=128, epochs=100, validation_data=[X_test, y_test], callbacks=callbacks)

In [None]:
X_train_t

In [None]:

predictions = model.predict([X_test_t_padded, X_test_b_padded])

l_pred = label_encoder.inverse_transform(binarize_model_output(predictions, threshold=0.10))
l_true = label_encoder.inverse_transform(y_test)
texts = test_data[tokenized_field]
raw_texts = test_data[content_field]
titles = test_data["Title"]

for pred, act, txt, raw_txt, title in zip(l_pred, l_true, texts, raw_texts, titles):
    print(f"TRUE: {act}\nPREDICTION: {pred}\n")
    print(f"{title}\n----------")
    print(raw_txt)

In [59]:
predictions = model.predict([X_test_t_padded, X_test_b_padded], batch_size=64)
output_evaluation(model, sample_size, max_question_words, n_top_labels, y_test, predictions, normalize_embeddings, 1, None, n_epochs)

Model Evaluation

normalize_embeddings = True, learning_rate = 1, vocab_size = None, epochs=30
Parameter Settings:
 Sample size = -1, Max. number of words per question = 100, Number of Top Labels used = 100

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 100)]  0                                            
__________________________________________________________________________________________________
masking (Masking)               (None, None, 100)    0           input_1[0][0]                    
____________________________________________________________________________________

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



Metrics with optimized threshold of 0.43
 Macro Evaluation: f1_Score= 0.5090096015650044 , Recall = 0.44893302869566176 , Precision = 0.6111371903996836
 Micro Evaluation: f1_Score= 0.5972185778011021 , Recall = 0.5130362912314975 , Precision = 0.7144501412577169


In [55]:
print(predictions.shape)
print(y_test.shape)

(19526, 100)
(19526, 100)
