# Training Pipeline

In [28]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *
from toolbox.evaluation import *

from models.lstm_classifier import create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# --- Run configuration --------------------------------------------------
# Input locations
data_path = "../data/pythonquestions/"
ft_path = "alldata.ft"  # set this to None if you want to train your own fasttext embeddings

# Dataset sizing
sample_size = 100_000  # set to -1 to use entire data
n_top_labels = 100  # handed to reduce_number_of_tags when the tag set is reduced
max_question_words = 100  # NOTE(review): presumably the per-question token limit - confirm

In [5]:
# Fetch the NLTK "punkt" package; the value returned by the download call is
# left as the last expression so the notebook displays it.
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [6]:
# Load the full dataset from `data_path` into `df`. `load_data` is not defined
# in this notebook; presumably it comes from one of the toolbox star-imports.
df = load_data(data_path)
# Show (rows, columns) as a quick sanity check on what was loaded.
df.shape

(539238, 5)

In [33]:
# Draw the working subset of `df`.
#   * A fixed random_state makes the subset (and everything derived from it)
#     identical across "Restart & Run All".
#   * The requested size is capped at len(df) because DataFrame.sample raises
#     when asked for more rows than exist (sampling is without replacement).
#   * When the whole dataset is requested we work on a copy, so the in-place
#     HTML stripping below never modifies the raw `df`.
if sample_size > 0:
    sample = df.sample(n=min(sample_size, len(df)), random_state=42)
else:
    sample = df.copy()

# Strips HTML markup from the question bodies; the helper modifies `sample`
# in place (its return value is not used).
remove_html_tags(sample, ["Body_q"])
sample.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags
207123,19638887,"Python ""with"" statement syntax",I have some python code that parse the csv fil...,<p>You can do this by assigning the function y...,"[python, with-statement]"
272569,24326308,Get running python server IP address in Javasc...,I have a python flask app running on my server...,<p>Register a domain name and stick with it. U...,"[javascript, python, flask]"
22686,2842748,What will be the setup process for website dev...,I want to create a simple site for my personal...,<p>If the size of the community matters to you...,[python]
351265,29447626,"Invalid syntax when using ""%"" sign on print fu...",I'm trying to print stuff on python 3.4.3 on t...,<p>You dont need comma after Your string :</p>...,"[python, python-3.x, invalidate]"
287783,25334157,Insert selected text from one file into multip...,I have file(lets call it file_A) from which I ...,"<p>Put files in a list, and iterate over the l...","[python, file]"


In [34]:
# Eyeball one cleaned question body (row at position 102 of the sample).
sample["Body_q"].iat[102]

"I hope this message finds you in good spirits. I am trying to find a quick tutorial on the \\b expression (apologies if there is a better term). I am writing a script at the moment to parse some xml files, but have ran into a bit of a speed bump. I will show an example of my xml:\n<....></...><...></...><OrderId>123456</OrderId><...></...>\n<CustomerId>44444444</CustomerId><...></...><...></...>\n\n<...> is unimportant and non relevant xml code. Focus primarily on the CustomerID and OrderId.\nMy issue lies in parsing a string, similar to the above statement. I have a regexParse definition that works perfectly. However it is not intuitive. I need to match only the part of the string that contains 44444444.\nMy Current setup is:\nsearchPattern = '>\\d{8}</CustomerId'\n\nGreat! It works, but I want to do it the right way. My thinking is 1) find 8 digits 2) if the some word boundary is non numeric after that matches CustomerId return it.\nIdea:\nsearchPattern = '\\bd{16}\\b'\n\nMy issue i

### Clean Data

In [35]:
# Some tag lists contain NaN entries, which break the target encoding later
# on, so only rows whose tags are all strings are kept.
def _all_tags_are_strings(tags):
    """Return True when every entry of `tags` is a `str`."""
    return all(isinstance(tag, str) for tag in tags)


print(sample.shape)
has_clean_tags = sample["tags"].apply(_all_tags_are_strings)
sample = sample[has_clean_tags]
print(sample.shape)

# Reduce the number of tags to `n_top_labels` and adjust the dataframe
# accordingly (the work is done by the toolbox helper).
sample = reduce_number_of_tags(sample, n_top_labels)
sample.shape

(100000, 5)
(99928, 5)
deleting element python from top_tags


(70181, 5)

In [36]:
# Peek at the first five tag lists after the tag reduction.
sample["tags"].iloc[:5]

272569    [javascript, flask]
351265           [python-3.x]
287783                 [file]
312479           [matplotlib]
41734             [algorithm]
Name: tags, dtype: object

### Prepare Training and Test Data

In [37]:
# Tokenize text into words on question level
sample["q_all_body_tokenized"] = sample["Body_q"].apply(generate_question_level_tokens)
data = sample[sample["q_all_body_tokenized"].apply(len) <= 100]
train_data, test_data = train_test_split(data, test_size = 0.2)
print(train_data.shape)
print(test_data.shape)

(28531, 6)
(7133, 6)


In [38]:
# Word vectors used to embed the question tokens.
#   * ft_path given -> load the FastText embeddings stored at that path
#   * ft_path None  -> train FastText embeddings ONLY with the training data
# FastText is used to include a solution for out-of-vocab words.
# Alternative kept for reference: create_Word2Vec_embeddings(train_data, "Body_q")
wv = (
    load_fasttext_embeddings(ft_path)
    if ft_path is not None
    else create_FastText_embedding(train_data, "Body_q")
)

In [39]:
def _embed_tokens(tokens):
    """Look up every token in `wv` and collect the results in one array."""
    return np.array([wv[token] for token in tokens])


# One array of word vectors per training question.
X_train = train_data["q_all_body_tokenized"].apply(_embed_tokens)

In [40]:
# Pad every question (at the end) to a fixed number of tokens.
#   * dtype must be given explicitly: pad_sequences defaults to int32, which
#     would truncate the floating-point embedding values to integers.
#   * maxlen pins the sequence length to the configured limit so that it does
#     not depend on the longest question that happens to be in this split.
X_train_padded = pad_sequences(
    X_train, maxlen=max_question_words, dtype="float32", padding="post"
)
X_train_padded.shape

(28531, 100, 100)

In [41]:
# Multi-hot encode the tag lists. The encoder is fitted on the training tags
# only and is reused unchanged for any other split.
train_tags = train_data["tags"]
label_encoder = MultiLabelBinarizer().fit(train_tags)
y_train = label_encoder.transform(train_tags)

In [42]:
# Embed the test questions exactly like the training questions.
X_test = test_data["q_all_body_tokenized"].apply(lambda x: np.array([wv[w] for w in x]))

# Same padding settings as for the training data:
#   * dtype="float32" - the int32 default of pad_sequences would truncate the
#     float embedding values;
#   * maxlen - without it the padded length is the longest question in the
#     TEST split, which need not match the length the model was trained on.
X_test_padded = pad_sequences(
    X_test, maxlen=max_question_words, dtype="float32", padding="post"
)

# Reuse the encoder fitted on the training tags (no re-fitting on test data).
y_test = label_encoder.transform(test_data["tags"])

### Train Model

In [43]:
# Derive the layer sizes from the prepared data instead of hard-coding 100:
#   * embedding_dim = size of one word vector (last axis of the padded input)
#   * output_dim    = number of classes the label encoder actually learned,
#                     which is what the columns of y_train correspond to.
# This keeps the network consistent with the data if the embeddings or the
# number of retained tags change.
model = create_model(
    embedding_dim=X_train_padded.shape[-1],
    output_dim=len(label_encoder.classes_),
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_7 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_8 (Dense)              (None, 100)               12900     
Total params: 477,156
Trainable params: 477,156
Non-trainable params: 0
_________________________________________________________________


In [None]:
# validation_data is passed as an (inputs, targets) tuple, the form documented
# by Keras; a list can be interpreted as several model inputs instead.
# NOTE(review): the held-out test split doubles as validation data here.
model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_data=(X_test_padded, y_test),
)

Train on 28531 samples, validate on 7133 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [None]:
# Qualitative check: compare predicted and true tags for the first
# `n_predictions` TRAINING questions (these are not held-out examples).
n_predictions = 1000

predictions = model.predict(X_train_padded[:n_predictions])

# Turn the raw model output and the encoded targets back into tag tuples.
l_pred = label_encoder.inverse_transform(binarize_model_output(predictions))
l_true = label_encoder.inverse_transform(y_train[:n_predictions])
texts = train_data["q_all_body_tokenized"].iloc[:n_predictions]

for pred, act, txt in zip(l_pred, l_true, texts):
    # Header block first, then the tokenized question on its own line.
    print(f"TRUE: {act}\nPREDICTION: {pred}\n", txt, sep="\n")