# Training Pipeline

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *
from toolbox.evaluation import *

from models.lstm_classifier import create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

In [86]:
data_path = "../data/pythonquestions/"
ft_path = "alldata.ft"  # set this to None if you want to train your own fasttext embeddings
n_top_labels = 20
n_epochs = 10
max_question_words = 100
sample_size = 100000  # set to -1 to use entire data
normalize_embeddings = True
use_titles = True

tokenized_field = "q_title_tokenized" if use_titles else "q_all_body_tokenized"
content_field = "Title" if use_titles else "Body_q"

In [9]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [70]:
df = load_data(data_path)
df.shape

loading data from cached pickle


(539238, 5)

In [71]:
sample = df.sample(sample_size) if sample_size > 0 else df
remove_html_tags(sample, ["Body_q"])
sample.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags
521633,39173034,Using Matplotlib to plot over a subset of data,I am using matplotlib to plot bar charts of da...,<p>If I'm understanding your question correctl...,"[python, pandas, numpy, matplotlib, plot]"
300797,26248831,How do you save a python flickrapi response ob...,import sys\n\nimport os\n\nimport urllib\n\nfr...,<p>maybe this is helpful</p>\n\n<pre><code>imp...,"[python, api, flickr]"
430771,34107375,How to display a image using Templates in Django,I am creating one app using following two url'...,<p>Within the loop:</p>\n\n<pre><code>&lt;img ...,"[python, django]"
239432,22017705,TypeError: data() missing 4 required positiona...,"I have this assignment for school work, this i...",<p>It looks like you are trying to store infor...,[python]
401559,32459319,Insert negative lookahead when joining multipl...,How do I join a these three re.compile into a ...,"<p>I think you're looking for this <a href=""ht...","[python, regex]"


In [20]:
sample["Body_q"].iloc[102]

'i have model mykategori\nclass mykategori(models.Model):\n    w_id_kategori = models.AutoField(primary_key=True)\n    w_nama_kategori = models.CharField(max_length=50, null=True)\n\n    def __str__(self):\n       return self.w_nama_kategori\n\nand myblog\nclass myblog(models.Model):\n    w_id_article = models.AutoField(primary_key=True)\n    w_kategori = models.ForeignKey(mykategori)\n    w_penulis = models.ForeignKey(User,editable=False)\n\n    def __str__(self):\n       return self.w_judul\n\nand this is my view in myblog app\nfrom .models import myblog\ndef index(request):\n   blog={}\n   blog[\'data\']=myblog.objects.all()\n   blog[\'title\']="Halaman Utama"\n   return render(request,\'myblog/index.html\',blog)\n\nand this is my index.html\n{% for blog in data %}\n      {{blog.w_kategori}}\n{% endfor %}\n\nif i run it, it will show id from mykategori app in index.html, how can i print w_nama_kategori if i use {{blog.w_kategori}}?\n'

### Clean Data

In [72]:
# we have some nans in our tags which break target encoding
print(sample.shape)
sample = sample[sample["tags"].apply(lambda tags: all([isinstance(t, str) for t in tags]))]
print(sample.shape)


# Reduce the number of tags and adjust dataframe accordingly
sample = reduce_number_of_tags(sample, n_top_labels)
sample.shape

(100000, 5)
(99922, 5)
deleting element python from top_tags


(70557, 5)

In [13]:
sample["tags"].head(5)

392708                [file]
318791        [json, django]
535897     [python-2.7, pip]
454629    [javascript, json]
437892                 [pip]
Name: tags, dtype: object

### Prepare Training and Test data

In [74]:
# Tokenize text into words on question level
sample[tokenized_field] = sample[content_field].apply(generate_question_level_tokens)
data = sample[sample[tokenized_field].apply(len) <= max_question_words]
train_data, test_data = train_test_split(data, test_size = 0.2)
print(train_data.shape)
print(test_data.shape)

(56445, 6)
(14112, 6)


In [16]:
# train word embeddings ONLY with training data
# wv = create_Word2Vec_embeddings(train_data, "Body_q")
# Use FastText to include solution for out-of-vocab words
if ft_path is not None:
    wv = load_fasttext_embeddings(ft_path)
else:
    wv = create_FastText_embedding(train_data, content_field)
wv.init_sims()

In [36]:
X_train = train_data[tokenized_field].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))

padding_element = np.array([0.0] * X_train[0].shape[-1])
X_train_padded = pad_sequences(X_train, padding="post", dtype='float32', value=padding_element)
X_train_padded.shape

KeyError: 'q_all_body_tokenized'

In [77]:
label_encoder = MultiLabelBinarizer()
label_encoder.fit(train_data["tags"])
y_train = label_encoder.transform(train_data["tags"])

In [22]:
X_test = test_data[tokenized_field].apply(lambda x: np.array([wv.word_vec(w, use_norm=normalize_embeddings) for w in x]))

X_test_padded = pad_sequences(X_test, padding="post", dtype='float32', value=padding_element)
y_test = label_encoder.transform(test_data["tags"])

### Train Model

In [78]:
padding_element = np.array([50] * X_train.iloc[0].shape[-1])
model = create_model(embedding_dim=100, output_dim=100, mask_value=0.)
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_3 (Masking)          (None, None, 100)         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_12 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_13 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_14 (Dense)             (None, 100)               12900     
Total params: 477,156
Trainable params: 477,156
Non-trainable params: 0
_________________________________________________________________


In [79]:
y_train = y_train.astype("float32")
y_test = y_test.astype("float32")

model.fit(x=X_train_padded, y=y_train, batch_size=128, epochs=n_epochs, validation_data=[X_test, y_test])

Train on 56445 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f25aa58c0b8>

## Model Evaluation

In [89]:
n_predictions = 100

predictions = model.predict(X_train_padded[:n_predictions])

l_pred = label_encoder.inverse_transform(binarize_model_output(predictions, threshold=0.0))
l_true = label_encoder.inverse_transform(y_train[:n_predictions])
texts = train_data[tokenized_field][:n_predictions]
raw_texts = train_data[content_field][:n_predictions]

for pred, act, txt, raw_txt in zip(l_pred, l_true, texts, raw_texts):
    print(f"TRUE: {act}\nPREDICTION: {pred}\n")
    print(txt)
    print(raw_txt)

TRUE: ('list', 'tuples')
PREDICTION: ('algorithm', 'arrays', 'bash', 'beautifulsoup', 'c', 'c++', 'class', 'csv', 'database', 'dataframe', 'datetime', 'dictionary', 'django', 'django-forms', 'django-models', 'django-rest-framework', 'django-templates', 'django-views', 'encoding', 'excel', 'file', 'file-io', 'flask', 'for-loop', 'function', 'google-app-engine', 'html', 'http', 'if-statement', 'image', 'import', 'indexing', 'ipython', 'java', 'javascript', 'jquery', 'json', 'linux', 'list', 'logging', 'loops', 'lxml', 'machine-learning', 'matplotlib', 'matrix', 'module', 'mongodb', 'multiprocessing', 'multithreading', 'mysql', 'nltk', 'numpy', 'oop', 'opencv', 'osx', 'pandas', 'parsing', 'performance', 'php', 'pip', 'plot', 'postgresql', 'pygame', 'pyqt', 'pyqt4', 'python-2.7', 'python-3.x', 'python-imaging-library', 'python-requests', 'qt', 'random', 'recursion', 'regex', 'scikit-learn', 'scipy', 'scrapy', 'selenium', 'shell', 'sockets', 'sorting', 'sql', 'sqlalchemy', 'sqlite', 'string

In [88]:
predictions = model.predict(X_train_padded, batch_size=64)
l_pred_binary = binarize_model_output(predictions, 0.1)
l_true_binary = y_train
output_evaluation(model, sample_size, max_question_words, n_top_labels, l_true_binary, l_pred_binary, normalize_embeddings, 1, None, n_epochs)

Model Evaluation

normalize_embeddings = True, learning_rate = 1, vocab_size = None, epochs=10
Parameter Settings:
 Sample size = 100000, Max. number of words per question = 100, Number of Top Labels used = 20

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_3 (Masking)          (None, None, 100)         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_12 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_13 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_14 (Dense)             (None, 100)               12900     
Total params: 477,156
Trainable params: 4