# Training Pipeline

In [70]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *

from models.lstm_classifier import create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
# Directory with the raw question data (relative path); handed to load_data() further down.
data_path = "../data/pythonquestions/"

In [46]:
import nltk
# Fetch NLTK's "punkt" data package; when it is already present and current,
# NLTK only reports that and does not download again.
# NOTE(review): presumably required by the tokenizer helpers used later — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dschr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [47]:
# Load the full dataset from data_path via the project helper and show its dimensions.
df = load_data(data_path)
df.shape

(539238, 5)

In [55]:
# Work on a random subset of (at most) 10000 rows to keep the notebook fast.
# A fixed random_state makes the subset - and every number derived from it -
# reproducible across "Restart Kernel -> Run All"; min() avoids the ValueError
# that DataFrame.sample raises when asked for more rows than exist.
sample = df.sample(n=min(10000, len(df)), random_state=42)
# The helper's return value is not used, so it presumably strips the markup from
# the "Body_q" column of `sample` in place - TODO confirm in toolbox.data_prep_helpers.
remove_html_tags(sample, ["Body_q"])
sample.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags
61123,6689502,Python: would this leave me with a clean envir...,I want to change directories in the python (no...,<p>Just use this</p>\n\n<pre><code>__import__(...,"[python, namespaces, read-eval-print-loop]"
439406,34601268,NameError: name 'result' is not defined. Can't...,"I'm a complete beginner in Python, trying to g...","<p>The error is clear, <code>result</code> var...",[python]
95689,10159430,Python thread waiting for copying the file,I have a c program which is running in thread ...,<p>There are a lot of important details to you...,"[python, c]"
296320,25938674,Count # of unique values for dictionary key in...,I have created nested dictionary with values e...,<p>You could walk through the list of lists an...,"[python, dictionary, nested]"
107995,11295325,"Receive a string, convert to calculate and dis...",My homework question:\n\nCreate a script to re...,"<p>You probably want <a href=""http://docs.pyth...",[python]


In [50]:
# Spot-check a single question body from the sample (row at position 100).
sample["Body_q"].iloc[100]

'using python 3.3 , I am supposed to answer the following questions:\nFrom a box full of discs, we would like to know the probability of pulling two blue discs\nin a row when all the discs in the box are either red or blue. Write a function which can\ncalculate this probability for a box ï¬\x81lled with any number of red discs and any number of\nblue discs. A test case you may want to use: if the box contains 15 blue discs and 6 red\ndiscs, you have a 50% chance of drawing two blue discs in a row.\nNow write a function that calculates the probability of drawing n blue discs in a row for\nsome n between 0 and the number of discs in the box.\ni have tried using this model\nimport random\ndef random_pick(some_list, probabilities):\n    x = random.uniform(0, 1)\n    cumulative_probability = 50.0\n    for item, item_probability in zip(some_list, probabilities):\n        cumulative_probability += item_probability\n        if x < cumulative_probability: break\n    return item\n\ni am almost c

### Clean Data

In [56]:
# we have some nans in our tags which break target encoding:
# keep only rows whose tag list consists exclusively of strings
print(sample.shape)
# .copy() yields an independent frame, so the column assignment further down
# cannot run into pandas' chained-assignment (SettingWithCopy) ambiguity.
sample = sample[sample["tags"].apply(lambda tags: all(isinstance(t, str) for t in tags))].copy()
print(sample.shape)

# reduce number of possible tags
all_tags = np.array([tag for tags in sample["tags"] for tag in tags])
unique_tags = np.unique(all_tags, return_counts=True)  # (tag names, occurrence counts)
# argsort is ascending, so the last 101 positions are the most frequent tags
top_tags = unique_tags[0][np.argsort(unique_tags[1])[-101:]]  # keep 101, we will remove "python"
# The most frequent tag sits at the end; it is expected to be "python" and is
# printed so a reader can verify that assumption before it is dropped.
print(f"deleting element {top_tags[-1]} from top_tags")
top_tags = np.delete(top_tags, -1)

# remove tags that are not within top_101 and remove "python" tag
# (a set gives O(1) membership tests instead of scanning the numpy array per tag)
top_tag_set = set(top_tags.tolist())
sample["tags"] = sample["tags"].apply(lambda x: [tag for tag in x if tag in top_tag_set])

# remove any tag lists that we potentially made empty by doing this
sample = sample[sample["tags"].apply(len) > 0].copy()
sample.shape

(10000, 5)
(9991, 5)
deleting element python from top_tags


(6996, 5)

In [57]:
# Show the tag lists of the first 10 rows that remain after the tag filtering.
sample["tags"].head(10)

95689                          [c]
296320                [dictionary]
502535                   [mongodb]
209085          [matplotlib, plot]
99761              [mysql, django]
45201     [for-loop, if-statement]
533569                     [numpy]
195001                       [csv]
224582                     [numpy]
431849                [sqlalchemy]
Name: tags, dtype: object

### Prepare Training and Test data

In [53]:
def generate_question_level_tokens(txt):
    """Turn one question text into a single flat list of word tokens.

    The text is first split into sentences with ``sent_tokenize_text``; each
    sentence is then tokenized with ``word_tokenize_sent`` and the per-sentence
    tokens are concatenated in their original order.
    """
    tokens = []
    for sentence in sent_tokenize_text(txt):
        # append this sentence's tokens directly instead of building a nested list
        tokens.extend(word_tokenize_sent(sentence))
    return tokens

In [58]:
# Longest question (in tokens) that is kept for training/testing.
MAX_QUESTION_TOKENS = 100

# assign() builds a new frame instead of writing into a possibly filtered slice,
# which keeps this cell idempotent and free of SettingWithCopy warnings.
sample = sample.assign(q_all_body_tokenized=sample["Body_q"].apply(generate_question_level_tokens))
data = sample[sample["q_all_body_tokenized"].apply(len) <= MAX_QUESTION_TOKENS]
# Fixed random_state so the train/test membership is identical on every run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(train_data.shape)
print(test_data.shape)

(2815, 6)
(704, 6)


In [59]:
# Train the word embeddings ONLY on the training split so that no text from
# the test split influences them.
# Word2Vec alternative, kept for reference:
# wv = create_Word2Vec_embeddings(train_data, "Body_q")
# FastText is used instead because it offers a solution for out-of-vocabulary words.
wv = create_FastText_embedding(train_data, "Body_q")

["I know you can disable resizing; however I'd like it so when the window was resized it stuck to a certain width to height ratio", 'Say a way to take a Python Unicode string and pass it to a C function which catenates it with itself and returns that to Python which prints it']
['i', 'know', 'you', 'can', 'disable', 'resizing', ';', 'however', 'i', "'d", 'like', 'it', 'so', 'when', 'the', 'window', 'was', 'resized', 'it', 'stuck', 'to', 'a', 'certain', 'width', 'to', 'height', 'ratio']


In [72]:
def _embed_tokens(tokens):
    """Look up every token in ``wv`` and stack the vectors into one array."""
    return np.array([wv[token] for token in tokens])


# One array of token vectors per training question.
X_train = train_data["q_all_body_tokenized"].apply(_embed_tokens)

In [73]:
# Pad every question at the end ("post") up to the longest sequence in the split.
# pad_sequences defaults to dtype="int32", which would truncate the float
# embedding vectors to integers - request a float dtype to keep them intact.
X_train_padded = pad_sequences(X_train, padding="post", dtype="float32")
X_train_padded.shape

(2815, 100, 100)

In [74]:
# Learn the tag vocabulary from the training split only (fit returns the encoder
# itself), then encode the training tags as a binary indicator matrix.
label_encoder = MultiLabelBinarizer().fit(train_data["tags"])
y_train = label_encoder.transform(train_data["tags"])

In [77]:
# Embed the held-out questions with the same `wv` lookup that is applied to the
# training questions: one array of token vectors per question.
X_test = test_data["q_all_body_tokenized"].apply(
    lambda tokens: np.array([wv[token] for token in tokens])
)
# Encode the test tags with the encoder that was fitted on the training tags.
y_test = label_encoder.transform(test_data["tags"])

### Train Model

In [78]:
# Derive both dimensions from the prepared arrays instead of hard-coding 100:
# the last axis of the padded input is the embedding width, and the label matrix
# has one column per tag the encoder actually saw in the training split (which
# can be fewer than the number of tags kept during cleaning).
model = create_model(
    embedding_dim=X_train_padded.shape[-1],
    output_dim=y_train.shape[1],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               3300      
Total params: 20,324
Trainable params: 20,324
Non-trainable params: 0
_________________________________________________________________


In [79]:
# Fit the classifier on the padded training embeddings: 10 passes over the data
# in mini-batches of 128 questions.
model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=10,
    batch_size=128,
)

Train on 2815 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x174a0116448>

In [29]:
# Preview the first rows of the training split, including the token column.
train_data.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags,q_all_body_tokenized
518827,39014670,"(centos6.6) before updating python2.7.3 ,it is...","(centos6.6) before updating python2.7.3 ,it is...",<p>Reasons could be any of the following:</p>\...,[linux],"[(, centos6.6, ), before, updating, python2.7...."
180026,17518304,run python file in python shell,I have a python file (my_code.py) in Home/Pyth...,<p>You should expand tilde(~) to actual path. ...,[ubuntu],"[i, have, a, python, file, (, my_code.py, ), i..."
268332,24024736,How to generate random int around specific mean?,I need to generate 100 age values between 23 a...,<p>I think you could populate an array of size...,[random],"[i, need, to, generate, 100, age, values, betw..."
480028,36843984,Django REST Framework - NoReverseMatch when us...,I've been trying to return some URLs using the...,<p>You shouldn't pass the request:</p>\n\n<pre...,"[django, django-rest-framework]","[i, 've, been, trying, to, return, some, urls,..."
24471,3016497,How to create a translucid/alpha-transparent r...,I have a wx.panel and I want to put a transluc...,"<p>You can do this using a <a href=""http://www...",[wxpython],"[i, have, a, wx.panel, and, i, want, to, put, ..."
