# Training Pipeline

In [10]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *

from models.lstm_classifier import create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Location of the dataset, given as a relative path; passed to load_data() below.
data_path = "../data/pythonquestions/"

In [4]:
# Fetch the NLTK "punkt" resource (the downloader reports "already up-to-date"
# when it is present locally).
# NOTE(review): presumably required by the word_tokenize-based helpers — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dschr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [5]:
# Load the dataset found under data_path with the project helper, then display
# the resulting frame's (rows, columns).
df = load_data(data_path)
df.shape

(539238, 5)

In [7]:
# Draw a 10,000-row random subset of df. The fixed random_state makes the draw
# repeatable, so a "Restart & Run All" yields the same rows (and the same
# downstream results) every time.
sample = df.sample(10000, random_state=42)
# Strip HTML from the question bodies. The helper's return value is not used,
# so it is relied on to modify `sample` in place.
remove_html_tags(sample, ["Body_q"])
sample.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags
386445,31556460,How to find the median and standard deviation ...,I have CSV file like below and i want to calcu...,<p>This is a perfect task for pandas:</p>\n\n<...,"[python, csv]"
139540,14108355,Python - how do I force the use of a factory m...,I have a set of related classes that all inher...,"<p>In Python, it's almost never worth trying t...","[python, class-factory]"
24474,3016974,How to get text in QlineEdit when QpushButton ...,I am trying to implement a function. My code i...,<p>My first suggestion is to use Designer to c...,"[python, pyqt4, qlineedit]"
56210,6182967,How to format a MySQL query into JSON using we...,I am trying to query a MySQL database using we...,<p>You can extend json.JSONEncoder to handle d...,"[python, mysql, json, web.py]"
95932,10183313,cvWriteFrame throws IplImage * : could not con...,I am trying to build a video from jpeg images ...,<p><code>cvWriteFrame()</code> needs a valid <...,"[python, opencv]"


In [8]:
# Spot-check a single question body: the row at position 100 of the sample.
sample["Body_q"].iloc[100]

'I\'m trying to access some values that are nested as an ajax response from a website.\nEverything is output as one giant line that I can\'t manage to navigate down. However to give you an idea of what it looks like, the pprint of the dictionary is something like: \n    {u\'d\': {u\'Type\': None,\n    u\'__type\': u\'TOPS.ajaxResponse\',\n    u\'actionOnSuccess\': None,\n    u\'data\': u\'{"BasicCodes":{"PRODUCTPRICES":[{"ProductId":"ProductA","CategoryId":"1","Color":"Red","Quantity":"0"},{"ProductId":"ProductA","CategoryId":"2","Color":"Blue","Quantity":"0"},{"ProductId":"ProductB","CategoryId":"1","Color":"Red","Quantity":"0"},{"ProductId":"ProductB","CategoryId":"2","Color":"Blue","Quantity":"0"}, ...and so on...\n\n    .\n    .\n    .\n\n    u\'data2\': None,\n    u\'dataExtra\': None,\n    u\'errors\': [],\n    u\'general_message\': None,\n    u\'success\': True}}\n\nThere are hundreds of products listed (ProductA, ProductB, etc), but all I want to do is get the number associated

### Clean Data

In [14]:
# we have some nans in our tags which break target encoding
print(sample.shape)
# Keep only the rows whose tag list consists entirely of strings.
# The generator lets all() stop at the first non-string tag.
has_only_str_tags = sample["tags"].apply(
    lambda tags: all(isinstance(t, str) for t in tags)
)
# .copy() gives an independent frame: the in-place helper call below and later
# column assignments then operate on a real DataFrame instead of on the result
# of a boolean filter (which triggers pandas' SettingWithCopyWarning).
sample = sample[has_only_str_tags].copy()
print(sample.shape)


# Reduce the number of tags and adjust dataframe accordingly
# (called for its in-place effect on `sample`; the return value is not used).
reduce_number_of_tags(sample, 100)
sample.shape


(9995, 5)
(9995, 5)
deleting element pandas from top_tags


'\n\nhallo\n'

In [13]:
# Inspect the tag lists of the first ten rows of the sample.
sample["tags"].head(10)

386445                  [csv]
139540                     []
24474                 [pyqt4]
56210           [mysql, json]
95932                [opencv]
28035               [tkinter]
438255                     []
456381    [pandas, dataframe]
210901                     []
387790                     []
Name: tags, dtype: object

### Prepare Training and Test data

In [58]:
# Tokenize text into words on question level
sample["q_all_body_tokenized"] = sample["Body_q"].apply(generate_question_level_tokens)
data = sample[sample["q_all_body_tokenized"].apply(len) <= 100]
train_data, test_data = train_test_split(data, test_size = 0.2)
print(train_data.shape)
print(test_data.shape)

(2815, 6)
(704, 6)


In [59]:
# Fit the word embeddings on the training split ONLY, so no test text is seen
# while the vectors are learned.
# Disabled alternative, kept for reference:
# wv = create_Word2Vec_embeddings(train_data, "Body_q")
# FastText is used instead so that out-of-vocabulary words can still be handled.
wv = create_FastText_embedding(train_data, "Body_q")

["I know you can disable resizing; however I'd like it so when the window was resized it stuck to a certain width to height ratio", 'Say a way to take a Python Unicode string and pass it to a C function which catenates it with itself and returns that to Python which prints it']
['i', 'know', 'you', 'can', 'disable', 'resizing', ';', 'however', 'i', "'d", 'like', 'it', 'so', 'when', 'the', 'window', 'was', 'resized', 'it', 'stuck', 'to', 'a', 'certain', 'width', 'to', 'height', 'ratio']


In [72]:
# For every training question, look up each token in wv and stack the resulting
# vectors into one NumPy array per question (a Series of arrays, one per row).
X_train = train_data["q_all_body_tokenized"].apply(lambda x: np.array([wv[w] for w in x]))

In [73]:
# Pad each question at the end ("post") up to the length of the longest one, so
# the ragged per-question arrays become a single dense array.
# dtype must be given explicitly: pad_sequences defaults to int32, which would
# truncate the float embedding values toward zero and leave the model with
# (almost) all-zero input.
X_train_padded = pad_sequences(X_train, padding="post", dtype="float32")
X_train_padded.shape

(2815, 100, 100)

In [74]:
# Learn the tag vocabulary from the training split only and, in the same step,
# encode each training question's tag list as a multi-hot indicator row.
label_encoder = MultiLabelBinarizer()
y_train = label_encoder.fit_transform(train_data["tags"])

In [77]:
# Embed the test questions with the same word vectors used for training and
# encode their tags with the encoder that was fitted on the training split.
# NOTE(review): unlike X_train, X_test is not passed through pad_sequences in
# this cell — confirm it is padded before it is fed to the model.
X_test = test_data["q_all_body_tokenized"].apply(lambda x: np.array([wv[w] for w in x]))
y_test = label_encoder.transform(test_data["tags"])

### Train Model

In [78]:
# Size the network from the prepared data instead of hard-coding 100 twice:
#   - embedding_dim: length of each token vector (last axis of the padded input)
#   - output_dim:    number of distinct tags the label encoder learned
# This keeps the model in step with the data if the embedding size or the
# number of retained tags changes.
model = create_model(
    embedding_dim=X_train_padded.shape[-1],
    output_dim=len(label_encoder.classes_),
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               3300      
Total params: 20,324
Trainable params: 20,324
Non-trainable params: 0
_________________________________________________________________


In [79]:
# Train on the padded training input and the multi-hot tag matrix:
# 10 epochs in mini-batches of 128. No validation data is passed in this call.
model.fit(x=X_train_padded, y=y_train, batch_size=128, epochs=10)

Train on 2815 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x174a0116448>

In [29]:
# Preview the first rows of the training split, including the token column.
train_data.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags,q_all_body_tokenized
518827,39014670,"(centos6.6) before updating python2.7.3 ,it is...","(centos6.6) before updating python2.7.3 ,it is...",<p>Reasons could be any of the following:</p>\...,[linux],"[(, centos6.6, ), before, updating, python2.7...."
180026,17518304,run python file in python shell,I have a python file (my_code.py) in Home/Pyth...,<p>You should expand tilde(~) to actual path. ...,[ubuntu],"[i, have, a, python, file, (, my_code.py, ), i..."
268332,24024736,How to generate random int around specific mean?,I need to generate 100 age values between 23 a...,<p>I think you could populate an array of size...,[random],"[i, need, to, generate, 100, age, values, betw..."
480028,36843984,Django REST Framework - NoReverseMatch when us...,I've been trying to return some URLs using the...,<p>You shouldn't pass the request:</p>\n\n<pre...,"[django, django-rest-framework]","[i, 've, been, trying, to, return, some, urls,..."
24471,3016497,How to create a translucid/alpha-transparent r...,I have a wx.panel and I want to put a transluc...,"<p>You can do this using a <a href=""http://www...",[wxpython],"[i, have, a, wx.panel, and, i, want, to, put, ..."
