# Training Pipeline

In [28]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *
from toolbox.evaluation import *

from models.lstm_classifier import create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# --- Input locations ---
data_path = "../data/pythonquestions/"
# Pre-trained fastText vectors; use None to fit fastText embeddings on the training split instead.
ft_path = "alldata.ft"

# --- Dataset size ---
# Rows drawn at random from the full dataset; any value <= 0 (e.g. -1) keeps the entire data.
sample_size = 100_000
# Passed to reduce_number_of_tags as the number of tags to keep.
n_top_labels = 100
# Upper bound on the number of tokens per question.
max_question_words = 100

In [5]:
# Fetch the NLTK 'punkt' data package (reported as already up-to-date when it is present locally).
# NOTE(review): presumably required by the tokenization helpers used later — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [6]:
# Read the full dataset from data_path; the trailing expression displays its (rows, columns) shape.
df = load_data(data_path)
df.shape

(539238, 5)

In [74]:
# Draw a reproducible random subset of the data, or use all of it when sample_size <= 0.
# - random_state is fixed so that Restart & Run All selects the same rows every time.
# - the requested size is capped at len(df) because DataFrame.sample raises when asked
#   for more rows than exist (sampling without replacement).
sample = df.sample(min(sample_size, len(df)), random_state=42) if sample_size > 0 else df
# Strips HTML from the question bodies; the return value is unused, so this appears to modify `sample` in place.
remove_html_tags(sample, ["Body_q"])
sample.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags
282786,25014124,How to protect mysqldb connection in python?,I'm creating a pygtk app that needs a mysql co...,<p>Database connections are generally made fro...,"[python, mysql, pygtk, mysql-python]"
374603,30869537,What am I missing in understanding round() fun...,I was looking at the online doc for round() fu...,"<p>In python 2.x, <a href=""https://docs.python...","[python, rounding, built-in]"
496456,37779845,How to develop a calendar GUI in Python?,here's the code I currently have... My class b...,"<p>First, you should consider creating, at the...","[python, python-3.x, user-interface, tkinter, ..."
141809,14324270,Matplotlib custom marker/symbol,So there is this guide:\nhttp://matplotlib.org...,<p>So found out that it was just using mathtex...,"[python, matplotlib]"
19472,2501136,How to step through debug twisted?,"I'd like to be able to debug Punjab, a twisted...",<p>Since you're trying to debug a twisted appl...,"[python, netbeans, debugging, twisted]"


In [75]:
# Spot-check a single question body after HTML removal (102 looks like an arbitrary position in the sample).
sample["Body_q"].iloc[102]

'What chances do I have to instantiate, keep and serialize/deserialize to/from binary data Python classes reflecting this pattern (adopted from RFC 2246 [TLS]):\n   enum { apple, orange } VariantTag;\n   struct {\n       uint16 number;\n       opaque string<0..10>; /* variable length */\n   } V1;\n   struct {\n       uint32 number;\n       opaque string[10];    /* fixed length */\n   } V2;\n   struct {\n       select (VariantTag) { /* value of selector is implicit */\n           case apple: V1;   /* VariantBody, tag = apple */\n           case orange: V2;  /* VariantBody, tag = orange */\n       } variant_body;       /* optional label on variant */\n   } VariantRecord;\n\nBasically I would have to define a (variant) class VariantRecord, which varies depending on the value of VariantTag. That\'s not that difficult. The challenge is to find a most generic way to build a class, which serializes/deserializes to and from a byte stream... Pickle, Google protocol buffer, marshal is all not an

### Clean Data

In [76]:
# Some tag lists contain NaN entries, which break target encoding:
# keep only the rows whose tags are all strings and report the shape before/after.
print(sample.shape)
only_string_tags = sample["tags"].apply(
    lambda tags: all(isinstance(tag, str) for tag in tags)
)
sample = sample[only_string_tags]
print(sample.shape)


# Restrict the tag set to n_top_labels tags and adjust the dataframe accordingly
sample = reduce_number_of_tags(sample, n_top_labels)
sample.shape

(100000, 5)
(99935, 5)
deleting element python from top_tags


(70441, 5)

In [77]:
# Peek at the tag lists that remain after the tag reduction.
sample["tags"].head(5)

282786                                  [mysql]
496456    [python-3.x, user-interface, tkinter]
141809                             [matplotlib]
19472                                 [twisted]
133930                       [python-2.7, lxml]
Name: tags, dtype: object

### Prepare Training and Test data

In [None]:
# Tokenize text into words on question level.
# .assign builds a new frame instead of writing a column into `sample`, which was produced
# by filtering and may therefore trigger pandas' SettingWithCopyWarning on direct assignment.
sample = sample.assign(
    q_all_body_tokenized=sample["Body_q"].apply(generate_question_level_tokens)
)

# Keep only questions within the configured token limit (previously a hard-coded 100
# that silently ignored max_question_words).
data = sample[sample["q_all_body_tokenized"].apply(len) <= max_question_words]

# Fixed random_state so the train/test membership is the same on every run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(train_data.shape)
print(test_data.shape)

In [79]:
# FastText is used so that out-of-vocabulary words still get a vector.
# (Earlier alternative, kept for reference: wv = create_Word2Vec_embeddings(train_data, "Body_q"))
if ft_path is None:
    # No pre-trained file configured: train the embeddings ONLY on the training data.
    wv = create_FastText_embedding(train_data, "Body_q")
else:
    # NOTE(review): the comment about training only on training data does not cover this branch —
    # confirm what corpus the file at ft_path was trained on.
    wv = load_fasttext_embeddings(ft_path)

In [80]:
# Replace every token of every training question by its vector from wv,
# giving one array per question with one row per token.
X_train = train_data["q_all_body_tokenized"].apply(
    lambda tokens: np.array([wv[token] for token in tokens])
)

In [81]:
# Pad every question to exactly max_question_words time steps. Without maxlen, pad_sequences
# pads to the longest sequence of the batch it is given, so differently sized splits could end
# up with different time dimensions. truncating="post" matches padding="post" and only matters
# if a sequence is longer than max_question_words.
X_train_padded = pad_sequences(
    X_train,
    maxlen=max_question_words,
    padding="post",
    truncating="post",
    dtype='float32',  # embeddings are floats; the pad_sequences default dtype is int32
)
X_train_padded.shape

(13184, 100, 100)

In [82]:
# Learn the tag vocabulary from the training split and encode its tag lists as multi-hot rows in one step.
label_encoder = MultiLabelBinarizer()
y_train = label_encoder.fit_transform(train_data["tags"])

In [83]:
# Build the test features the same way as the training features: token -> vector lookup,
# then padding to max_question_words time steps. maxlen is set explicitly because pad_sequences
# otherwise pads to the longest sequence of *this* split, which need not match the training split.
X_test = test_data["q_all_body_tokenized"].apply(
    lambda tokens: np.array([wv[token] for token in tokens])
)
X_test_padded = pad_sequences(
    X_test,
    maxlen=max_question_words,
    padding="post",
    truncating="post",
    dtype='float32',
)
# Encode with the encoder fitted on the training tags (no re-fitting on test data).
y_test = label_encoder.transform(test_data["tags"])

### Train Model

In [100]:
# Derive the layer sizes from the prepared data instead of hard-coding 100/100:
# - the last axis of the padded input is the embedding dimension,
# - the output layer needs one unit per class the label encoder actually learned
#   (this can be smaller than n_top_labels if a tag never occurs in the training split).
embedding_dim = X_train_padded.shape[-1]
n_classes = len(label_encoder.classes_)

model = create_model(embedding_dim=embedding_dim, output_dim=n_classes)
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_18 (Dense)             (None, 256)               65792     
_________________________________________________________________
dense_19 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_20 (Dense)             (None, 100)               12900     
Total params: 477,156
Trainable params: 477,156
Non-trainable params: 0
_________________________________________________________________


In [101]:
# Cast the multi-hot targets to float32, the same dtype as the padded inputs.
y_train = y_train.astype("float32")
y_test = y_test.astype("float32")

# validation_data is passed as a tuple (x_val, y_val), the form documented by Keras;
# a list is not unpacked into inputs/targets by every TensorFlow version.
model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_data=(X_test_padded, y_test),
)

Train on 13184 samples, validate on 3297 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f6e75340898>

In [91]:
# Index of the first set label for each of the first 30 training targets
# (argmax returns the first maximum, so further tags on a multi-label row are not shown).
y_train[:30].argmax(axis=-1)

array([40, 30, 32, 57, 51, 50, 14, 14, 65,  9, 16, 14, 97, 53, 99, 99, 30,
       53, 94, 40, 57, 27, 75,  0, 14, 40, 67, 35,  2, 73])

In [104]:
# Highest-scoring label index for the first 30 training questions, to compare with the true labels.
# The scores are computed here instead of being read from a `predictions` variable that is only
# assigned in a later cell, so this cell also works after Restart & Run All.
np.argmax(model.predict(X_train_padded[:30]), axis=-1)

array([14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14])

In [107]:
# Inspect the first padded training example: one row per token position, trailing all-zero rows are padding.
# NOTE(review): the stored output shows dtype=int32, which does not match dtype='float32' passed to
# pad_sequences — the output looks stale; re-run the notebook top to bottom.
X_train_padded[0]

array([[ 2,  4,  6, ...,  1, -1, -1],
       [ 1,  1, -2, ..., -4, -3, -2],
       [ 0,  0,  1, ...,  2,  1,  6],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int32)

In [105]:
n_predictions = 100

# Score the first n_predictions training questions.
head_inputs = X_train_padded[:n_predictions]
predictions = model.predict(head_inputs)

# Turn the thresholded scores and the multi-hot targets back into tag tuples.
binary_predictions = binarize_model_output(predictions, threshold=0.1)
l_pred = label_encoder.inverse_transform(binary_predictions)
l_true = label_encoder.inverse_transform(y_train[:n_predictions])

# Matching tokenized and raw question texts for the same rows.
texts = train_data["q_all_body_tokenized"][:n_predictions]
raw_texts = train_data["Body_q"][:n_predictions]

# One report per question: true tags, predicted tags, tokens, raw text.
for pred, act, txt, raw_txt in zip(l_pred, l_true, texts, raw_texts):
    print(f"TRUE: {act}\nPREDICTION: {pred}\n", txt, raw_txt, sep="\n")

TRUE: ('list',)
PREDICTION: ('django', 'list', 'pandas', 'python-2.7', 'python-3.x')

['let', "'s", 'say', 'that', 'i', 'have', 'this', 'list', 'in', 'python', 'a', '=', '[', '``', '(', 'a1', ')', "''", '``', '(', 'b2', ')', "''", '``', '(', 'c3', ')', "''", '``', '(', 'd4', ')', "''", ']', 'so', 'how', 'can', 'i', 'print', 'it', 'out', 'in', 'the', 'following', 'format', ':', '(', 'a1', ')', '(', 'b2', ')', '(', 'c3', ')', '(', 'd4', ')', 'using', 'one', 'line', 'better', 'without', 'using', 'for', 'loop', 'thanks', 'in', 'advance']
Let's say that I have this list in python
A = ["(a,1)", "(b,2)", "(c,3)", "(d,4)"]

so how can I print it out in the following format:
(a,1), (b,2), (c,3), (d,4)

using one line, better without using for loop
Thanks in advance

TRUE: ('http',)
PREDICTION: ('django', 'list', 'pandas', 'python-2.7', 'python-3.x')

['i', "'m", 'having', 'trouble', 'understanding', 'how', 'to', 'issue', 'an', 'http', 'post', 'request', 'using', 'curl', 'from', 'inside', 'of', 