# Training Pipeline

In [56]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *
from toolbox.evaluation import *

from models.lstm_classifier import create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# --- Input locations -------------------------------------------------------
# Directory handed to load_data
data_path = "../data/pythonquestions/"
# File handed to load_fasttext_embeddings;
# set this to None if you want to train your own fasttext embeddings
ft_path = "alldata.ft"

# --- Experiment size -------------------------------------------------------
# Number of rows drawn from the loaded data; set to -1 to use entire data
sample_size = 100_000
# Number of labels kept by reduce_number_of_tags
n_top_labels = 100
# Maximum number of words per question (reported by output_evaluation)
max_question_words = 100

In [10]:
# Fetch the NLTK "punkt" data package (reported as already up-to-date when it
# is installed); presumably needed by the tokenization helpers used further
# down — TODO confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dschr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [11]:
# Load the data found under data_path and show its (rows, columns) shape as a
# quick sanity check.
df = load_data(data_path)
df.shape

(539238, 5)

In [13]:
# Draw a random subset of the data, or take everything when sample_size <= 0.
# A fixed random_state keeps the sampled rows identical across
# "Restart Kernel -> Run All" executions, so every later number is reproducible.
sample = df.sample(sample_size, random_state=42) if sample_size > 0 else df

# Strip HTML markup from the question bodies. The return value is unused, so
# remove_html_tags is expected to modify `sample` in place.
# NOTE(review): with sample_size <= 0, `sample` is `df` itself, so `df` would be
# modified as well — confirm that is acceptable.
remove_html_tags(sample, ["Body_q"])
sample.head()

Unnamed: 0,Id,Title,Body_q,Body_a,tags
151376,15149039,Mocking an object that I'm testing with unittest,I have a function vertex_set that takes a vari...,"<p>As you already seem to know, unit tests are...","[python, unit-testing, python-3.x]"
342234,28903915,Kivy GUI with autobahn WAMP,And I am trying to combine between kivy applic...,"<p>You need to run Kivy with <a href=""http://k...","[python, kivy, autobahn]"
481925,36950050,numpy show only positive numbers with 2 columns,I want numpy to go through column 1 and find a...,<p>this is the solution how to sort by column1...,"[python, numpy]"
121966,12547646,Pyramid.security: Is getting user info from a ...,I'm trying to make an accesible cache of user ...,<p>The <code>unauthenticated_userid</code> cal...,"[python, authentication, security, pyramid]"
425742,33833358,Python lxml: How to traverse back up a tree,I have the following python code\nimport lxml....,<p><code>..</code> would bring you one level u...,"[python, xml, xpath, lxml]"


In [14]:
# Spot-check the question body at position 102 of the sample after HTML removal.
sample["Body_q"].iloc[102]

"As a quick example, let's say I have a Product class\nclass Product(models.Model):\n    tags = models.ManyToManyField('Tag',blank=True,null=True)\n\nMy Tag class looks like this\nclass Tag(models.Model):\n    name = models.CharField(max_length=50, unique=True, db_index=True)\n\nGiven one product, how would I sort a result set of all other products by most common tags?\nFor example I have the following:\nP1 with tags A, B, and C\nP2 with tags B, C\nP3 with tags B\nP4 with tags A, B, and C  \nI would want my result set for P1 to be P4, P2, P3 in that order, assuming we are excluding P1 from the result set.\n"

### Clean Data

In [15]:
def _only_string_tags(tags):
    """Return True when every entry of a tag list is a string."""
    return all(isinstance(tag, str) for tag in tags)


# Some tag lists contain NaNs, which break the target encoding,
# so only rows whose tags are all strings are kept.
print(sample.shape)
sample = sample[sample["tags"].apply(_only_string_tags)]
print(sample.shape)


# Reduce the number of tags (limit: n_top_labels) and adjust the dataframe accordingly
sample = reduce_number_of_tags(sample, n_top_labels)
sample.shape

(100000, 5)
(99922, 5)
deleting element python from top_tags


(70274, 5)

In [16]:
# Peek at the tag lists that remain after the tag-reduction step.
sample["tags"].head(5)

151376    [unit-testing, python-3.x]
342234                        [kivy]
481925                       [numpy]
425742            [xml, xpath, lxml]
430887                  [python-2.7]
Name: tags, dtype: object

### Prepare Training and Test Data

In [17]:
# Tokenize text into words on question level
sample["q_all_body_tokenized"] = sample["Body_q"].apply(generate_question_level_tokens)

# Keep only questions within the configured length limit. Using
# max_question_words instead of a literal 100 keeps this filter in sync with
# the value that output_evaluation reports for the run.
data = sample[sample["q_all_body_tokenized"].apply(len) <= max_question_words]

# Fixed random_state so the train/test partition is reproducible across runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(train_data.shape)
print(test_data.shape)

(13182, 6)
(3296, 6)


In [18]:
# Word vectors come from FastText so that out-of-vocab words are covered as well
# (Word2Vec alternative: create_Word2Vec_embeddings(train_data, "Body_q")).
# Without an ft_path, new embeddings are trained ONLY with the training data;
# otherwise the embeddings stored at ft_path are loaded.
wv = (
    create_FastText_embedding(train_data, "Body_q")
    if ft_path is None
    else load_fasttext_embeddings(ft_path)
)

In [19]:
def embed_question(tokens):
    """Look up the word vector of every token and collect them in one array."""
    return np.array([wv[token] for token in tokens])


# One array of word vectors per training question
X_train = train_data["q_all_body_tokenized"].apply(embed_question)

In [20]:
# Pad every question at the end to exactly max_question_words time steps.
# Pinning maxlen makes the time dimension independent of the longest question
# that happens to be in the data, so the tensor shape follows the configuration.
# truncating="post" mirrors the padding side should a longer sequence ever appear.
X_train_padded = pad_sequences(
    X_train,
    maxlen=max_question_words,
    padding="post",
    truncating="post",
    dtype='float32',
)
X_train_padded.shape

(13182, 100, 100)

In [21]:
# Fit the tag encoder on the training split only, then encode the training
# tag lists into the binary label matrix used as the model target.
label_encoder = MultiLabelBinarizer().fit(train_data["tags"])
y_train = label_encoder.transform(train_data["tags"])

In [22]:
# Embed the test questions with the same word vectors used for training.
X_test = test_data["q_all_body_tokenized"].apply(
    lambda tokens: np.array([wv[token] for token in tokens])
)
# Pad to the configured length rather than to the longest test question, so the
# time dimension does not depend on which questions ended up in this split.
X_test_padded = pad_sequences(
    X_test,
    maxlen=max_question_words,
    padding="post",
    truncating="post",
    dtype='float32',
)
# Encode the test tags with the encoder that was fitted on the training tags.
y_test = label_encoder.transform(test_data["tags"])

### Train Model

In [23]:
# Take the layer sizes from the prepared arrays instead of repeating literals:
# the input width is the last axis of the padded embedding tensor and the output
# width is the number of label columns produced by the label encoder. This way
# the model cannot drift out of sync with the data when the settings change.
model = create_model(
    embedding_dim=X_train_padded.shape[-1],
    output_dim=y_train.shape[1],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               12900     
Total params: 477,156
Trainable params: 477,156
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Cast the binarized label matrices to float32 before fitting.
y_train = y_train.astype("float32")
y_test = y_test.astype("float32")

# validation_data is passed as a tuple (x_val, y_val), the form documented for
# Model.fit; a list is not unpacked as an (x, y) pair by every TensorFlow release.
# NOTE(review): the test split doubles as validation data here — confirm that
# no tuning decisions are based on these validation scores.
model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_data=(X_test_padded, y_test),
)

Train on 13182 samples, validate on 3296 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x2a0ae89a148>

In [25]:
# Index of the first active label for each of the first 30 training rows.
# y_train comes from a MultiLabelBinarizer, so a row can have several active
# labels; argmax reports only one of them.
np.argmax(y_train[:30], axis=-1)

array([19, 35, 66, 12, 10,  3, 31, 12, 32,  1, 24,  0, 12, 55, 40,  1, 82,
       66, 73, 37, 51, 66, 23, 73, 57, 43, 12, 65, 84, 12], dtype=int64)

In [28]:
# Highest-scoring label index for each of the first 30 training rows.
# The scores are computed inside this cell so that it also runs on a fresh
# kernel; it previously read `predictions`, which is only assigned further down
# in the evaluation section.
np.argmax(model.predict(X_train_padded[:30]), axis=-1)

array([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12], dtype=int64)

In [107]:
# Inspect the padded embedding matrix of the first training question
# (padding is appended at the end, see padding="post").
# NOTE(review): the saved output shows dtype=int32 although pad_sequences is
# called with dtype='float32', and this cell's execution count is out of
# sequence — the output looks stale; re-run to confirm.
X_train_padded[0]

array([[ 2,  4,  6, ...,  1, -1, -1],
       [ 1,  1, -2, ..., -4, -3, -2],
       [ 0,  0,  1, ...,  2,  1,  6],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int32)

## Model Evaluation

In [27]:
n_predictions = 100

# Score the first n_predictions rows of the training tensor.
predictions = model.predict(X_train_padded[:n_predictions])

# Turn the thresholded scores and the true label rows back into tag tuples.
l_pred = label_encoder.inverse_transform(binarize_model_output(predictions, threshold=0.1))
l_true = label_encoder.inverse_transform(y_train[:n_predictions])

# Matching tokenized and raw question texts (positional slices).
texts = train_data["q_all_body_tokenized"].iloc[:n_predictions]
raw_texts = train_data["Body_q"].iloc[:n_predictions]

# Print true vs. predicted tags followed by the question in both forms.
for predicted_tags, true_tags, tokens, body in zip(l_pred, l_true, texts, raw_texts):
    print(f"TRUE: {true_tags}\nPREDICTION: {predicted_tags}\n")
    print(tokens)
    print(body)

TRUE: ('file', 'function')
PREDICTION: ('django', 'numpy', 'pandas', 'python-2.7', 'python-3.x')

['i', "'m", 'very', 'new', 'to', 'python', 'and', 'i', "'ve", 'spent', 'a', 'while', 'trying', 'to', 'get', 'a', 'function', 'working', 'to', 'snip', 'a', 'pwdump', 'format', 'file', 'into', 'just', 'the', 'username', ':', 'ntlmhash', 'function', 'i', "'d", 'ideally', 'like', 'the', 'code', 'to', 'output', 'the', 'snipped', 'hashes', 'from', 'the', 'input', 'file', 'into', 'a', 'new', 'text', 'file', 'but', 'overwriting', 'is', 'good', 'enough', 'at', 'the', 'moment', 'if', 'the', 'new', 'file', 'is', 'a', 'bit', 'too', 'much', 'any', 'help', 'would', 'be', 'really', 'appreciated', ':', ')', 'thanks']
I'm very new to Python and I've spent a while trying to get a function working to snip a pwdump format file into just the username:NTLMhash function. I'd ideally like the code to output the snipped hashes from the input file into a new text file, but overwriting is good enough at the moment i

In [57]:
# Binarize the scores held in `predictions` using 0.1 (the value passed as
# `threshold` in the preceding cell) and pair them with the matching label rows.
# NOTE(review): the labels are the first n_predictions *training* rows, so the
# reported metrics are not computed on the test split — confirm this is intended.
l_pred_binary = binarize_model_output(predictions, 0.1)
l_true_binary = y_train[:n_predictions]
# Print the parameter settings, the model summary and the metrics.
output_evaluation(model, sample_size, max_question_words, n_top_labels, l_true_binary, l_pred_binary)

Model Evaluation

Parameter Settings:
 Sample size = 100000, Max. number of words per question = 100, Number of Top Labels used = 100

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               12900     
Total params: 477,156
Trainable params: 477,156
Non-trainable params: 0
_________________________________________________________________
None

Metrics:
 Macro Evaluation: f1_Score= 0.008026927522340365 , Recall = 0.05 , Precision = 0.0044
 Micro Eval