# Training Pipeline

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from nltk.tokenize import word_tokenize
from toolbox.data_prep_helpers import *

from models.lstm_classifier import create_model

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
import numpy as np

In [2]:
# Directory holding the dataset (relative to the notebook); passed to load_data() below.
data_path = "../data/pythonquestions/"

In [3]:
# Fetch NLTK's "punkt" package; the download is skipped when it is already up to date.
# NOTE(review): presumably required by the tokenizer helpers used further down — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Data

In [None]:
# Load the dataset with the project's load_data helper (not defined in this
# notebook; presumably star-imported from toolbox.data_prep_helpers).
df = load_data(data_path)
# Display the shape of the loaded frame.
df.shape

In [None]:
# Work on a random subset of at most 100,000 rows.
# - random_state makes the subset (and everything derived from it) reproducible
#   across notebook runs.
# - The size is capped at len(df) because DataFrame.sample raises a ValueError
#   when asked for more rows than the frame holds (without replacement).
sample = df.sample(n=min(100000, len(df)), random_state=42)
# The return value is ignored, so the helper presumably strips HTML from the
# "Body_q" column in place — TODO confirm against toolbox.data_prep_helpers.
remove_html_tags(sample, ["Body_q"])
sample.head()

In [None]:
# Spot-check a single question body (positional row 100) after the HTML removal step.
sample["Body_q"].iloc[100]

### Clean Data

In [None]:
# Some tag lists contain non-string entries (NaNs), which break target
# encoding, so keep only the rows whose tags are all strings.
print(sample.shape)
has_only_str_tags = sample["tags"].apply(
    lambda tags: all(isinstance(t, str) for t in tags)
)
# .copy() yields an independent frame, so the column assignment further down
# does not write into a slice of the previous frame (SettingWithCopyWarning).
sample = sample[has_only_str_tags].copy()
print(sample.shape)

# Reduce the number of possible tags.
all_tags = np.array([tag for tags in sample["tags"] for tag in tags])
unique_tags = np.unique(all_tags, return_counts=True)
# argsort is ascending, so the last 101 positions are the most frequent tags,
# with the single most frequent one at the very end.
top_tags = unique_tags[0][np.argsort(unique_tags[1])[-101:]]  # keep 101, we will remove "python"
# NOTE(review): this drops whichever tag is most frequent, on the assumption
# that it is "python"; the print below shows which tag was actually removed.
print(f"deleting element {top_tags[-1]} from top_tags")
top_tags = np.delete(top_tags, -1)

# Keep only tags within the remaining top tags (this also removes the tag
# deleted above). A set of plain Python strings gives O(1) membership tests
# instead of scanning the NumPy array once per tag.
top_tag_set = set(top_tags.tolist())
sample["tags"] = sample["tags"].apply(lambda x: [tag for tag in x if tag in top_tag_set])

# Remove rows whose tag list became empty through the filtering above.
# .copy() again, because later cells add columns to this frame.
sample = sample[sample["tags"].apply(len) > 0].copy()
sample.shape

In [None]:
# Inspect the first ten tag lists after filtering.
sample["tags"].head(10)

### Prepare Training and Test Data

In [None]:
def generate_question_level_tokens(txt):
    """Tokenize a question body into one flat list of word tokens.

    The text is split into sentences with ``sent_tokenize_text`` and every
    sentence into words with ``word_tokenize_sent`` (both helpers are defined
    outside this notebook). The per-sentence token lists are concatenated in
    sentence order.
    """
    tokens = []
    for sentence in sent_tokenize_text(txt):
        tokens.extend(word_tokenize_sent(sentence))
    return tokens

In [None]:
# Tokenize every question body into a flat list of word tokens.
sample["q_all_body_tokenized"] = sample["Body_q"].apply(generate_question_level_tokens)
# Keep only questions with at most 100 tokens.
data = sample[sample["q_all_body_tokenized"].apply(len) <= 100]
# 80/20 train/test split; the fixed seed keeps the split (and therefore the
# embeddings and metrics derived from it) reproducible across runs.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
print(train_data.shape)
print(test_data.shape)

In [None]:
# train word embeddings ONLY with training data
# NOTE(review): create_Word2Vec_embeddings is not defined in this notebook. It is
# given the raw "Body_q" column, while later lookups use the tokens stored in
# "q_all_body_tokenized" — presumably the helper tokenizes the same way; confirm.
wv = create_Word2Vec_embeddings(train_data, "Body_q")

In [None]:
# Turn each tokenized question into an array of word vectors, one per token.
# Tokens without an entry in wv are skipped instead of raising KeyError on
# lookup.
# NOTE(review): whether training tokens can be missing from wv depends on
# create_Word2Vec_embeddings (e.g. rare-word pruning), which is not visible
# here; this also assumes wv supports `in`, as gensim KeyedVectors does.
X_train = train_data["q_all_body_tokenized"].apply(
    lambda tokens: np.array([wv[tok] for tok in tokens if tok in wv])
)

In [None]:
# Zero-pad every question at the end ("post") to the length of the longest one.
# pad_sequences defaults to dtype="int32", which would truncate the float word
# vectors to integers, so request float32 explicitly.
# NOTE(review): without maxlen the padded length depends on the longest
# question in this split; test data must be padded to a compatible length.
X_train_padded = pad_sequences(X_train, dtype="float32", padding="post")
X_train_padded.shape

In [None]:
# Multi-hot encode the tag lists. The encoder learns its classes from the
# training tags only and is reused for the test tags later on.
label_encoder = MultiLabelBinarizer()
y_train = label_encoder.fit_transform(train_data["tags"])

In [None]:
# Build the test features the same way as the training features. The
# embeddings were trained on the training split only, so test questions can
# contain tokens that have no vector; those are skipped instead of raising
# KeyError on lookup (assumes wv supports `in`, as gensim KeyedVectors does).
# NOTE(review): X_test is not padded in this cell — it needs the same padding
# and dtype as the training input before it can be fed to the model.
X_test = test_data["q_all_body_tokenized"].apply(
    lambda tokens: np.array([wv[tok] for tok in tokens if tok in wv])
)
# Encode with the encoder fitted on the training tags. Tags it did not see
# during fit are ignored by MultiLabelBinarizer.transform (with a warning).
y_test = label_encoder.transform(test_data["tags"])

### Train Model

In [None]:
# Size the output from the fitted label encoder instead of hard-coding 100:
# a top-100 tag that does not occur in the training split is absent from
# classes_, and y_train then has fewer than 100 columns.
# NOTE(review): presumably output_dim is the number of output units and
# embedding_dim must equal the word-vector size produced by
# create_Word2Vec_embeddings — confirm against models.lstm_classifier.
model = create_model(embedding_dim=100, output_dim=len(label_encoder.classes_))
model.summary()

In [None]:
# Train for 10 epochs with batches of 128 on the padded training sequences
# and the encoded tag targets.
# NOTE(review): no validation data is passed; X_test / y_test are not used in this call.
model.fit(x=X_train_padded, y=y_train, batch_size=128, epochs=10)

In [None]:
# Preview the first rows of the training frame.
train_data.head()