In [None]:
from pathlib import Path

import pymongo
import pandas as pd
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.classification import aggregate_features, split_top, split_class, classify
from notebook_modules.lists import make_list
from notebook_modules.plots import make_distribution_plot

In [None]:
# Connect through the project's Database wrapper and stop right away if it
# did not produce a client.
db = Database()
assert db.client, "No database client available!"

# Handles used by the rest of the notebook: the "stackoverflow" database and
# its "posts" and "tags" collections.
stackoverflow = db.client["stackoverflow"]
posts, tags = stackoverflow["posts"], stackoverflow["tags"]

In [None]:
# Aggregate tag features for the two periods used for training.
# NOTE(review): (2018, 2) and (2019, 1) presumably mean "second half of 2018"
# and "first half of 2019" -- confirm against Half.make_half.
tags_h1, tags_h2 = (
    aggregate_features(posts, Half.make_half(year, half_index))
    for year, half_index in ((2018, 2), (2019, 1))
)

In [None]:
# Apply the same 0.2 threshold to both periods, then combine the current
# period with the following one into the training dataset.
top_h1, top_h2 = (
    split_top(period_features, threshold=0.2)
    for period_features in (tags_h1, tags_h2)
)
dataset = classify(top_current=top_h1, top_next=top_h2)
dataset

In [None]:
import tensorflow as tf
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.utils import plot_model
from sklearn.model_selection import train_test_split

In [None]:
def recall(y_true, y_pred):
    """Recall over the tensors passed in: true positives / actual positives.

    Both counts are taken after clipping to [0, 1] and rounding, and are
    summed over every element. ``K.epsilon()`` is added to the denominator
    so the division is defined when there are no positives.

    NOTE(review): assumes ``y_true`` holds 0/1 labels and ``y_pred`` holds
    probabilities -- confirm against ``split_class``. When used through
    ``metrics=`` the value is presumably computed per batch; verify before
    comparing it with a dataset-level recall.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision(y_true, y_pred):
    """Precision over the tensors passed in: true positives / predicted positives.

    Both counts are taken after clipping to [0, 1] and rounding, and are
    summed over every element. ``K.epsilon()`` is added to the denominator
    so the division is defined when nothing is predicted positive.

    NOTE(review): assumes ``y_true`` holds 0/1 labels and ``y_pred`` holds
    probabilities -- confirm against ``split_class``.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_score(y_true, y_pred):
    """F1 score: harmonic mean of ``precision`` and ``recall`` for the same inputs.

    ``K.epsilon()`` in the denominator keeps the result defined when both
    precision and recall are zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    balanced = (p * r) / (p + r + K.epsilon())
    return 2 * balanced

def f1_loss(y_true, y_pred):
    """Differentiable ("soft") F1 loss: ``1 - mean(F1)``.

    Unlike ``f1_score`` nothing is rounded, so each prediction contributes
    fractionally to the true-positive / false-positive / false-negative
    totals. The totals are summed along axis 0, an F1 value is computed for
    each remaining position, and those values are averaged.

    NOTE(review): assumes axis 0 is the batch axis and that ``y_true`` is 0/1
    and ``y_pred`` lies in [0, 1] -- confirm against the caller.

    Returns:
        ``1 - K.mean(f1)``, so a higher F1 gives a lower loss.
    """
    # True negatives do not appear in precision, recall or F1, so they are
    # not computed.
    true_positives = K.sum(K.cast(y_true * y_pred, "float"), axis=0)
    false_positives = K.sum(K.cast((1 - y_true) * y_pred, "float"), axis=0)
    false_negatives = K.sum(K.cast(y_true * (1 - y_pred), "float"), axis=0)

    # K.epsilon() keeps every denominator non-zero.
    pre = true_positives / (true_positives + false_positives + K.epsilon())
    rec = true_positives / (true_positives + false_negatives + K.epsilon())

    f1 = 2 * pre * rec / (pre + rec + K.epsilon())
    # Defensive: replace any NaN entry with 0 before averaging.
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [None]:
# Split features (x) and labels (y) into a train and a test set, holding out 10%.
# random_state is fixed so the same rows land in each split on every
# Restart & Run All; without it the reported metrics change from run to run.
x, y = split_class(dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [None]:
# Feed-forward binary classifier: one hidden ReLU layer of 10 units followed
# by a single sigmoid unit, so the output lies between 0 and 1.
n_features = len(x_train.columns)
model = Sequential([
    Dense(10, input_dim=n_features, activation="relu"),  # first layer declares the input width
    Dense(1, activation="sigmoid"),
])

# Adam optimizer with binary cross-entropy loss; the custom F1 / precision /
# recall functions are reported next to accuracy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", f1_score, precision, recall],
)

# Train for one epoch, using the held-out split as validation data.
model.fit(x_train, y_train, epochs=1, validation_data=(x_test, y_test))

In [None]:
# Create the output directory first so this cell also works on a fresh
# checkout where "output/models" does not exist yet.
model_plot_path = Path("output/models/model.png")
model_plot_path.parent.mkdir(parents=True, exist_ok=True)
plot_model(model, show_shapes=True, show_layer_names=True, to_file=str(model_plot_path))

In [None]:
# Aggregate tag features for the two periods used for evaluation.
# NOTE(review): the (2019, 1) aggregation uses the same arguments as the call
# that produced tags_h2 earlier; it could be reused if aggregate_features is
# deterministic and its result is not mutated downstream -- confirm.
tags_h1_eval, tags_h2_eval = (
    aggregate_features(posts, Half.make_half(year, half_index))
    for year, half_index in ((2019, 1), (2019, 2))
)

In [None]:
# Same 0.2 threshold and pairing as for the training dataset, applied to the
# evaluation periods.
top_h1_eval, top_h2_eval = (
    split_top(period_features, threshold=0.2)
    for period_features in (tags_h1_eval, tags_h2_eval)
)
dataset_eval = classify(top_current=top_h1_eval, top_next=top_h2_eval)
dataset_eval

In [None]:
# Score the trained model on the evaluation dataset and display each value
# next to the name Keras reports for it.
x_eval, y_eval = split_class(dataset_eval)
metrics = model.evaluate(x_eval, y_eval)
{metric_name: metric_value for metric_name, metric_value in zip(model.metrics_names, metrics)}