In [None]:
import pymongo
import pandas as pd
import numpy as np
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.distribution import aggregate
from notebook_modules.lists import save_db, load_db

In [None]:
# Data-source switch: when True the notebook reads previously saved artefacts
# through load_db(); when False it opens the database and recomputes them.
load_from_fs = True

if not load_from_fs:
    db = Database()
    assert db.client, "No database client available!"
    stackoverflow = db.client["stackoverflow"]
    # Collection handles used by the aggregation cells further down.
    tags, posts = (stackoverflow[name] for name in ("tags", "posts"))

In [None]:
# Reference period handed to aggregate(..., full=True) when the full tag table is built.
current_period = Half.make_half(2019, 2)
# Periods iterated when the per-period time series is built
# (presumably every half-year from 2008 to 2020 — confirm the bounds in Half.make_halves).
halves = Half.make_halves(2008, 2020)

In [None]:
# Overall tag table: recompute and persist it when working against the
# database, otherwise read the persisted copy.
if not load_from_fs:
    full_tags = aggregate(posts, current_period, full=True)
    save_db(full_tags, topic="tsa", name="full-tags")
else:
    full_tags = load_db(topic="tsa", name="full-tags")
full_tags

In [None]:
def aggregate_time_periods(periods, unit="half"):
    """Collect per-period tag frequencies as a list of records.

    For every period ``aggregate(posts, period)`` is evaluated. ``posts`` is
    the module-level collection handle, which is only defined when
    ``load_from_fs`` is False. Each record stores the period end under
    ``"_date"`` plus one ``tag -> frequency`` entry per row of the aggregation
    result; an empty result produces a record that only contains ``"_date"``.

    Args:
        periods: Iterable of period objects exposing an ``end`` attribute.
        unit: Unit label shown by the tqdm progress bar.

    Returns:
        A list of dicts, one per period, in iteration order.
    """
    def to_record(period):
        tag_counts = aggregate(posts, period)
        record = {"_date": period.end}
        if not tag_counts.empty:
            record.update(zip(tag_counts.tag.values, tag_counts.frequency.values))
        return record

    return [to_record(period) for period in tqdm(periods, unit=unit, ascii=True)]

In [None]:
# Wide per-period frequency table: one row per period, one column per tag.
if not load_from_fs:
    time_periods = aggregate_time_periods(halves, unit="half")
    # Column order follows the full tag table; tags missing in a period become 0.
    tag_columns = ["_date", *full_tags.tag.values]
    time_series = pd.DataFrame(time_periods, columns=tag_columns).fillna(0)
    save_db(time_series, topic="tsa", name="time-series")
else:
    time_series = load_db(topic="tsa", name="time-series")

In [None]:
# Move the "_date" column into a DatetimeIndex that carries its inferred
# frequency. The guard keeps the cell re-runnable: once "_date" has become the
# index the column no longer exists, so the conversion is skipped instead of
# raising. A new frame is bound rather than mutating the loaded one in place.
if "_date" in time_series.columns:
    time_series = time_series.assign(_date=pd.to_datetime(time_series["_date"])).set_index("_date")

# Rebuild the index from its raw values so the frequency can be attached
# (inferred_freq is None when no regular frequency can be detected).
period_ends = pd.DatetimeIndex(pd.to_datetime(time_series.index))
time_series.index = pd.DatetimeIndex(period_ends.values, freq=period_ends.inferred_freq)

In [None]:
# Transpose so that every row is one tag and every column one period.
time_series_data_full = time_series.T
time_series_data_full.index.name = "tag"

# the top 18 % (n=10205) of all tags cover 95,61 % of all used tags in questions
top_tag_share = 0.18
time_series_data = time_series_data_full.head(int(len(time_series_data_full) * top_tag_share))

# remove all tags that don't occur in the second last time period
time_series_data_full = time_series_data_full.loc[time_series_data_full.iloc[:, -2] > 0]
time_series_data = time_series_data.loc[time_series_data.iloc[:, -2] > 0]
time_series_data

In [None]:
class_threshold = 0.05


def classify(d):
    """Map a relative decrease to a class label.

    Labels: {"decaying": 1, "undecaying": 0}. A value is "decaying" when the
    decrease is at least ``class_threshold`` (read at call time).
    """
    return 1 if d >= class_threshold else 0


def generate_classes(data, threshold=None):
    """Label every row by the relative drop between its last two columns.

    The relative decrease is ``(old - new) / old`` where ``old`` is the
    second-to-last column and ``new`` the last one. Rows whose decrease is at
    least the threshold get class 1 ("decaying"), all others class 0.

    Rows with a zero (or missing) baseline have no defined relative decrease;
    they are labelled 0 instead of raising a division error.

    Args:
        data: DataFrame with one row per tag and at least two value columns.
        threshold: Minimum relative decrease for class 1. Defaults to the
            module-level ``class_threshold``.

    Returns:
        DataFrame indexed by ``"tag"`` (same order as ``data``) with a single
        integer column ``"class"``. An empty input gives an empty result.

    Raises:
        ValueError: If ``data`` has fewer than two columns.
    """
    if data.shape[1] < 2:
        raise ValueError("generate_classes needs at least two period columns")
    if threshold is None:
        threshold = class_threshold

    old_values = data.iloc[:, -2].astype("float64")
    new_values = data.iloc[:, -1].astype("float64")

    # Zero baselines are masked to NaN; NaN >= threshold is False, i.e. class 0.
    decrease = (old_values - new_values) / old_values.where(old_values != 0)
    labels = (decrease >= threshold).astype("int64")

    # rename() returns a new Index, so the caller's index name is left untouched.
    return pd.DataFrame({"class": labels.values}, index=data.index.rename("tag"))

In [None]:
# Derive labels for the training subset (top tags) and for the complete tag set.
time_series_classes = generate_classes(time_series_data)
time_series_classes_full = generate_classes(time_series_data_full)
time_series_classes

In [None]:
import tensorflow as tf
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
def recall(y_true, y_pred):
    """Recall metric: true positives divided by actual positives.

    Both counts are taken after clipping to [0, 1] and rounding, and
    ``K.epsilon()`` is added to the denominator to avoid dividing by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision(y_true, y_pred):
    """Precision metric: true positives divided by predicted positives.

    Both counts are taken after clipping to [0, 1] and rounding, and
    ``K.epsilon()`` is added to the denominator to avoid dividing by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())

def f1_score(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision`` and ``recall``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

def f1_loss(y_true, y_pred):
    """F1-based loss: ``1 - mean(F1)``.

    Unlike the ``f1_score`` metric, the confusion counts here are built from
    the un-rounded products of ``y_true`` and ``y_pred`` and summed along
    axis 0 (presumably the batch axis — confirm against the model output).
    True negatives are not needed for F1 and are therefore not computed.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predictions, same shape as ``y_true``.

    Returns:
        Scalar tensor; lower is better.
    """
    true_positives = K.sum(K.cast(y_true * y_pred, "float"), axis=0)
    false_positives = K.sum(K.cast((1 - y_true) * y_pred, "float"), axis=0)
    false_negatives = K.sum(K.cast(y_true * (1 - y_pred), "float"), axis=0)

    # K.epsilon() keeps every denominator strictly positive.
    pre = true_positives / (true_positives + false_positives + K.epsilon())
    rec = true_positives / (true_positives + false_negatives + K.epsilon())

    f1 = 2 * pre * rec / (pre + rec + K.epsilon())
    # Replace any NaN entries by 0 so they count as worst-case F1.
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [None]:
# Hold out 10 % of the tags for testing. A fixed random_state makes the split,
# and therefore every metric reported afterwards, reproducible across kernel
# restarts.
# NOTE(review): the feature matrix still contains the last two period columns,
# the same two columns generate_classes() derives the label from — confirm
# that this is intended and not label leakage.
x_train, x_test, y_train, y_test = train_test_split(
    time_series_data, time_series_classes, test_size=0.1, random_state=42
)
# Despite its name this holds the feature column labels, not a count;
# consumers take len(n_features).
n_features = x_train.columns

In [None]:
# Feed-forward binary classifier: two hidden ReLU layers with 100 units each
# and one sigmoid output unit yielding a probability between 0 and 1.
model = Sequential([
    Dense(100, input_dim=len(n_features), activation="relu"),  # first layer requires input_dim
    Dense(100, activation="relu"),
    Dense(1, activation="sigmoid"),
])

# Adam optimiser with binary cross-entropy; the custom metrics are reported
# next to accuracy.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", f1_score, precision, recall],
)

# Train the network; the held-out split is used as validation data.
model.fit(x_train, y_train, epochs=200, batch_size=32, validation_data=(x_test, y_test))

In [None]:
# Score the held-out split and show the results keyed by metric name.
metrics = model.evaluate(x_test, y_test)
{name: value for name, value in zip(model.metrics_names, metrics)}

In [None]:
# Score the complete tag set and show the results keyed by metric name.
# NOTE(review): this set overlaps the rows the model was trained on, so the
# numbers are not an out-of-sample estimate.
metrics = model.evaluate(time_series_data_full, time_series_classes_full)
{name: value for name, value in zip(model.metrics_names, metrics)}