In [None]:
import pymongo
import pandas as pd
import numpy as np
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.distribution import aggregate
from notebook_modules.lists import save_db, load_db

In [None]:
# Switch between cached results (True, read via load_db in later cells) and a
# live database query (False, results are re-saved via save_db).
load_from_fs = True

if not load_from_fs:
    # Database handles are only needed when the aggregations are recomputed.
    db = Database()
    assert db.client, "No database client available!"
    stackoverflow = db.client["stackoverflow"]
    tags, posts = stackoverflow["tags"], stackoverflow["posts"]

In [None]:
# Period used for the full tag aggregation below (arguments 2019 and 2 —
# presumably the second half-year of 2019; confirm against Half.make_half).
current_period = Half.make_half(2019, 2)
# Periods that become the rows of the time series (arguments 2008 and 2020;
# whether the end year is inclusive depends on Half.make_halves — TODO confirm).
halves = Half.make_halves(2008, 2020)

In [None]:
# Tag table for `current_period`: either recomputed from the `posts` collection
# and written to the cache, or read back from the cache.
if not load_from_fs:
    full_tags = aggregate(posts, current_period, full=True)
    save_db(full_tags, topic="tsa", name="full-tags")
else:
    full_tags = load_db(topic="tsa", name="full-tags")
full_tags

In [None]:
def aggregate_time_periods(periods, unit="half"):
    """Collect per-period tag frequencies as a list of records.

    For every period, ``aggregate(posts, period)`` is called on the
    module-level ``posts`` collection. Each record stores the period's ``end``
    under ``"_date"`` plus one entry per tag mapping the tag to its frequency.
    A period whose aggregation is empty contributes a record holding only
    ``"_date"``.

    Args:
        periods: Iterable of period objects exposing an ``end`` attribute.
        unit: Unit label forwarded to the tqdm progress bar.

    Returns:
        list[dict]: One record per period, in iteration order.
    """
    records = []

    for period in tqdm(periods, unit=unit, ascii=True):
        period_tags = aggregate(posts, period)
        record = {"_date": period.end}
        # Only touch the tag/frequency columns when the result has rows; an
        # empty frame may not carry those columns at all.
        if not period_tags.empty:
            record.update(zip(period_tags.tag.values, period_tags.frequency.values))
        records.append(record)

    return records

In [None]:
if not load_from_fs:
    # One record per entry of `halves`; columns follow the order of
    # full_tags.tag, and tags missing from a period are filled with 0.
    time_periods = aggregate_time_periods(halves, unit="half")
    time_series = pd.DataFrame(
        time_periods,
        columns=["_date", *full_tags.tag.values],
    ).fillna(0)
    save_db(time_series, topic="tsa", name="time-series")
else:
    time_series = load_db(topic="tsa", name="time-series")

In [None]:
# Parse the "_date" column and promote it to the row index.
time_series = time_series.assign(_date=pd.to_datetime(time_series["_date"])).set_index("_date")
# Rebuild the index as a DatetimeIndex carrying the frequency inferred from its values.
inferred_index = pd.DatetimeIndex(time_series.index.values, freq=time_series.index.inferred_freq)
time_series.index = inferred_index

In [None]:
# Transpose: one row per tag, one column per time period.
time_series_data = time_series.T
time_series_data.index.name = "tag"
time_series_data_full = time_series_data

# the top 18 % (n=10205) of all tags cover 95.61 % of all tags used in questions
time_series_data = time_series_data.head(int(0.18 * len(time_series_data)))

# drop every tag that does not occur in the second-to-last time period
time_series_data_full = time_series_data_full.loc[time_series_data_full.iloc[:, -2] > 0]
time_series_data = time_series_data.loc[time_series_data.iloc[:, -2] > 0]
time_series_data

In [None]:
from keras.models import Sequential
from keras.layers import Input, Dense, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Scale every time-period column to [0, 1] independently (min/max are taken
# across the tag rows of that column).
scaler = MinMaxScaler(feature_range=(0, 1))
# Build a fresh frame instead of assigning into `time_series_data`, which is a
# filtered slice of the transposed series: column assignment on such a slice
# triggers pandas' SettingWithCopyWarning.
time_series_data = pd.DataFrame(
    scaler.fit_transform(time_series_data),
    index=time_series_data.index,
    columns=time_series_data.columns,
)

In [None]:
# Model inputs: every period column except the last one.
time_series_data_current = time_series_data.to_numpy()[:, :-1]
# Regression target: the last period column.
time_series_data_future = time_series_data.to_numpy()[:, -1]

In [None]:
# Hold out 10 % of the tags for testing. The fixed random_state keeps the
# shuffle identical across kernel restarts, so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    time_series_data_current,
    time_series_data_future,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Append a trailing axis of size 1 to the 2-D feature matrices so each period
# value becomes a one-element feature vector for the LSTM layer.
x_train_lstm = x_train[:, :, np.newaxis]
x_test_lstm = x_test[:, :, np.newaxis]

# Targets are used unchanged.
y_train_lstm, y_test_lstm = y_train, y_test

In [None]:
# One LSTM layer with 200 units followed by a single linear output unit.
model = Sequential([
    LSTM(200),
    Dense(1, activation=None),
])

model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mse", "mae", "mape", "cosine"],
)

# The held-out split is passed as validation data, so it is evaluated after every epoch.
model.fit(
    x_train_lstm,
    y_train_lstm,
    epochs=5,
    batch_size=32,
    validation_data=(x_test_lstm, y_test_lstm),
)

In [None]:
# Evaluate on the held-out split and label each value with its metric name.
metrics = model.evaluate(x_test_lstm, y_test_lstm)
{metric_name: metric_value for metric_name, metric_value in zip(model.metrics_names, metrics)}

In [None]:
# Predict the last period for every tag from its scaled history.
predicted_future = model.predict(np.expand_dims(time_series_data_current, axis=2))
# `scaler` was fitted on all period columns, so scaler.inverse_transform() on
# this single-column prediction fails with a shape mismatch. MinMaxScaler
# applies X_scaled = X * scale_ + min_ per column, so undo it with the
# parameters of the last (target) column only.
(predicted_future - scaler.min_[-1]) / scaler.scale_[-1]