In [None]:
import pymongo
import pandas as pd
import numpy as np
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.distribution import aggregate
from notebook_modules.lists import save_db, load_db

In [None]:
# Data-source switch: when True, later cells read previously saved results via
# load_db(); when False, they query the database and save the results.
load_from_fs = True

if not load_from_fs:
    db = Database()
    assert db.client, "No database client available!"
    stackoverflow = db.client["stackoverflow"]
    # Collection handles; `posts` is what the aggregation cells read from.
    tags, posts = stackoverflow["tags"], stackoverflow["posts"]

In [None]:
# Reference period handed to aggregate() when building `full_tags`.
# NOTE(review): presumably the second half of 2019 — confirm against Half.make_half.
current_period = Half.make_half(2019, 2)
# Periods iterated when building the time series.
# NOTE(review): presumably every half-year from 2008 to 2020 — confirm whether
# the end year is inclusive in Half.make_halves.
halves = Half.make_halves(2008, 2020)

In [None]:
# Tag table for `current_period`: either aggregated from `posts` and saved
# with save_db(), or read back with load_db(), depending on `load_from_fs`.
if not load_from_fs:
    full_tags = aggregate(posts, current_period, full=True)
    save_db(full_tags, topic="tsa", name="full-tags")
else:
    full_tags = load_db(topic="tsa", name="full-tags")
full_tags

In [None]:
def aggregate_time_periods(periods, unit="half"):
    """Build one tag-frequency record per period.

    Each period is passed to ``aggregate`` together with the module-level
    ``posts`` collection, so ``posts`` must exist before this is called
    (it is only defined when ``load_from_fs`` is False).

    Parameters
    ----------
    periods : iterable
        Period objects; each must expose an ``end`` attribute.
    unit : str, optional
        Unit label shown by the tqdm progress bar.

    Returns
    -------
    list of dict
        One dict per period, in input order. ``"_date"`` holds ``period.end``;
        when the aggregation result is non-empty, every value of its ``tag``
        column becomes a key mapped to the matching ``frequency`` value.
    """
    records = []
    progress = tqdm(periods, unit=unit, ascii=True)

    for period in progress:
        frame = aggregate(posts, period)
        record = {"_date": period.end}
        # An empty result contributes only the date; its columns are not touched.
        if not frame.empty:
            record.update(zip(frame.tag.values, frame.frequency.values))
        records.append(record)

    return records

In [None]:
if not load_from_fs:
    time_periods = aggregate_time_periods(halves, unit="half")
    # Column order: the date first, then every tag listed in `full_tags`.
    series_columns = ["_date", *full_tags.tag.values]
    # A tag missing from a period's record becomes NaN; treat it as frequency 0.
    time_series = pd.DataFrame(time_periods, columns=series_columns).fillna(0)
    save_db(time_series, topic="tsa", name="time-series")
else:
    time_series = load_db(topic="tsa", name="time-series")

In [None]:
# Promote the "_date" column to the index. The guard keeps this cell
# re-runnable: after the first run "_date" is the index and no longer a column,
# so converting it again unconditionally would raise.
if "_date" in time_series.columns:
    # Bracket access instead of attribute access: attribute assignment only
    # works for columns that already exist and is discouraged by pandas.
    time_series = time_series.assign(_date=pd.to_datetime(time_series["_date"])).set_index("_date")
elif not isinstance(time_series.index, pd.DatetimeIndex):
    raise KeyError("time_series has neither a '_date' column nor a DatetimeIndex")

# Rebuild the index from its raw values so that it carries an explicit
# frequency (None when pandas cannot infer one). This also drops the index name.
time_series.index = pd.DatetimeIndex(time_series.index.values, freq=time_series.index.inferred_freq)

In [None]:
# Keep only the tags whose frequency summed over all periods is at least the threshold.
min_total_frequency = 280.0
tag_totals = time_series.sum(axis="rows")
time_series = time_series.loc[:, tag_totals >= min_total_frequency]

# Positional split: the first 23 rows form the training set, the rest the test set.
n_train_periods = 23
train = time_series.iloc[:n_train_periods, :]
test = time_series.iloc[n_train_periods:, :]
time_series

In [None]:
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import Dense, LSTM, Dropout
from keras.models import Sequential
from keras.utils import plot_model
from keras import backend

In [None]:
def _first_column(frame):
    """Return the column of ``frame`` selected by its first column label."""
    return frame[frame.columns[0]]


# Work on a single tag: the first column of the full, train and test frames.
full_current = _first_column(time_series)
train_current = _first_column(train)
test_current = _first_column(test)

# reshape(-1, 1) turns each series into a single-column 2-D array for the scaler.
train_data = train_current.values.reshape(-1, 1)
test_data = test_current.values.reshape(-1, 1)

# The scaler is fitted on the training data only and then applied to both sets.
scaler = MinMaxScaler().fit(train_data)

train_scaled = scaler.transform(train_data)
test_scaled = scaler.transform(test_data)

In [None]:
# n_input: window length fed to the generator and the LSTM input shape;
# n_features: values per time step (also the Dense output size);
# n_units: number of LSTM units.
n_input, n_features, n_units = 1, 1, 20

# The scaled training series serves as both the input data and the targets.
generator = TimeseriesGenerator(
    data=train_scaled,
    targets=train_scaled,
    length=n_input,
    batch_size=1,
)

In [None]:
# LSTM layer -> dropout -> dense output layer with `n_features` units.
model = Sequential(
    [
        LSTM(units=n_units, input_shape=(n_input, n_features)),
        Dropout(0.15),
        Dense(units=n_features),
    ]
)

model.compile(loss="mean_squared_error", optimizer="adam")

# NOTE(review): fit_generator is deprecated in recent Keras releases in favour
# of fit() — confirm against the installed version before switching.
model.fit_generator(generator, epochs=10)

In [None]:
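# Optional: uncomment to render the model architecture to a PNG (plot_model needs pydot and graphviz installed).
# plot_model(model, show_shapes=True, show_layer_names=True, to_file="output/models/model-tsa.png")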