In [None]:
import pymongo
import pandas as pd
import numpy as np
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.distribution import aggregate
from notebook_modules.lists import save_db, load_db

In [None]:
# Data source switch: True reads previously saved results through load_db,
# False opens the database client and recomputes them from its collections.
load_from_fs = True

if not load_from_fs:
    db = Database()
    assert db.client, "No database client available!"
    stackoverflow = db.client["stackoverflow"]
    # Handles to the two collections of the "stackoverflow" database.
    tags, posts = stackoverflow["tags"], stackoverflow["posts"]

In [None]:
# Period handed to the full tag aggregation further down
# (presumably the second half of 2019 — confirm against Half.make_half).
current_period = Half.make_half(2019, 2)
# Periods iterated when the per-period time series is built (presumably every
# half-year from 2008 to 2020; whether the end year is inclusive is not visible here).
halves = Half.make_halves(2008, 2020)

In [None]:
# Tag table for current_period: either aggregated from the posts collection
# and saved under topic "tsa", or read back from that saved copy.
if not load_from_fs:
    full_tags = aggregate(posts, current_period, full=True)
    save_db(full_tags, topic="tsa", name="full-tags")
else:
    full_tags = load_db(topic="tsa", name="full-tags")
full_tags

In [None]:
def aggregate_time_periods(periods, unit="half"):
    """Build one record of tag frequencies per period.

    Each period is aggregated against the module-level ``posts`` collection.
    The resulting record maps ``"_date"`` to ``period.end`` and every tag
    reported for that period to its frequency. A period whose aggregation is
    empty yields a record holding only ``"_date"``.

    Args:
        periods: Iterable of period objects exposing an ``end`` attribute.
        unit: Unit label shown by the progress bar.

    Returns:
        A list of dicts, one per period, in iteration order.
    """

    def build_record(period):
        frame = aggregate(posts, period)
        # The tag/frequency columns are only touched when the frame has rows.
        tag_counts = {} if frame.empty else dict(zip(frame.tag.values, frame.frequency.values))
        return {"_date": period.end, **tag_counts}

    return [build_record(period) for period in tqdm(periods, unit=unit, ascii=True)]

In [None]:
# Per-period tag frequencies as one table: a "_date" column followed by one
# column per tag listed in full_tags. Either rebuilt and saved, or loaded.
if not load_from_fs:
    time_periods = aggregate_time_periods(halves, unit="half")
    # Tags missing from a period's record come out as NaN and are filled with 0.
    time_series = pd.DataFrame(
        time_periods, columns=["_date", *full_tags.tag.values]
    ).fillna(0)
    save_db(time_series, topic="tsa", name="time-series")
else:
    time_series = load_db(topic="tsa", name="time-series")

In [None]:
# Parse the "_date" column and make it the index.
# NOTE: this consumes the "_date" column, so the cell cannot be re-run on the
# already-indexed frame without reloading time_series first.
time_series = time_series.assign(_date=pd.to_datetime(time_series["_date"])).set_index("_date")
# Rebuild the index from its raw values so the inferred frequency is attached to it.
time_series.index = pd.DatetimeIndex(time_series.index.values, freq=time_series.index.inferred_freq)

In [None]:
# Keep only the tags whose total over all periods is at least 280.
time_series = time_series.loc[:, lambda frame: frame.sum(axis=0) >= 280.0]
# The first 23 rows are used for fitting, the remaining rows are held out.
train = time_series.iloc[:23]
test = time_series.iloc[23:]
time_series

In [None]:
# Relative drop between the last two periods at or above which a tag is
# labelled "decaying".
class_threshold = 0.05


def classify(d):
    """Return "decaying" if the relative decrease d reaches class_threshold, else "undecaying"."""
    return "decaying" if d >= class_threshold else "undecaying"


classes = []

for tag, counts in time_series.items():
    # Select the last two periods by position with .iloc: the index holds
    # dates, and plain [-2] / [-1] on such a Series relies on the deprecated
    # integer-as-position fallback of Series.__getitem__.
    old_value = counts.iloc[-2]
    new_value = counts.iloc[-1]
    # Relative decrease from the second-to-last to the last period.
    # NOTE(review): a previous count of 0 makes this inf/nan for numpy floats
    # (with a RuntimeWarning); such values compare below the threshold, so the
    # tag ends up "undecaying" — confirm that is the intended label.
    decrease = (old_value - new_value) / old_value
    given_class = classify(decrease)
    classes.append({"tag": tag, "decrease": decrease, "class": given_class})

time_series_classes = pd.DataFrame(classes)
time_series_classes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
# https://www.statsmodels.org/stable/generated/statsmodels.tsa.arima.model.ARIMA.html

In [None]:
# Work on the first tag column only: the complete series and its train/test slices.
full_current = time_series.iloc[:, 0]
train_current = train.iloc[:, 0]
test_current = test.iloc[:, 0]

# Fit an ARIMA model on the training slice.
# order=(p, d, q): autoregressive, differencing and moving-average components.
model = ARIMA(train_current, order=(1, 0, 2))
model_fit = model.fit()

# Predict from the first observation up to index len(train_current) + 1,
# move every value one step earlier, drop the missing values this leaves,
# and name the result after the tag with a "-predicted" suffix.
prediction_current = (
    model_fit.predict(start=0, end=len(train_current) + 1)
    .shift(-1)
    .dropna()
    .rename(train_current.name + "-predicted")
)

In [None]:
# Observed series and prediction side by side, aligned on the date index.
compare_current = pd.concat([full_current, prediction_current], axis=1)

# Draw both columns as lines and store the figure on disk.
sns.set(style="whitegrid")
plt.figure(figsize=(16, 8))
ax = sns.lineplot(data=compare_current)
ax.figure.savefig("output/plots/arima-prediction.png")
compare_current

In [None]:
# Last two rows of each column: observed values (column 0) and predictions (column 1).
old_actual, new_actual = compare_current.iloc[:, 0].tail(2)
old_predicted, new_predicted = compare_current.iloc[:, 1].tail(2)

# Relative decrease from the second-to-last to the last row, per column.
decrease_actual = (old_actual - new_actual) / old_actual
decrease_predicted = (old_predicted - new_predicted) / old_predicted

# Report each decrease together with the class it falls into.
print("actual decrease:", decrease_actual, ", actual class:", classify(decrease_actual))
print("predicted decrease:", decrease_predicted, ", predicted class:", classify(decrease_predicted))