In [None]:
import pymongo
import pandas as pd
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.classification import aggregate_features, split_top, split_class, classify
from notebook_modules.lists import make_list
from notebook_modules.plots import make_distribution_plot

In [None]:
# Open the project's database wrapper and stop immediately if it exposes no
# usable client (presumably a pymongo client — confirm in notebook_modules).
db = Database()
assert db.client, "No database client available!"

# Look up the "stackoverflow" database once, then take both handles from it.
stackoverflow = db.client["stackoverflow"]
tags, posts = stackoverflow["tags"], stackoverflow["posts"]

In [None]:
# Development window: aggregate features over `posts` for two consecutive
# halves, 2018 half 2 (current) and 2019 half 1 (next).
half_2018_2 = Half.make_half(2018, 2)
tags_h1 = aggregate_features(posts, half_2018_2)

half_2019_1 = Half.make_half(2019, 1)
tags_h2 = aggregate_features(posts, half_2019_1)

In [None]:
# Evaluation window: the same aggregation shifted forward by one half,
# 2019 half 1 (current) and 2019 half 2 (next).
# NOTE(review): the 2019 half 1 aggregation repeats the one that produces
# `tags_h2`; it could be reused if aggregate_features is deterministic and
# nothing downstream mutates the result — confirm before changing.
half_2019_1 = Half.make_half(2019, 1)
tags_h1_eval = aggregate_features(posts, half_2019_1)

half_2019_2 = Half.make_half(2019, 2)
tags_h2_eval = aggregate_features(posts, half_2019_2)

In [None]:
# Apply split_top with the same 0.2 threshold to both development halves,
# then let classify() combine the current and next results into the dataset.
top_h1, top_h2 = (
    split_top(half_features, threshold=0.2)
    for half_features in (tags_h1, tags_h2)
)
dataset = classify(top_next=top_h2, top_current=top_h1)
dataset

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Split the development dataset into `x` (indexed by its "frequency" column
# in rule_model) and `y` (passed to the sklearn metrics as the true labels).
x, y = split_class(dataset)

def rule_model(df, threshold=25):
    """Predict a binary class from a single frequency cut-off.

    Args:
        df: DataFrame with a numeric "frequency" column.
        threshold: Inclusive upper bound; rows whose frequency is less than
            or equal to it are assigned class 1, all others class 0.

    Returns:
        A NumPy int64 array of 0/1 predictions, one per row of ``df``, in
        row order.

    Raises:
        KeyError: If ``df`` has no "frequency" column.
    """
    # Vectorised comparison instead of a per-element Python lambda. NaN
    # compares as False, so missing frequencies still map to class 0.
    is_at_or_below = df["frequency"] <= threshold
    return is_at_or_below.astype("int64").values

# Predict on the development split with the rule's cut-off of 25.
y_pred = rule_model(x, threshold=25)

# Print each metric as "<label> <value>", in the same order as before.
for metric_label, metric_fn in (
    ("accuracy", accuracy_score),
    ("f1-score", f1_score),
    ("precision", precision_score),
    ("recall", recall_score),
):
    print(metric_label, metric_fn(y, y_pred))

In [None]:
# Build the evaluation dataset the same way as the development one:
# split_top with threshold 0.2 on each half, then classify current vs. next.
top_h1_eval, top_h2_eval = (
    split_top(half_features, threshold=0.2)
    for half_features in (tags_h1_eval, tags_h2_eval)
)
dataset_eval = classify(top_next=top_h2_eval, top_current=top_h1_eval)
dataset_eval

In [None]:
# Split the evaluation dataset into inputs and true labels.
x_eval, y_eval = split_class(dataset_eval)

# Evaluate the same rule that was scored on the development halves.
# This call previously used threshold=30, while the development run (and
# rule_model's default) use 25, so the two sets of metrics described
# different rules and could not be compared.
# NOTE(review): if 30 was intentional, change the development run to match
# instead — the two calls must share one value.
y_eval_pred = rule_model(x_eval, threshold=25)

print("accuracy", accuracy_score(y_eval, y_eval_pred))
print("f1-score", f1_score(y_eval, y_eval_pred))
print("precision", precision_score(y_eval, y_eval_pred))
print("recall", recall_score(y_eval, y_eval_pred))