In [None]:
import pymongo
import pandas as pd
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.lists import make_list
from notebook_modules.plots import make_distribution_plot

In [None]:
# Open the project database wrapper and stop early when it exposes no client.
db = Database()
assert db.client, "No database client available!"

# Handle for the "stackoverflow" database and its "tags" and "posts"
# collections (presumably pymongo objects -- confirm against Database).
stackoverflow = db.client["stackoverflow"]
tags, posts = stackoverflow["tags"], stackoverflow["posts"]

In [None]:
# Timespans that the per-half loop below iterates over; each one is passed to
# aggregate(), which reads its .start and .end attributes.
# NOTE(review): presumably half-year periods from 2008 to 2021 -- confirm
# whether Half.make_halves treats the end year as inclusive.
halves = Half.make_halves(2008, 2021)

In [None]:
def aggregate(timespan):
    """Count tag occurrences on posts created within ``timespan``.

    Runs an aggregation pipeline on the module-level ``posts`` collection that
    keeps posts with ``PostTypeId == 1`` (presumably questions -- confirm
    against the data schema) whose ``CreationDate`` lies in the half-open
    interval ``[timespan.start, timespan.end)``, unwinds their ``Tags`` array
    and counts how often each tag appears.

    Args:
        timespan: Object exposing ``start`` and ``end`` attributes that are
            comparable with the stored ``CreationDate`` values.

    Returns:
        A ``pandas.DataFrame`` with the columns ``tag`` and ``frequency``,
        ordered by descending frequency. When no post matches, the frame is
        empty but still carries both columns, so attribute access and merges
        on ``tag`` keep working downstream.
    """
    pipeline = [
        {"$match": {
            "PostTypeId": 1,
            "CreationDate": {
                "$gte": timespan.start,
                "$lt": timespan.end
            }
        }},
        {"$unwind": "$Tags"},
        {"$group": {
            "_id": "$Tags",
            "frequency": {"$sum": 1}
        }},
        # Sorting on frequency alone leaves tags with equal counts in an
        # arbitrary order that may differ between runs; the unique _id (the
        # tag itself) as tiebreaker makes positional cuts reproducible.
        {"$sort": {
            "frequency": -1,
            "_id": 1
        }}
    ]
    records = list(posts.aggregate(pipeline))
    if not records:
        # Keep the schema even without data instead of a column-less frame.
        return pd.DataFrame(columns=["tag", "frequency"])
    return pd.DataFrame(records).rename(columns={"_id": "tag"})

In [None]:
def distribute(aggr, resolution=100):
    """Compute the cumulative share of tag usage covered by the top tags.

    For every step ``p`` in ``0..resolution`` the first
    ``floor(len(aggr) * p / resolution)`` rows of ``aggr`` are taken and the
    sum of their ``frequency`` values is divided by the total frequency.

    Args:
        aggr: DataFrame with a ``frequency`` column. Rows are expected to be
            ordered most-frequent first (as ``aggregate`` returns them) and
            the frame is expected to be non-empty with a non-zero total.
        resolution: Number of equally sized steps between 0 % and 100 %.
            Defaults to 100, i.e. one row per percentage point.

    Returns:
        A DataFrame with ``resolution + 1`` rows and the columns
        ``percentage`` (fraction of unique tags, 0.0 to 1.0), ``share``
        (fraction of total frequency covered by those tags) and ``count``
        (number of tags included).

    Raises:
        ValueError: If ``resolution`` is not a positive integer.
    """
    if resolution < 1:
        raise ValueError("resolution must be a positive integer")

    unique_tags = len(aggr.index)
    total_tags = aggr.frequency.sum()
    dist = []
    for p in range(resolution + 1):
        percentage = p / resolution
        # Integer arithmetic on purpose: int(unique_tags * percentage) is
        # subject to float rounding (e.g. int(100 * 0.29) == 28) and would
        # drop a row at some steps.
        index = unique_tags * p // resolution
        part = aggr.head(index)
        share = part.frequency.sum() / total_tags
        dist.append([percentage, share, len(part.index)])
    return pd.DataFrame(dist, columns=["percentage", "share", "count"])

In [None]:
# For every timespan: aggregate the tag frequencies, derive the distribution
# and hand it to the list and plot helpers under the "distribution" name.
for half in tqdm(halves, unit="half", ascii=True):
    aggr = aggregate(half)
    # A timespan without data still yields an (empty) frame with the same
    # columns, so both helpers are called for every half.
    dist = (
        pd.DataFrame(columns=["percentage", "share", "count"])
        if aggr.empty
        else distribute(aggr)
    )
    make_list("distribution", str(half), dist)
    make_distribution_plot("distribution", str(half), dist)

In [None]:
# Tag frequency tables for two consecutive halves, used by the comparison
# cells below.
# NOTE(review): the variable names say 2020 but both calls pass 2019 to
# Half.make_half -- confirm which year is intended and align names and
# arguments accordingly.
tags_2020_h1 = aggregate(Half.make_half(2019, 1))
tags_2020_h2 = aggregate(Half.make_half(2019, 2))

In [None]:
# First 10 % of the rows of the first half's table and last 90 % of the rows
# of the second half's table. Row counts are truncated towards zero; the
# tables come from aggregate(), which orders them by descending frequency.
top_2020_h1 = tags_2020_h1.head(int(len(tags_2020_h1) * 0.1))
bottom_2020_h2 = tags_2020_h2.tail(int(len(tags_2020_h2) * 0.9))

In [None]:
# Left join keeps every tag of the first-half top slice; the "_merge"
# indicator column records whether the tag also occurs in the second-half
# bottom slice.
undecaying_left_join = pd.merge(
    top_2020_h1,
    bottom_2020_h2,
    how="left",
    on="tag",
    indicator=True,
)

# "Undecaying": top-slice tags that have no match in the bottom slice.
# NOTE(review): this also includes tags that are absent from the second
# half's table altogether -- confirm that this is intended.
undecaying = undecaying_left_join.loc[
    lambda joined: joined["_merge"] == "left_only"
].reset_index(drop=True)

# "Decaying": top-slice tags that do appear in the bottom slice.
decaying = pd.merge(
    top_2020_h1,
    bottom_2020_h2,
    how="inner",
    on="tag",
)