In [None]:
import pymongo
import pandas as pd
from tqdm import tqdm

from notebook_modules.database import Database
from notebook_modules.half import Half
from notebook_modules.lists import make_list
from notebook_modules.plots import make_distribution_plot

In [None]:
# Create the project database wrapper and stop right away if it exposes no
# client, since every later cell queries through it.
db = Database()
assert db.client, "No database client available!"

# Handles to the "stackoverflow" database and the two collections looked up
# from it.
stackoverflow = db.client["stackoverflow"]
tags, posts = stackoverflow["tags"], stackoverflow["posts"]

In [None]:
# Year bounds handed to Half.make_halves.
# NOTE(review): whether the second bound is inclusive is decided inside
# Half.make_halves — confirm there before changing these values.
year_bounds = (2008, 2021)
halves = Half.make_halves(*year_bounds)

In [None]:
def distribution(df, resolution=100):
    """Compute the cumulative share of the total frequency held by leading rows.

    For every step ``p`` in ``0..resolution`` the first
    ``len(df) * p // resolution`` rows of ``df`` are selected and the sum of
    their ``frequency`` values is divided by the sum over the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``frequency`` column. Rows are consumed in their
        existing order, so pass the frame sorted by descending frequency to
        obtain a "top x % of rows hold y % of the total" curve.
    resolution : int, optional
        Number of equal steps between 0 and 1 (default 100). The result has
        ``resolution + 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``percentage`` (fraction of rows considered, 0..1) and
        ``share`` (fraction of the total frequency those rows hold). The
        frame is empty when ``df`` has no rows; ``share`` is NaN when the
        total frequency is zero.

    Raises
    ------
    ValueError
        If ``resolution`` is smaller than 1.
    """
    columns = ["percentage", "share"]
    if resolution < 1:
        raise ValueError("resolution must be a positive integer")

    frequencies = df["frequency"]
    length = len(frequencies)
    if length == 0:
        # Nothing to distribute: return an empty frame with the usual columns
        # instead of dividing zero by zero.
        return pd.DataFrame(columns=columns)

    total = frequencies.sum()
    rows = []
    for p in range(resolution + 1):
        # Integer arithmetic on purpose: int(length * (p / resolution)) can
        # land one row short when the float product falls just below an
        # integer (e.g. 100 * 0.29 == 28.999999999999996).
        index = length * p // resolution
        covered = frequencies.iloc[:index].sum()
        # A zero total makes every share undefined; report NaN explicitly.
        share = covered / total if total else float("nan")
        rows.append((p / resolution, share))

    # Build the frame once rather than growing it row by row.
    return pd.DataFrame(rows, columns=columns)

In [None]:
for half in tqdm(halves, unit="half", ascii=True):
    # Per half: select posts with PostTypeId 1 created in [half.start, half.end),
    # emit one document per entry of their Tags array, count documents per tag
    # and order the counts from highest to lowest.
    # NOTE(review): PostTypeId 1 presumably marks questions — confirm against
    # the import schema.
    in_half = {"$gte": half.start, "$lt": half.end}
    pipeline = [
        {"$match": {"PostTypeId": 1, "CreationDate": in_half}},
        {"$unwind": "$Tags"},
        {"$group": {"_id": "$Tags", "frequency": {"$sum": 1}}},
        {"$sort": {"frequency": -1}},
    ]
    cursor = posts.aggregate(pipeline)
    result = list(cursor)

    if result:
        # Each result document carries "_id" (the tag) and "frequency";
        # relabel the two columns before computing the curve.
        df = pd.DataFrame(result)
        df.columns = ["tag", "frequency"]
        dist = distribution(df)
    else:
        # No matching posts in this half: hand the helpers an empty frame
        # with the expected columns.
        dist = pd.DataFrame(columns=["percentage", "share"])

    # Pass the per-half distribution to the project's list and plot helpers.
    make_list("distribution", str(half), dist)
    make_distribution_plot("distribution", str(half), dist)