In [59]:
import os
import sys
# Put the parent directory first on the module search path
# (presumably so the project-local `lib` package imported below resolves -- confirm).
sys.path.insert(0, "..")

from tqdm import tqdm
import pandas as pd
# Route DataFrame/Series `.plot()` calls through plotly.
pd.options.plotting.backend = "plotly"
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
# Use the dark plotly template as the default for every figure.
plotly.io.templates.default = "plotly_dark"

from lib.cache import FileCache
from lib.giterator import Giterator

In [2]:
def get_all_my_commits(
        base_path="/home/bergi/prog/",
        exclude_path_parts=("prog/go/", "/env/"),
        author_email_prefixes=("s.berke", "stefan.berke"),
        give_up_after=1000,
):
    """
    Collect the commits of one author from every git repository below a directory.

    The directory tree is walked once to find repositories (directories that
    contain a ``.git`` sub-directory). Each repository is then iterated with
    ``Giterator`` and only commits whose ``author_email`` starts with one of
    the given prefixes are kept.

    :param base_path: root directory that is scanned for repositories
    :param exclude_path_parts: substrings; a directory whose path contains one
        of them is ignored together with everything below it
    :param author_email_prefixes: prefixes of the author e-mail addresses
        that identify the commits to keep
    :param give_up_after: a repository is abandoned once the commit counter
        exceeds this value and no matching commit has been seen so far
    :return: list of ``{"path": <repo path>, "commits": [<commit dict>, ...]}``,
        containing only repositories with at least one matching commit
    """
    exclude_path_parts = tuple(exclude_path_parts)
    # str.startswith only accepts a str or a tuple of str
    author_email_prefixes = tuple(author_email_prefixes)

    all_repo_paths = []
    for root, dirs, files in tqdm(os.walk(base_path)):
        if any(part in root for part in exclude_path_parts):
            # Every path below this one contains the same substring and would
            # be skipped as well, so stop os.walk from descending at all.
            dirs[:] = []
            continue
        if ".git" in dirs:
            all_repo_paths.append(root)
            # Git's own bookkeeping directory is not a place to look for
            # further repositories; do not crawl its object store.
            dirs.remove(".git")

    repos = []
    for repo_path in tqdm(all_repo_paths):
        commits = []
        giter = Giterator(repo_path)
        for i, c in enumerate(giter.iter_commits(json_compatible=True)):
            if c["author_email"].startswith(author_email_prefixes):
                commits.append(c)
            # Repositories without any matching commit among the first
            # commits are treated as foreign and not read to the end.
            if i > give_up_after and not commits:
                print("skipping", repo_path)
                break
        if commits:
            repos.append({
                "path": repo_path,
                "commits": commits,
            })
    return repos

#repos = get_all_my_commits()
#print(len(repos))

In [8]:
# Fetch the per-repository commit lists through the JSON file cache;
# `get_all_my_commits` is handed over as the producer of the data.
# (The original cell carried a disabled `read_cache=False` argument,
# presumably to force a fresh scan -- confirm against FileCache.)
repos = FileCache.execute_json("all-bergi-commits.json", get_all_my_commits)

# Order the repositories by their path, in place.
repos.sort(key=lambda repo: repo["path"])

In [154]:
# hide
# Category name -> substrings that mark a repository path as belonging to it.
# Categories are tried in the order they are listed here.
CATEGORIES = {
    "ideals": [
        "/points/", "/botgard/", "/kultuerchen", "das_kapital", "nkbuch",
    ],
    "money": [
        "/k3/", "/schlup/", "/bock/", "/nk/",
    ],
    "lib": [
        "django", "elastipy", "event-sourcing", "fritztrack", "pyvolution",
        "pector",
    ],
    "blog": [
        "blog", "agenda90210", "/wahl17", "afd-chat", "bm-wahl", "/fefe/",
    ],
}


def get_project_category(path):
    """
    Return the category of a repository path.

    The result is the first category in ``CATEGORIES`` that has at least one
    of its marker substrings contained in ``path``; ``"fun"`` is returned
    when no category matches.
    """
    matching_categories = (
        category
        for category, markers in CATEGORIES.items()
        if any(marker in path for marker in markers)
    )
    return next(matching_categories, "fun")

#for r in repos:
#    print("%5d %10s %s" % (len(r["commits"]), get_project_category(r["path"]), r["path"]))

In [99]:
# Flatten the per-repository commit lists into one list of commit dicts.
# A commit hash is only taken the first time it is encountered, so a commit
# that shows up in several repositories is counted once.
commit_hash_set = set()
all_commits = []
for repo in repos:
    repo_path = repo["path"]
    project = repo_path.split("/")[-1]  # last path component
    for commit in repo["commits"]:
        commit_hash = commit["hash"]
        if commit_hash in commit_hash_set:
            continue
        commit_hash_set.add(commit_hash)

        # The "parking-data" project contains commits with these fixed
        # messages that are left out of the statistics.
        is_excluded = project == "parking-data" and (
            commit["message"] == "add yesterday's numbers"
            or commit["message"].startswith("add 202")
        )
        if is_excluded:
            continue

        # Keys of the commit itself take precedence over the two added ones.
        entry = {"project": project, "path": repo_path}
        entry.update(commit)
        all_commits.append(entry)

print(len(all_commits))

6703


In [166]:
# One row per commit:
#   date         author date of the commit
#   category     project category derived from the repository path
#   message      commit message
#   message_len  length of the commit message
#   files        number of entries in the commit's "changes" (0 if absent)
#   add / del    summed "additions" / "deletions" over those entries
commits_df = pd.DataFrame({
    "date": [commit["author_date"] for commit in all_commits],
    "category": [get_project_category(commit["path"]) for commit in all_commits],
    "message": [commit["message"] for commit in all_commits],
    "message_len": [len(commit["message"]) for commit in all_commits],
    "files": [len(commit.get("changes", [])) for commit in all_commits],
    "add": [
        sum(change["additions"] for change in commit.get("changes", []))
        for commit in all_commits
    ],
    "del": [
        sum(change["deletions"] for change in commit.get("changes", []))
        for commit in all_commits
    ],
})
# Parse the dates as timezone-aware UTC timestamps.
commits_df["date"] = pd.to_datetime(commits_df["date"], utc=True)
# Chronological order, indexed by (date, category).
commits_df = commits_df.sort_values("date").set_index(["date", "category"])
#commits_df
#all_commits[0]

In [165]:
#df = commits_df
#df= df.iloc[df.index.get_level_values(1) == "money"]
#df.iloc[df.index.get_level_values(0) < "2020-11-20"].tail(50)

In [175]:
def get_series(commits_df, interval="M"):
    """
    Build one time series per category from the commit table.

    The value of a commit is its ``message_len`` limited to the range
    0..100. Rows that repeat an already seen (date, category) index entry
    are dropped (the first one is kept), the categories are spread into
    columns and the values are summed per ``interval``.

    :param commits_df: DataFrame indexed by (date, category) with a
        ``message_len`` column
    :param interval: resampling rule handed to ``resample``
        (NOTE(review): recent pandas releases spell month-end "ME" instead
        of "M" -- confirm against the installed version)
    :return: DataFrame indexed by the resampled date with one column per
        category
    """
    capped = np.clip(commits_df["message_len"], 0, 100)
    # unstack() cannot handle repeated index entries, keep the first of each
    first_per_key = capped[~capped.index.duplicated()]
    per_category = first_per_key.unstack("category")
    return per_category.resample(rule=interval, level=0).sum()

# Plot the per-category series resampled with the "3M" rule; `.plot()` goes
# through the plotting backend configured via `pd.options.plotting.backend`.
get_series(commits_df, interval="3M").plot()