Hi there.

This notebook analyzes the timelines of repository push events to find the repositories that seem to be updated regularly by a cron job.

It requires:
- downloading the data from gharchive.org
- processing it with `export.py`
- exporting that processed data to elasticsearch
- further crunching it with `proc.py`
- and finally running this notebook

In [None]:
import json
import time
from typing import List, Tuple, Optional, Union
from pathlib import Path

import pandas as pd
import numpy as np
import plotly
import plotly.express as px
from elastipy import Search, query, connections
from tqdm import tqdm

# Route pandas' `.plot()` calls through plotly.
pd.options.plotting.backend = "plotly"
# Intended to select the dark plotly theme as the default template.
# NOTE(review): plotly documents the template registry as `plotly.io.templates`;
# confirm that `plotly.templates` resolves in the installed plotly version.
plotly.templates.default = "plotly_dark"

# Parameters for elastipy's "default" connection
# (timeout is presumably given in seconds - TODO confirm).
connections.set("default", {"timeout": 60})

In [None]:
# Load the cached spectra CSV and index it by the "repo" column.
spectra = pd.read_csv("../cache/automated-commits/spectra-s28.csv").set_index("repo")
spectra

In [None]:
# Keep only the repos whose spectrum has fewer than 20% NaN entries.
nan_count_per_repo = np.isnan(spectra).sum(axis=1)
max_nan_count = .2 * spectra.shape[1]
cleaned_spectra = spectra.loc[nan_count_per_repo < max_nan_count]
print(f"specta {spectra.shape[0]}, cleaned {cleaned_spectra.shape[0]}")

In [None]:
def calc_cleaned_timelines():
    """
    Load the cached timelines and reduce them to the repos worth analyzing.

    Reads ``timelines.csv`` from the cache directory, restricts it to the
    repos present in the notebook-level ``cleaned_spectra`` and then drops
    every repo that has 40 or fewer timeline entries greater than zero.

    :return: DataFrame indexed by ``repo``
    """
    all_timelines = pd.read_csv("../cache/automated-commits/timelines.csv").set_index("repo")
    selected = all_timelines.loc[cleaned_spectra.index]
    # number of timeline entries with any activity, per repo
    positive_entries = (selected > 0).sum(axis=1)
    return selected.loc[positive_entries > 40]

# Reuse the cleaned timelines from the cache file when it exists,
# otherwise compute them once and store them for the next run.
cache_file = Path("../cache/automated-commits/cleaned-timelines.csv")
if cache_file.exists():
    timelines = pd.read_csv(cache_file).set_index("repo")
else:
    timelines = calc_cleaned_timelines()
    timelines.to_csv(cache_file)

# From here on `spectra` holds exactly the repos that have a cleaned timeline.
spectra = cleaned_spectra.loc[timelines.index]
for label, frame in (("timelines", timelines), ("spectra", spectra)):
    print(label, frame.shape)

# detect by mean and std

In [None]:
# Per-repo mean and standard deviation across the timeline columns.
tl_mean, tl_std = timelines.mean(axis=1), timelines.std(axis=1)

# Flag repos that combine a mean above .12 with a standard deviation below .32.
has_high_mean = tl_mean > .12
has_low_std = tl_std < .32
is_automated_std = has_high_mean & has_low_std
is_automated_std.sum()

# detect by interval diff

In [None]:
#shift = (timelines - timelines.shift(7, axis=1)).abs()
#shift_mean, shift_std = shift.mean(axis=1), shift.std(axis=1)
#px.line(shift_mean.sort_values().iloc[0:200].T.values)#plot(height=800)

In [None]:
# Start with "not automated" for every repo. Building the constant Series
# directly avoids one Python-level lambda call per row.
is_automated_shift = pd.Series(False, index=timelines.index)

# The activity threshold does not depend on the lag, so evaluate it once.
has_activity = tl_mean > .01

for i in tqdm(range(1, 29)):
    # Absolute difference between each timeline and a copy of itself shifted
    # by `i` columns. The first `i` columns become NaN through the shift and
    # are skipped by mean().
    shift = (timelines - timelines.shift(i, axis=1)).abs()
    shift_mean = shift.mean(axis=1)
    # A small mean difference at any of the tested lags flags the repo.
    # (The per-lag std was computed before but never used, so it is dropped.)
    is_automated_shift |= has_activity & (shift_mean < 0.15)

is_automated_shift.sum()

# detect by spectrum std

In [None]:
# Per-repo mean and standard deviation of the spectrum.
spectra_mean = spectra.mean(axis=1)
spectra_std = spectra.std(axis=1)

# Sort the std * mean product once and inspect the slice [-500:-300]:
# first the values themselves, then the timelines of the same repos.
sorted_product = (spectra_std * spectra_mean).sort_values()
inspected = sorted_product[-500:-300]
inspected.plot().show()
timelines.loc[inspected.index].T.plot()

In [None]:

# Flag repos whose product of spectrum std and spectrum mean reaches .09.
spectrum_product = spectra_std * spectra_mean
is_automated_spectrum = spectrum_product >= .09
is_automated_spectrum.sum()

# combine all detections

In [None]:
# A repo counts as automated when at least one of the three detectors
# (interval shift, mean/std, spectrum) flags it.
is_automated = is_automated_shift | is_automated_std | is_automated_spectrum
# Total number of flagged repos.
print(is_automated.sum())

In [None]:
# Heatmap of all flagged timelines, each row scaled by its own maximum.
df = timelines.loc[is_automated]
row_max = df.max(axis=1)
df = df.div(row_max, axis=0)
# 15 pixels per flagged repo plus a fixed 100 pixels.
px.imshow(df, height=100 + is_automated.sum() * 15)
#for i in range(0, df.shape[0], 150):
#    timelines.loc[is_automated].iloc[i:i+150].T.plot().show()

# prepare data export

## from elasticsearch

In [None]:
# Timelines of the repos flagged by any detector; these are what gets exported.
export_timelines = timelines.loc[is_automated]

# Per-repo push statistics from the "gharchive-push-2018" index.
# NOTE(review): the meaning of the elastipy calls (terms filter, terms
# aggregation, attached metrics) is assumed from their names - confirm
# against the elastipy documentation.
repo_info = (Search("gharchive-push-2018")
 # restrict the search to the exported repos
 .terms("repo", export_timelines.index.to_list())
 # request as many "repo" buckets as there are exported repos
 .agg_terms("repo", field="repo", size=export_timelines.shape[0])
 .metric_sum("all_push_events", field="events")
 .metric_sum("commits", field="commits")
 .metric_sum("distinct_commits", field="distinct_commits")
 .metric_cardinality("push_users", field="user")
 .metric_cardinality("refs", field="ref.keyword")
 
 .execute().df()
 .set_index("repo")
 # the bucket document count is exported as "push_events"
 .rename({"repo.doc_count": "push_events"}, axis=1)
)

# Same kind of per-repo aggregation on the "gharchive-watch-2018" index.
df = (Search("gharchive-watch-2018")
 .terms("repo", export_timelines.index.to_list())
 .agg_terms("repo", field="repo", size=export_timelines.shape[0])
 .metric_cardinality("users", field="user")
 .execute().df()
 .set_index("repo")
)

# The per-repo document count of the watch index is stored as "stars".
# The assignment aligns on the repo index, so repos missing from `df` get NaN.
# NOTE(review): the "users" cardinality metric is requested but not used here.
repo_info["stars"] = df["repo.doc_count"]
repo_info

## from github api

In [None]:
import sys
sys.path.append("..")
from src.credentials import GITHUB_TOKEN
from github import Github, UnknownObjectException, GithubException
# GitHub API client, created with the token from the project credentials.
github = Github(GITHUB_TOKEN)

# Previously fetched API responses, keyed by repo name.
# Starts out empty when no cache file has been written yet.
CACHE_FILE = Path("../cache/repo-api-cache.json")
repo_api_cache = json.loads(CACHE_FILE.read_text()) if CACHE_FILE.exists() else dict()

In [None]:
# Fetch the GitHub API representation of every repo that is not cached yet.
# The cache file is written in `finally`, so responses fetched before an
# interruption (KeyboardInterrupt, network failure, ...) are not lost and
# do not have to be requested again on the next run.
try:
    for i, name in enumerate(tqdm(repo_info.index)):
        if i % 500 == 0:
            # PyGithub reports (remaining requests, request limit)
            remaining, limit = github.rate_limiting
            print(f"rate-limit: {remaining}/{limit}")
        if name in repo_api_cache:
            continue
        try:
            data = github.get_repo(name).raw_data
        except UnknownObjectException:
            # repo can not be found anymore
            data = {"deleted": True}
        except GithubException as e:
            # NOTE(review): every other API error is cached by its HTTP status,
            # which presumably includes rate-limit errors - confirm that caching
            # those permanently is intended.
            data = {"exception": e.status}
        repo_api_cache[name] = data
finally:
    CACHE_FILE.write_text(json.dumps(repo_api_cache, indent=2))

In [None]:
def map_status(name: str) -> str:
    """
    Translate the cached API entry of a repo into a short status label.

    :param name: repo name, must be a key of ``repo_api_cache``
    :return: ``"deleted"`` when the entry is marked as deleted,
        ``"code: <status>"`` when an exception status was stored,
        ``"active"`` otherwise
    :raises KeyError: if ``name`` is not in ``repo_api_cache``
    """
    cached = repo_api_cache[name]
    if cached.get("deleted"):
        return "deleted"
    error_code = cached.get("exception")
    if error_code:
        return f'code: {error_code}'
    return "active"
    
# Columns taken from the cached API response as (column name, API field).
# Missing fields (deleted repos, API errors) yield None.
for column, api_field in (
    ("size", "size"),
    ("stars_today", "stargazers_count"),
    ("watchers_today", "watchers_count"),
):
    repo_info[column] = repo_info.index.map(lambda n: repo_api_cache[n].get(api_field))

# "status" is derived by map_status and sits between the two groups
# so the resulting column order stays the same.
repo_info["status"] = repo_info.index.map(map_status)

# Fields that keep their API name as column name.
for api_field in ("name", "fork", "created_at", "description", "homepage", "language"):
    repo_info[api_field] = repo_info.index.map(lambda n: repo_api_cache[n].get(api_field))

# Object columns stay untouched; every other column gets NaN replaced
# by 0 and is cast to int.
repo_info = repo_info.apply(
    lambda c: c if c.dtype == "object" else c.replace(np.nan, 0).astype(int)
)
repo_info

# store data

In [None]:
#timelines.loc[is_automated].to_csv("../docs/data/automated-tl-1d.csv")
# Bundle the repo table and the flagged timelines into one JSON document.
ri = repo_info.reset_index()
# Timelines are exported as integers, with NaN replaced by 0.
int_timelines = export_timelines.replace(np.nan, 0).astype(int)
data = {
    "columns": ri.columns.to_list(),
    # one dict per repo, in table order
    "rows": [row.to_dict() for _, row in ri.iterrows()],
    # repo name -> list of timeline values
    "timelines": {repo: tl.to_list() for repo, tl in int_timelines.iterrows()},
}
Path("../docs/data/automated-2018.json").write_text(json.dumps(data))