In [1]:
"""
Part 2: Analysis & Plotting
---------------------------
Reads CSVs from 'data/' for one or more repositories,
performs baseline analysis, and creates visual charts.
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Repositories to analyze. Each name is interpolated into the path
# "data/<name>_timeseries.csv" by the loops in this notebook.
repos = [
    "optimism",
    "supersim",
    "superchainerc20-starter",
    "superchain-registry",
    "superchain-ops",
    "docs",
    "specs",
    "design-docs",
    "infra",
]

# Metrics offered for rolling-window line plots: CSV column name -> chart title.
# Built once here because it does not depend on the repository being processed.
metrics_to_plot = {
    "avg_time_to_approval_days": "Average Time to Approval",
    "avg_time_to_first_non_bot_comment_days": "Average Time to First Non-Bot Comment",
    "avg_time_to_merge_days": "Average Time to Merge",
    "approval_ratio": "Approval Ratio",
    "number_of_prs": "Number of PRs",
    "avg_comments_per_pr": "Average Comments per PR"
}

# ---------------------------
# Example: read single timeseries CSV, plot
# ---------------------------
for repo in repos:
    ts_path = f"data/{repo}_timeseries.csv"
    print(f"Reading {ts_path} ...")
    # A missing file for one repository must not abort the whole cell;
    # skip it, the same way the other loops in this notebook do.
    try:
        timeseries_df = pd.read_csv(ts_path)
    except FileNotFoundError:
        print(f"No CSV file found for {repo}, skipping.")
        continue

    if timeseries_df.empty:
        print(f"No data found for {repo}, skipping plots.")
        continue

    # Parse the dates *before* sorting so rows are ordered chronologically
    # rather than by the lexical order of the raw date strings.
    timeseries_df["date"] = pd.to_datetime(timeseries_df["date"])
    timeseries_df = timeseries_df.set_index("date").sort_index().fillna(0)
    print(f"{repo} timeseries_df shape => {timeseries_df.shape}")

    # Some rolling window line plots (currently disabled)
    # for metric, title in metrics_to_plot.items():
    #     # We'll do a simple rolling-30-day average
    #     plt.figure(figsize=(12, 6))
    #     timeseries_df[metric].rolling(window=30, min_periods=1).mean().plot(kind="line")
    #     plt.title(f"{repo} - {title} (30-Day Rolling Avg)")
    #     plt.xlabel("Date")
    #     plt.ylabel(title)
    #     plt.grid(True)
    #     plt.tight_layout()
    #     plt.show()


# Preview pass: for every repository, load its timeseries CSV and print a
# head sample, summary statistics, and the column dtypes.
for repo in repos:
    ts_path = f"data/{repo}_timeseries.csv"
    print(f"Reading {ts_path} ...")
    try:
        timeseries_df = pd.read_csv(ts_path)
    except FileNotFoundError:
        print(f"No CSV file found for {repo}, skipping.")
    else:
        if timeseries_df.empty:
            print(f"No data found in {ts_path}, skipping.")
        else:
            # (heading, object to print) pairs, emitted in this order
            report_sections = (
                (f"\nSample data from {ts_path}:", timeseries_df.head()),
                (f"\nSummary statistics for {repo}:", timeseries_df.describe(include='all')),
                (f"\nColumns and data types for {repo}:", timeseries_df.dtypes),
            )
            for heading, payload in report_sections:
                print(heading)
                print(payload)
            # Visual separator between repositories
            print("\n" + "-"*50 + "\n")

Reading data/optimism_timeseries.csv ...
optimism timeseries_df shape => (346, 12)
Reading data/supersim_timeseries.csv ...
supersim timeseries_df shape => (102, 12)
Reading data/superchainerc20-starter_timeseries.csv ...
superchainerc20-starter timeseries_df shape => (19, 12)
Reading data/superchain-registry_timeseries.csv ...
superchain-registry timeseries_df shape => (216, 12)
Reading data/superchain-ops_timeseries.csv ...
superchain-ops timeseries_df shape => (157, 12)
Reading data/docs_timeseries.csv ...
docs timeseries_df shape => (249, 12)
Reading data/specs_timeseries.csv ...
specs timeseries_df shape => (190, 12)
Reading data/design-docs_timeseries.csv ...
design-docs timeseries_df shape => (83, 12)
Reading data/infra_timeseries.csv ...
infra timeseries_df shape => (59, 12)
Reading data/optimism_timeseries.csv ...

Sample data from data/optimism_timeseries.csv:
       repo        date  number_of_prs  avg_time_to_merge_days  \
0  optimism  2024-01-01              7             

In [None]:
# ---------------------------
# Combine multiple repos into a single dataset for baseline analysis
# ---------------------------
# Each repository's CSV is loaded, indexed by parsed date, sorted, tagged with
# its repo name, and collected; the pieces are then concatenated into one frame.
all_datasets = []
for repo in repos:
    ts_path = f"data/{repo}_timeseries.csv"
    try:
        timeseries_df = pd.read_csv(ts_path)
    except FileNotFoundError:
        print(f"No timeseries data for {repo}, skipping.")
        continue

    if timeseries_df.empty:
        continue

    timeseries_df["date"] = pd.to_datetime(timeseries_df["date"])
    timeseries_df = (
        timeseries_df
        .set_index("date")
        .sort_index()
        .assign(repo=repo)  # overwrite/add the repo tag used for grouping later
    )
    all_datasets.append(timeseries_df)

if not all_datasets:
    # Raise instead of calling exit(): in a notebook exit() targets the kernel
    # rather than cleanly stopping the cell, and pd.concat([]) below would then
    # fail on the empty list. SystemExit stops the cell with a clear message.
    raise SystemExit("No data from any repo, cannot do combined analysis.")

# NOTE(review): fillna(0) also zero-fills the averaged metrics (avg_time_*,
# approval_ratio, avg_comments_per_pr). If a missing value means "no data that
# day", this pulls the weekly means computed later toward 0 -- confirm intended.
combined_df = (
    pd.concat(all_datasets)
    .fillna(0)
    .reset_index()
    .rename(columns={"index": "date"})  # only matters if the index lost its name
)

# Weekly baseline: bucket the combined rows per repository and calendar week.
# PR counts are summed; every other metric is averaged over the week's rows.
combined_df["date"] = pd.to_datetime(combined_df["date"])

mean_metrics = [
    "avg_time_to_approval_days",
    "avg_time_to_first_non_bot_comment_days",
    "avg_time_to_merge_days",
    "approval_ratio",
    "avg_comments_per_pr",
]
# Named-aggregation spec: output column -> (source column, reducer).
# Insertion order fixes the output column order.
weekly_agg_spec = {"number_of_prs": ("number_of_prs", "sum")}
for column in mean_metrics:
    weekly_agg_spec[column] = (column, "mean")

weekly_groups = combined_df.groupby(["repo", pd.Grouper(key="date", freq="W")])
weekly_dataset = weekly_groups.agg(**weekly_agg_spec).reset_index()

# Persist the weekly table and report its dimensions
weekly_dataset.to_csv("data/github_timeseries_dataset.csv", index=False)
print("Wrote 'data/github_timeseries_dataset.csv' with shape:", weekly_dataset.shape)

# Baseline statistics restricted to weeks on/after a cutoff date.
# Change `cutoff_start` to move the window; the printed header follows it.
cutoff_start = "2024-07-01"
df_filtered = weekly_dataset[weekly_dataset["date"] >= cutoff_start].copy()

# Per-repository summary (mean/median/std/min/max) of every weekly metric
baseline_metrics = [
    "number_of_prs",
    "avg_time_to_approval_days",
    "avg_time_to_first_non_bot_comment_days",
    "avg_time_to_merge_days",
    "approval_ratio",
    "avg_comments_per_pr",
]
# Header is derived from cutoff_start so it cannot drift from the filter above.
print(f"\n== Baseline Stats since {cutoff_start} ==\n")
stats = df_filtered.groupby("repo")[baseline_metrics].agg(
    ["mean", "median", "std", "min", "max"]
)
print(stats)

# Plot an example bar chart of number_of_prs with error bars
# (bar height = per-repo mean of the weekly values, whisker = +/- one std).
# NOTE(review): numpy is not used in the cells visible here; the import is kept
# in case later cells rely on `np` -- consider moving it to the imports cell.
import numpy as np

metric = "number_of_prs"

print(f"\nBar Chart for {metric} ...")
means = stats[metric]["mean"].sort_values()
# std is NaN for a repo with a single weekly row; treat that as "no whisker"
# so the label position computed below stays finite.
stds = stats[metric]["std"].reindex(means.index).fillna(0)
repos_order = means.index.tolist()

plt.figure(figsize=(12, 6))
bars = plt.bar(repos_order, means.values, yerr=stds.values, color="skyblue",
               capsize=5, edgecolor='black')

# Value labels go above the top of the error bar (height + std), not just
# above the bar, so the text is not drawn over the whisker.
for bar, err in zip(bars, stds.values):
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + err + 0.05*height,
             f'{height:.2f}', ha='center', va='bottom', fontsize=9)

plt.title(f"{metric.replace('_',' ').title()} Across Repositories")
plt.xlabel("Repository")
plt.ylabel(metric.replace('_',' ').title())
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.ylim(bottom=0)
plt.tight_layout()
plt.show()

# Example: Rolling average plot
# For each repository, smooth the weekly PR counts with a trailing window of
# `window_size` rows (min_periods=1, so the first rows use a shorter window).
window_size = 4
# transform() returns a Series indexed exactly like its input, so the
# assignment aligns row-for-row with df_filtered. The previous
# apply(...).reset_index(level=0, drop=True) relied on apply() prepending the
# group key to the index, which depends on the pandas version.
df_filtered["rolling_avg_prs"] = (
    df_filtered.sort_values("date")
    .groupby("repo")["number_of_prs"]
    .transform(lambda s: s.rolling(window=window_size, min_periods=1).mean())
)

plt.figure(figsize=(12, 6))
sns.lineplot(data=df_filtered, x="date", y="rolling_avg_prs", hue="repo", marker="o")
plt.title(f"{window_size}-Week Rolling Average of Number of PRs Over Time")
plt.xlabel("Week")
plt.ylabel("Number of PRs (Rolling Avg)")
plt.xticks(rotation=45, ha='right')
plt.legend(title="Repository")
plt.grid(True)
plt.tight_layout()
plt.show()