In [None]:
import sys
import logging
import pandas as pd
import numpy as np
from tqdm import tqdm
from google.cloud import bigquery
from collections import defaultdict
import matplotlib.pyplot as plt

# BigQuery client, constructed with no explicit arguments.
client = bigquery.Client()

# Route log records at INFO level and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

# Notebook-wide accumulator: commit SHA -> list of
# (per-commit build time, batch wait time) tuples. It is appended to by
# batch_commits_for_commit_build_times and is never reset in this cell's
# downstream code, so entries pile up across calls.
commit_mapping = defaultdict(list)

In [None]:
def _record_commit_batch(batch_shas, batch_workflows, batch_max_wait_time):
    """Credit every commit in a finished batch with an equal share of the batch's build time."""
    # A workflow contributes the average of all its builds across the batch's
    # commits; the batch cost is the sum of those per-workflow averages.
    batch_build_time = sum(np.mean(times) for times in batch_workflows.values())
    build_time_per_commit = batch_build_time / len(batch_shas)
    for sha in batch_shas:
        commit_mapping[sha].append((build_time_per_commit, batch_max_wait_time))


def batch_commits_for_commit_build_times(
    commits: pd.DataFrame,
    workflow_runs: pd.DataFrame,
    batch_max_wait_time: int,
):
    """
    Group commits into time-window batches and record each commit's share of
    its batch's build time in the module-level ``commit_mapping``.

    A batch opens at a commit and stays open for ``batch_max_wait_time``
    minutes after that commit's date; the first commit dated strictly after
    the window closes the batch and opens the next one. The cost of a batch is
    the sum, over the workflows seen in the batch, of the mean
    ``build_minutes`` of that workflow's runs. That cost is divided evenly
    between the commits of the batch.

    Args:
        commits: Frame with ``sha`` and ``date`` columns. Rows are processed
            in the order given, so they are expected to be sorted ascending
            by ``date`` (NOTE: not checked here).
        workflow_runs: Frame with ``head_sha``, ``workflow_id`` and
            ``build_minutes`` columns; rows are matched to commits through
            ``head_sha == sha``.
        batch_max_wait_time: Length of the batching window, in minutes.

    Returns:
        The module-level ``commit_mapping``. For every commit occurrence in
        ``commits`` a ``(build_time_per_commit, batch_max_wait_time)`` tuple
        has been appended under the commit's SHA. The mapping is shared, so
        results accumulate across calls. An empty ``commits`` frame leaves it
        untouched.
    """
    if len(commits) == 0:
        return commit_mapping

    # Index the runs by commit once, instead of re-filtering the whole
    # workflow_runs frame for every commit.
    runs_by_sha = defaultdict(list)
    for head_sha, workflow_id, build_minutes in zip(
        workflow_runs["head_sha"],
        workflow_runs["workflow_id"],
        workflow_runs["build_minutes"],
    ):
        runs_by_sha[head_sha].append((workflow_id, build_minutes))

    wait = pd.Timedelta(minutes=batch_max_wait_time)
    batch_shas = []
    batch_workflows = defaultdict(list)
    batch_end_time = None

    for sha, commit_date in zip(commits["sha"], commits["date"]):
        if batch_end_time is None or commit_date > batch_end_time:
            # The window has elapsed (or this is the very first commit):
            # flush the finished batch and open a new one at this commit.
            if batch_shas:
                _record_commit_batch(batch_shas, batch_workflows, batch_max_wait_time)
            batch_shas = []
            batch_workflows = defaultdict(list)
            batch_end_time = commit_date + wait

        # Every commit, including the one that opens a batch, contributes its
        # workflow runs to the batch it belongs to.
        batch_shas.append(sha)
        for workflow_id, build_minutes in runs_by_sha.get(sha, ()):
            batch_workflows[workflow_id].append(build_minutes)

    # Flush the batch that was still open when the commits ran out.
    _record_commit_batch(batch_shas, batch_workflows, batch_max_wait_time)

    return commit_mapping

In [None]:
def run_monte_carlo_simulation(
    all_commits: pd.DataFrame,
    workflow_runs: pd.DataFrame,
    iterations: int = 10,
    sample_size: int = 1000,
    seed=None,
):
    """
    Repeatedly bootstrap the commits and batch them with a random merge-queue delay.

    Each iteration draws ``sample_size`` commits with replacement, restores
    the original row order by sorting on the index, picks a batch delay
    uniformly from 1 to 60 minutes, and passes the sample to
    ``batch_commits_for_commit_build_times``. The per-commit results
    accumulate in the module-level ``commit_mapping``.

    Args:
        all_commits: Commits to sample from. Its index order is expected to be
            ascending by commit time (NOTE: not checked here).
        workflow_runs: Workflow runs forwarded unchanged to the batching function.
        iterations: Number of bootstrap iterations to run.
        sample_size: Number of commits drawn (with replacement) per iteration.
        seed: Optional integer seed. When given, sampling and delay draws come
            from a dedicated ``numpy.random.RandomState`` so the run is
            reproducible; when ``None`` the global NumPy random state is used.

    Returns:
        An empty list. Results are only available through ``commit_mapping``.

    Raises:
        ValueError: If ``all_commits`` has no rows and ``iterations`` is positive.
    """
    if iterations > 0 and len(all_commits) == 0:
        raise ValueError("all_commits is empty; there is nothing to sample from")

    rng = None if seed is None else np.random.RandomState(seed)
    draw_delay = np.random.randint if rng is None else rng.randint

    for _ in tqdm(range(iterations)):
        # sort_index() restores the original order, which starts at the
        # earliest commit in range and is ascending by time.
        bootstrap_sample = all_commits.sample(
            n=sample_size, replace=True, random_state=rng
        ).sort_index()

        merge_queue_batch_delay = draw_delay(1, 61)  # upper bound is exclusive
        batch_commits_for_commit_build_times(
            bootstrap_sample, workflow_runs, merge_queue_batch_delay
        )

    # Summarise instead of dumping the whole mapping into the cell output.
    print(f"Recorded build-time samples for {len(commit_mapping)} commits")

    return []