In [2]:
from op_analytics.datasources.github.dataaccess import Github
from op_analytics.coreutils.duckdb_inmem.client import init_client

# Read the PRs dataset. The returned value is interpolated as the table name in
# the SQL below (the log output shows it being registered as a DuckDB view).
prs_table = Github.PRS.read()
ctx = init_client()

# Sanity check: how many PR rows there are for each "dt".
rows_per_dt_sql = f"SELECT dt, count(*) FROM {prs_table} GROUP BY 1 ORDER BY 1"
print(ctx.client.sql(rows_per_dt_sql))

# Load only the columns needed to look up reviews, as a polars dataframe.
prs_sql = f"SELECT dt, repo, number FROM {prs_table}"
prs_df = ctx.client.sql(prs_sql).pl()

[2m2025-01-28 14:14:45[0m [[32m[1minfo     [0m] [1mReading data from 'github/github_prs_v1' with filters min_date=None, max_date=None, date_range_spec=None[0m [36mfilename[0m=[35mdailydata.py[0m [36mlineno[0m=[35m187[0m [36mprocess[0m=[35m58371[0m
[2m2025-01-28 14:14:47[0m [[32m[1minfo     [0m] [1mregistered view: 'github_github_prs_v1' using uri wildcard 'gs://oplabs-tools-data-sink/github/github_prs_v1/dt=*/out.parquet'[0m [36mfilename[0m=[35mclient.py[0m [36mlineno[0m=[35m291[0m [36mprocess[0m=[35m58371[0m
┌──────────────────────┐
│         name         │
│       varchar        │
├──────────────────────┤
│ github_github_prs_v1 │
└──────────────────────┘

┌────────────┬──────────────┐
│     dt     │ count_star() │
│    date    │    int64     │
├────────────┼──────────────┤
│ 2020-12-31 │          260 │
│ 2021-12-31 │         1316 │
│ 2022-12-31 │         2352 │
│ 2023-12-31 │         3961 │
│ 2025-01-13 │         6411 │
│ 2025-01-15 │          42

In [4]:
# Helper functions.

import os
from unittest.mock import patch

import polars as pl

from op_analytics.coreutils.partitioned import dailydata
from op_analytics.coreutils.partitioned.location import DataLocation
from op_analytics.coreutils.time import date_fromstr
from op_analytics.datasources.github.activity.githubapi import OptimismRepo, bulk_fetch_reviews
from op_analytics.datasources.github.dataaccess import Github


def fetch_backfill_reviews(target_date: str, pull_requests_df: pl.DataFrame) -> pl.DataFrame:
    """Fetch the backfill reviews dataframe for a single "dt".

    Args:
        target_date: The "dt" to backfill, as a date string accepted by
            ``date_fromstr``.
        pull_requests_df: Pull requests dataframe. Must have the columns "dt",
            "repo" and "number".

    Returns:
        The concatenation, across all repos that have pull requests on
        ``target_date``, of the dataframes returned by ``bulk_fetch_reviews``.
        Each row gets two extra columns: "repo" and "dt" (the ``target_date``
        string).

    Raises:
        ValueError: If ``pull_requests_df`` has no rows for ``target_date``.
    """

    target_dt = date_fromstr(target_date)

    # maintain_order=True is what makes the sort effective: without it group_by
    # yields the groups in arbitrary order, so both the progress output and the
    # row order of the result would vary from run to run.
    groups = pull_requests_df.sort("dt", "repo").group_by("dt", "repo", maintain_order=True)

    dfs = []
    for (dt, repo), group_df in groups:
        if dt != target_dt:
            continue

        # Progress line: one per repo being fetched.
        print(dt, repo, len(group_df))
        repo_obj = OptimismRepo(repo)
        pr_numbers = group_df["number"].to_list()
        pr_reviews = bulk_fetch_reviews(repo_obj, pr_numbers)

        # Tag each review with the repo and the backfill date it belongs to.
        extracols = dict(repo=pl.lit(repo), dt=pl.lit(target_date))
        dfs.append(pr_reviews.with_columns(**extracols))

    # pl.concat() rejects an empty list with an opaque message. Fail with the
    # same exception type but say which date had no pull requests.
    if not dfs:
        raise ValueError(f"no pull requests found for dt={target_date!r}")

    return pl.concat(dfs)


def overwrite_reviews(new_reviews_df):
    """Overwrite the reviews stored in GCS with the backfill reviews dataframe."""

    def _always_gcs():
        # Stand-in for dailydata.determine_location. Otherwise the location
        # defaults to DataLocation.LOCAL when running from a laptop.
        return DataLocation.GCS

    # Required to allow writing data to GCS from local.
    os.environ["ALLOW_WRITE"] = "true"

    review_sort_columns = ["repo", "pr_number", "submitted_at"]
    with patch.object(dailydata, "determine_location", new=_always_gcs):
        Github.PR_REVIEWS.write(dataframe=new_reviews_df, sort_by=review_sort_columns)

In [6]:
# Backfill a single "dt" at a time.
#
# The first few dates have a lot of historical data, so they were run by hand,
# one by one, by changing the target date below. The original backfill "dt"
# values used here were:
#
#   "2020-12-31"
#   "2021-12-31"
#   "2022-12-31"
#   "2023-12-31"
#   "2025-01-13"
#   "2025-01-15"
reviews_df = fetch_backfill_reviews(target_date="2025-01-16", pull_requests_df=prs_df)


2025-01-16 design-docs 27
[2m2025-01-28 14:16:04[0m [[32m[1minfo     [0m] [1mfetching reviews for 27 prs   [0m [36mfilename[0m=[35mgithubapi.py[0m [36mlineno[0m=[35m99[0m [36mprocess[0m=[35m58371[0m
[2m2025-01-28 14:16:06[0m [[32m[1minfo     [0m] [1mfetching data for prs. completed 1 of 27[0m [36mfilename[0m=[35mgithubapi.py[0m [36mlineno[0m=[35m122[0m [36mprocess[0m=[35m58371[0m
[2m2025-01-28 14:16:15[0m [[32m[1minfo     [0m] [1mfetching data for prs. completed 6 of 27[0m [36mfilename[0m=[35mgithubapi.py[0m [36mlineno[0m=[35m122[0m [36mprocess[0m=[35m58371[0m
[2m2025-01-28 14:16:25[0m [[32m[1minfo     [0m] [1mfetching data for prs. completed 11 of 27[0m [36mfilename[0m=[35mgithubapi.py[0m [36mlineno[0m=[35m122[0m [36mprocess[0m=[35m58371[0m
[2m2025-01-28 14:16:35[0m [[32m[1minfo     [0m] [1mfetching data for prs. completed 16 of 27[0m [36mfilename[0m=[35mgithubapi.py[0m [36mlineno[0m=[35m122[0m 

In [7]:
# Check how many review rows were fetched before writing them out.
len(reviews_df)

1731

In [8]:
# Write the reviews fetched above to GCS.
overwrite_reviews(new_reviews_df=reviews_df)

[2m2025-01-28 14:30:42[0m [[32m[1mdebug    [0m] [1mFound vault variable GOOGLE_SERVICE_ACCOUNT (has JSON key)[0m [36mfilename[0m=[35mgcpauth.py[0m [36mlineno[0m=[35m18[0m [36mprocess[0m=[35m58371[0m [36mroot[0m=[35mgithub/github_pr_reviews_v2[0m
[2m2025-01-28 14:30:42[0m [[32m[1minfo     [0m] [1mInitialized gcsfs client for bucket=gs://oplabs-tools-data-sink[0m [36mfilename[0m=[35mgcs_parquet.py[0m [36mlineno[0m=[35m32[0m [36mprocess[0m=[35m58371[0m [36mroot[0m=[35mgithub/github_pr_reviews_v2[0m
[2m2025-01-28 14:30:42[0m [[32m[1minfo     [0m] [1mdone writing 1.7Krows 88.2KB  [0m [36mfilename[0m=[35mgcs_parquet.py[0m [36mlineno[0m=[35m57[0m [36mmaxrss[0m=[35m301973504[0m [36mpath[0m=[35moplabs-tools-data-sink/github/github_pr_reviews_v2/dt=2025-01-16/out.parquet[0m [36mprocess[0m=[35m58371[0m [36mroot[0m=[35mgithub/github_pr_reviews_v2[0m [36mrows[0m=[35m1731[0m [36msize[0m=[35m88172[0m
[2m2025-01-28 1

In [None]:
# Each of the most recent dates goes much faster, so these were run in one loop
# instead of by hand.

# "2025-01-17" through "2025-01-28", inclusive.
recent_dates = [f"2025-01-{day}" for day in range(17, 29)]

for val in recent_dates:
    reviews_df = fetch_backfill_reviews(target_date=val, pull_requests_df=prs_df)
    # Separator plus a one-line summary: the date and the number of review rows.
    print(f"\n---\n{val} {len(reviews_df)}")
    overwrite_reviews(new_reviews_df=reviews_df)