In [1]:
# Auto update imports: load IPython's autoreload extension and use mode 2 so
# that imported modules are reloaded before each cell executes. This lets
# edits to the project's .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from op_analytics.datasources.github.metrics.execute import execute_pull_pr_metrics
from op_analytics.datasources.github.dataaccess import Github
from op_analytics.coreutils.logger import structlog
from op_analytics.coreutils.partitioned.dailydatautils import dt_summary

# Imported up front, outside the try block, so that a missing module surfaces
# as a plain ImportError instead of being reported as a data-loading failure.
from op_analytics.coreutils.duckdb_inmem.client import init_client

log = structlog.get_logger()

# Cell 2: Load and Verify Activity Data
# Reads the three GitHub activity datasets, materializes each one as a
# DataFrame through the in-memory DuckDB client, and logs / displays a
# summary for each. The resulting frames are kept in `activity_dfs`, keyed by
# dataset name, for the cells that follow.
try:
    # Read all available data without date filters initially
    activity_views = {
        "prs": Github.PRS.read(),
        "pr_comments": Github.PR_COMMENTS.read(),
        "pr_reviews": Github.PR_REVIEWS.read(),
    }

    # Query the data using DuckDB context
    duckdb_ctx = init_client()

    activity_dfs = {}
    for name, view in activity_views.items():
        # NOTE(review): `view` is interpolated straight into the SQL text; it
        # is presumably the name of a view registered by read() - confirm.
        df = duckdb_ctx.client.sql(f"SELECT * FROM {view}").pl()
        activity_dfs[name] = df

        # Show data summary
        summary = dt_summary(df)
        # presumably maps each dt to a row count - verify against dt_summary
        dt_items = summary["dts"].items()
        log.info(
            f"loaded {name} data",
            rows=df.height,
            # default=None keeps an empty dataset from raising ValueError here
            # (which would otherwise be misreported as a load failure below).
            date_range=[min(dt_items, default=None), max(dt_items, default=None)],
            repos=df["repo"].unique().to_list(),
        )
        print(name)
        print(view)
        display(summary)
except Exception as e:
    # Log for the structured log stream, then re-raise so the cell still fails.
    log.error("Failed to load activity data", error=str(e))
    raise

### Sanity checks

In [None]:
# Print the column names, schema and dt bounds of every loaded frame so they
# can be checked by eye.
for name, df in activity_dfs.items():
    print(f"Name: {name}, Columns: {df.columns}, df.schema: {df.schema}")
    print(df["dt"].min(), df["dt"].max())

# Assert that all data is in the same date range: the three datasets must
# share both their earliest and their latest dt.
prs_dt, comments_dt, reviews_dt = (
    activity_dfs[key]["dt"] for key in ("prs", "pr_comments", "pr_reviews")
)
assert prs_dt.min() == comments_dt.min() == reviews_dt.min()
assert prs_dt.max() == comments_dt.max() == reviews_dt.max()

In [None]:
# The dt bounds of the PR dataset define the backfill window.
prs_dt = activity_dfs["prs"]["dt"]
BACKFILL_START, BACKFILL_END = prs_dt.min(), prs_dt.max()

# Key/value pairs attached to every log line emitted by this cell.
backfill_window = {"start": BACKFILL_START, "end": BACKFILL_END}

# Process all data at once
log.info("processing full date range", **backfill_window)

try:
    # Run the metrics computation over the whole window in a single call
    # (per the original author's note, this also writes the results to GCS).
    summary = execute_pull_pr_metrics(min_date=BACKFILL_START, max_date=BACKFILL_END)
    log.info("completed full date range", **backfill_window, summary=summary)
except Exception as err:
    # Record the failure with the window it applied to, then let it propagate.
    log.error("failed to process full date range", **backfill_window, error=str(err))
    raise

## Verify the data

In [None]:
from op_analytics.coreutils.partitioned.dailydata import DataLocation
# Read the PR_METRICS dataset back, explicitly from DataLocation.LOCAL, and
# keep the result in `pr_metrics` for inspection.
# NOTE(review): the backfill cell's comment says the metrics are written to
# GCS, while this read targets LOCAL - confirm this is the intended location.
pr_metrics = Github.PR_METRICS.read(location=DataLocation.LOCAL)

In [None]:

# Cell 6: Create Clickhouse View
# Calls create_clickhouse_view() on the PR_METRICS dataset and then logs that
# the whole backfill notebook has finished. The log line only runs if the
# view creation call returns without raising.
Github.PR_METRICS.create_clickhouse_view()
log.info("backfill and view creation completed")