# CNCF 2024 - Pull Requests

In [1]:
import altair as alt
from pathlib import Path
import os
import pyarrow.parquet as pq
import pandas as pd
from datetime import datetime

# Altair keeps exactly one data transformer active at a time: every
# `enable(...)` call replaces the previous choice instead of stacking on it.
# This cell used to enable "vegafusion" and then immediately enable "json",
# so only "json" was ever in effect. The overridden call is dropped so the
# code matches what actually runs. To use VegaFusion instead, replace the
# line below with `alt.data_transformers.enable("vegafusion")`.
alt.data_transformers.enable('json', prefix='altair-data')

print("Altair data transformer configuration:", alt.data_transformers.active)

# Root directory holding the consolidated GH Archive parquet files.
# Set the GHARCHIVE_CNCF_ROOT environment variable to point somewhere else;
# the default is the original author's local path.
DATASETS_ROOT_PATH = os.environ.get("GHARCHIVE_CNCF_ROOT", "/Users/matt/gharchive-CNCF.2024")
DATASETS_ROOT = Path(DATASETS_ROOT_PATH)
if not DATASETS_ROOT.is_dir():
    raise FileNotFoundError(
        f"Dataset directory not found: {DATASETS_ROOT} "
        "(set GHARCHIVE_CNCF_ROOT to override the default location)"
    )

# Map each parquet file name (e.g. 'PullRequestEvent-consolidated.parquet')
# to its full path. Only regular files directly inside the root are indexed.
DATASET_PATHS = {
    entry.name: entry
    for entry in DATASETS_ROOT.iterdir()
    if entry.is_file() and entry.name.endswith('.parquet')
}

# Read CNCF projects data (path is relative to the kernel's working directory)
cncf_projects = pd.read_csv('../../../notebooks/cncf/projects-with-github.csv')

# Create mapping from org to tag; if an org appears on several rows,
# the last row's tag is the one kept by `to_dict()`.
org_to_tag = cncf_projects.set_index('org')['tag'].to_dict()

Altair data transformer configuration: json


In [2]:
# Load the consolidated PullRequestEvent parquet file into a pandas DataFrame.
# A KeyError here means the file was not found under DATASETS_ROOT.
pr_events_path = DATASET_PATHS['PullRequestEvent-consolidated.parquet']
pr_events_df = pq.read_table(pr_events_path).to_pandas()

# Display basic info about the dataset.
# (The label previously said "WatchEvents", a copy-paste leftover.)
print(f"PullRequestEvent dataset shape: {pr_events_df.shape}")
pr_events_df.head()


WatchEvents dataset shape: (767758, 440)


Unnamed: 0,actor.avatar_url,actor.display_login,actor.gravatar_id,actor.id,actor.login,actor.url,created_at,day,id,month,...,year,payload.pull_request.assignee.user_view_type,payload.pull_request.auto_merge.enabled_by.user_view_type,payload.pull_request.base.user.user_view_type,payload.pull_request.head.user.user_view_type,payload.pull_request.merged_by.user_view_type,payload.pull_request.user.user_view_type,payload.pull_request.milestone.creator.user_view_type,payload.pull_request.base.repo.owner.user_view_type,payload.pull_request.head.repo.owner.user_view_type
0,https://avatars.githubusercontent.com/u/3943066?,havetisyan,,3943066,havetisyan,https://api.github.com/users/havetisyan,2024-01-07T19:34:07Z,7,34632623082,1,...,2024,,,,,,,,,
1,https://avatars.githubusercontent.com/u/3943066?,havetisyan,,3943066,havetisyan,https://api.github.com/users/havetisyan,2024-01-21T21:01:01Z,21,34997032059,1,...,2024,,,,,,,,,
2,https://avatars.githubusercontent.com/u/3943066?,havetisyan,,3943066,havetisyan,https://api.github.com/users/havetisyan,2024-03-23T18:07:24Z,23,36823152849,3,...,2024,,,,,,,,,
3,https://avatars.githubusercontent.com/u/3943066?,havetisyan,,3943066,havetisyan,https://api.github.com/users/havetisyan,2024-03-31T22:45:08Z,31,37035494843,3,...,2024,,,,,,,,,
4,https://avatars.githubusercontent.com/u/3943066?,havetisyan,,3943066,havetisyan,https://api.github.com/users/havetisyan,2024-03-03T01:11:39Z,3,36191632461,3,...,2024,,,,,,,,,


In [3]:
# Make sure event timestamps are datetimes (unchanged if they already are),
# then label every event with the tag that `org_to_tag` assigns to its org.
# Both columns are written back onto pr_events_df because later cells read them.
pr_events_df['created_at'] = pd.to_datetime(pr_events_df['created_at'])
pr_events_df['tag'] = pr_events_df['org_name'].map(org_to_tag)

# Number of pull request events per calendar day and tag.
# Events whose org has no tag in the mapping are left out.
daily_prs = (
    pr_events_df
    .dropna(subset=['tag'])
    .groupby([pd.Grouper(key='created_at', freq='D'), 'tag'])
    .size()
    .rename('pr_count')
    .reset_index()
)

# One line per tag: date on x, daily event count on y.
tag_chart_encoding = dict(
    x=alt.X(field='created_at', type='temporal', title='Date'),
    y=alt.Y(field='pr_count', type='quantitative', title='Pull Request Events'),
    color=alt.Color(field='tag', type='nominal', title='Project Tag'),
    tooltip=['created_at', 'tag', 'pr_count'],
)

chart = (
    alt.Chart(daily_prs)
    .mark_line()
    .encode(**tag_chart_encoding)
    .properties(
        title='GitHub Pull Request Events by CNCF Project Tag Over Time',
        width=800,
        height=400,
    )
    .interactive()
)

chart

In [4]:
# Keep only the events whose tag is exactly 'observability'
observability_prs = pr_events_df.loc[pr_events_df['tag'].eq('observability')]

# Number of pull request events per calendar day and repository
daily_observability_prs = (
    observability_prs
    .groupby([pd.Grouper(key='created_at', freq='D'), 'repo_name'])
    .size()
    .rename('pr_count')
    .reset_index()
)

# One line per repository: date on x, daily event count on y.
observability_encoding = dict(
    x=alt.X(field='created_at', type='temporal', title='Date'),
    y=alt.Y(field='pr_count', type='quantitative', title='Pull Request Events'),
    color=alt.Color(field='repo_name', type='nominal', title='Project Name'),
    tooltip=['created_at', 'repo_name', 'pr_count'],
)

observability_chart = (
    alt.Chart(daily_observability_prs)
    .mark_line()
    .encode(**observability_encoding)
    .properties(
        title='GitHub Pull Request Events for Observability Projects Over Time',
        width=800,
        height=400,
    )
    .interactive()
)

observability_chart

In [5]:
# Rank observability repositories by their total number of pull request
# events and keep the 15 largest.
top15_repo_totals = (
    observability_prs
    .groupby('repo_name')
    .size()
    .rename('total_pr_count')
    .reset_index()
    .nlargest(15, 'total_pr_count')
)

# Join back to the event rows so only events from those 15 repositories
# remain; each row also carries its repository's total_pr_count.
observability_prs_top15 = top15_repo_totals.merge(observability_prs, on='repo_name')

# Number of pull request events per calendar day and repository (top 15 only)
daily_observability_prs_top15 = (
    observability_prs_top15
    .groupby([pd.Grouper(key='created_at', freq='D'), 'repo_name'])
    .size()
    .rename('pr_count')
    .reset_index()
)

# One line per repository: date on x, daily event count on y.
top15_encoding = dict(
    x=alt.X(field='created_at', type='temporal', title='Date'),
    y=alt.Y(field='pr_count', type='quantitative', title='Pull Request Events'),
    color=alt.Color(field='repo_name', type='nominal', title='Project Name'),
    tooltip=['created_at', 'repo_name', 'pr_count'],
)

observability_chart_top15 = (
    alt.Chart(daily_observability_prs_top15)
    .mark_line()
    .encode(**top15_encoding)
    .properties(
        title='GitHub Pull Request Events for Top 15 Observability Projects Over Time',
        width=800,
        height=400,
    )
    .interactive()
)

observability_chart_top15


In [6]:
# Daily event counts per repository, indexed by (repo_name, created_at day).
# Only days on which a repository has at least one event get a row.
per_repo_daily_counts = (
    observability_prs_top15
    .groupby(['repo_name', pd.Grouper(key='created_at', freq='D')])
    .size()
)

# Running total of events within each repository, in date order
cumulative_observability_prs_top15 = (
    per_repo_daily_counts
    .groupby(level='repo_name')
    .cumsum()
    .rename('cumulative_pr_count')
    .reset_index()
)

# Plain (unstacked) line chart: one cumulative line per repository.
cumulative_encoding = dict(
    x=alt.X(field='created_at', type='temporal', title='Date'),
    y=alt.Y(field='cumulative_pr_count', type='quantitative', title='Cumulative Pull Request Events'),
    color=alt.Color(field='repo_name', type='nominal', title='Project Name'),
    tooltip=['created_at', 'repo_name', 'cumulative_pr_count'],
)

cumulative_observability_pr_chart_top15 = (
    alt.Chart(cumulative_observability_prs_top15)
    .mark_line()
    .encode(**cumulative_encoding)
    .properties(
        title='Cumulative GitHub Pull Request Events for Top 15 Observability Projects Over Time',
        width=800,
        height=400,
    )
    .interactive()
)

cumulative_observability_pr_chart_top15