# CNCF 2024 - Watch Events

In [1]:
# !echo "---"
# !ls     ~/gharchive-cncf.2024
# !echo "---"
# !du -hs ~/gharchive-cncf.2024
# !echo "---"

# !du -hs ~/gharchive-cncf.2024/*

In [2]:
import altair as alt
from pathlib import Path
import os
import pyarrow.parquet as pq
import pandas as pd
from datetime import datetime

# Altair data transformer.
# Only one data transformer is active at a time: each `enable()` call replaces
# the previous choice. "json" is the transformer this notebook actually runs
# with (see the printed configuration), so it is the only one enabled here;
# enabling "vegafusion" first would have no lasting effect.
# The "json" transformer writes each chart's dataset to a file whose name
# starts with the given prefix and references it by URL, instead of embedding
# the rows in the notebook.
alt.data_transformers.enable('json', prefix='altair-data')

print("Altair data transformer configuration:", alt.data_transformers.active)

# Root directory holding the consolidated GH Archive parquet files.
# Set the GHARCHIVE_CNCF_ROOT environment variable to point somewhere else;
# the default is the path the notebook was originally written against.
DATASETS_ROOT_PATH = os.environ.get("GHARCHIVE_CNCF_ROOT", "/Users/matt/gharchive-CNCF.2024")
DATASETS_ROOT = Path(DATASETS_ROOT_PATH).expanduser()

# Map every parquet file directly under the root to its full path,
# keyed by file name (e.g. 'WatchEvent-consolidated.parquet').
DATASET_PATHS = {
    entry.name: entry
    for entry in DATASETS_ROOT.iterdir()
    if entry.is_file() and entry.name.endswith('.parquet')
}

# CNCF projects table; the code below uses its 'org' and 'tag' columns.
cncf_projects = pd.read_csv('../../../notebooks/cncf/projects-with-github.csv')

# Lookup from GitHub org to CNCF tag.
# NOTE(review): if an org appears on several rows, only the last row's tag
# survives the conversion to a dict — confirm orgs are unique in the CSV.
org_to_tag = cncf_projects.set_index('org')['tag'].to_dict()

Altair data transformer configuration: json


In [3]:
# Load the consolidated WatchEvent parquet file (via pyarrow) into pandas
watch_events_path = DATASET_PATHS['WatchEvent-consolidated.parquet']
watch_events_df = pq.read_table(watch_events_path).to_pandas()

# Report the (rows, columns) shape, then preview the first rows as cell output
print(f"WatchEvents dataset shape: {watch_events_df.shape}")
watch_events_df.head()


WatchEvents dataset shape: (322934, 25)


Unnamed: 0,actor.avatar_url,actor.display_login,actor.gravatar_id,actor.id,actor.login,actor.url,created_at,day,id,month,...,org_name,payload.action,public,repo.id,repo.name,repo.url,repo_name,repo_org_and_name,type,year
0,https://avatars.githubusercontent.com/u/82771176?,jcunniff,,82771176,jcunniff,https://api.github.com/users/jcunniff,2024-01-07T20:53:32Z,7,34633260024,1,...,Azure,started,True,379248577,Azure/AKS-Construction,https://api.github.com/repos/Azure/AKS-Constru...,AKS-Construction,Azure/AKS-Construction,WatchEvent,2024
1,https://avatars.githubusercontent.com/u/25544967?,soberich,,25544967,soberich,https://api.github.com/users/soberich,2024-02-18T14:06:33Z,18,35774846935,2,...,Azure,started,True,379248577,Azure/AKS-Construction,https://api.github.com/repos/Azure/AKS-Constru...,AKS-Construction,Azure/AKS-Construction,WatchEvent,2024
2,https://avatars.githubusercontent.com/u/50720069?,MarianoChic09,,50720069,MarianoChic09,https://api.github.com/users/MarianoChic09,2024-01-19T11:27:44Z,19,34958652592,1,...,Azure,started,True,379248577,Azure/AKS-Construction,https://api.github.com/repos/Azure/AKS-Constru...,AKS-Construction,Azure/AKS-Construction,WatchEvent,2024
3,https://avatars.githubusercontent.com/u/12050419?,kjarus,,12050419,kjarus,https://api.github.com/users/kjarus,2024-04-12T12:51:15Z,12,37407306788,4,...,Azure,started,True,379248577,Azure/AKS-Construction,https://api.github.com/repos/Azure/AKS-Constru...,AKS-Construction,Azure/AKS-Construction,WatchEvent,2024
4,https://avatars.githubusercontent.com/u/511705?,Xoib,,511705,Xoib,https://api.github.com/users/Xoib,2024-04-08T07:17:20Z,8,37252864695,4,...,Azure,started,True,379248577,Azure/AKS-Construction,https://api.github.com/repos/Azure/AKS-Constru...,AKS-Construction,Azure/AKS-Construction,WatchEvent,2024


In [4]:
# Attach a CNCF tag to every event by looking its org up in org_to_tag;
# orgs that are not in the mapping end up with a missing (NaN) tag
watch_events_df['tag'] = watch_events_df['org_name'].map(org_to_tag)

# Parse created_at into datetimes so it can be bucketed by day
watch_events_df['created_at'] = pd.to_datetime(watch_events_df['created_at'])

# Count tagged events per (calendar day, tag) pair
daily_watches = (
    watch_events_df
    .loc[lambda events: events['tag'].notna()]
    .groupby([pd.Grouper(key='created_at', freq='D'), 'tag'])
    .size()
    .rename('watch_count')
    .reset_index()
)

# One line per tag: date on the x axis, daily event count on the y axis
chart = (
    alt.Chart(daily_watches)
    .mark_line()
    .encode(
        x=alt.X('created_at:T', title='Date'),
        y=alt.Y('watch_count:Q', title='Watch Events'),
        color=alt.Color('tag:N', title='Project Tag'),
        tooltip=['created_at', 'tag', 'watch_count'],
    )
    .properties(
        width=800,
        height=400,
        title='GitHub Watch Events by CNCF Project Tag Over Time',
    )
    .interactive()
)

chart


In [5]:
# Keep only the events whose tag is exactly 'observability'
observability_watches = watch_events_df[watch_events_df['tag'].eq('observability')]

# Count those events per (calendar day, repository name) pair
daily_observability_watches = (
    observability_watches
    .groupby([pd.Grouper(key='created_at', freq='D'), 'repo_name'])
    .size()
    .rename('watch_count')
    .reset_index()
)

# One line per repository: date on the x axis, daily event count on the y axis
observability_chart = (
    alt.Chart(daily_observability_watches)
    .mark_line()
    .encode(
        x=alt.X('created_at:T', title='Date'),
        y=alt.Y('watch_count:Q', title='Watch Events'),
        color=alt.Color('repo_name:N', title='Project Name'),
        tooltip=['created_at', 'repo_name', 'watch_count'],
    )
    .properties(
        width=800,
        height=400,
        title='GitHub Watch Events for Observability Projects Over Time',
    )
    .interactive()
)

observability_chart


In [6]:
# How many observability repositories to keep, ranked by total watch events.
# The variable names below keep their "top15" suffix for compatibility with
# later cells.
TOP_N_OBSERVABILITY_REPOS = 15

# Rank repositories by their total number of watch events, keep the top N, and
# merge back onto the event rows so only events for those repositories remain.
# The merge also adds a total_watch_count column to every remaining event row.
# NOTE(review): repo_name carries no org prefix, so same-named repositories in
# different orgs are counted together here — confirm that is intended
# (repo_org_and_name would keep them apart).
observability_watches_top15 = (
    observability_watches
    .groupby('repo_name')
    .size()
    .reset_index(name='total_watch_count')
    .nlargest(TOP_N_OBSERVABILITY_REPOS, 'total_watch_count')
    .merge(observability_watches, on='repo_name')
)

# Count events per (calendar day, repository name) for the top-N repositories
daily_observability_watches_top15 = (
    observability_watches_top15
    .groupby([pd.Grouper(key='created_at', freq='D'), 'repo_name'])
    .size()
    .reset_index(name='watch_count')
)

# One line per top-N repository: date on the x axis, daily count on the y axis
observability_chart_top15 = alt.Chart(daily_observability_watches_top15).mark_line().encode(
    x=alt.X('created_at:T', title='Date'),
    y=alt.Y('watch_count:Q', title='Watch Events'),
    color=alt.Color('repo_name:N', title='Project Name'),
    tooltip=['created_at', 'repo_name', 'watch_count']
).properties(
    width=800,
    height=400,
    title=f'GitHub Watch Events for Top {TOP_N_OBSERVABILITY_REPOS} Observability Projects Over Time'
).interactive()

observability_chart_top15


In [7]:
# Daily event counts per repository, indexed by (repo_name, day)
daily_counts_per_repo = (
    observability_watches_top15
    .groupby(['repo_name', pd.Grouper(key='created_at', freq='D')])
    .size()
)

# Running total of those daily counts, accumulated separately for each repository
cumulative_observability_watches_top15 = (
    daily_counts_per_repo
    .groupby(level='repo_name')
    .cumsum()
    .reset_index(name='cumulative_watch_count')
)

# One line per repository showing its running total over time
cumulative_observability_chart_top15 = (
    alt.Chart(cumulative_observability_watches_top15)
    .mark_line()
    .encode(
        x=alt.X('created_at:T', title='Date'),
        y=alt.Y('cumulative_watch_count:Q', title='Cumulative Watch Events'),
        color=alt.Color('repo_name:N', title='Project Name'),
        tooltip=['created_at', 'repo_name', 'cumulative_watch_count'],
    )
    .properties(
        width=800,
        height=400,
        title='Cumulative GitHub Watch Events for Top 15 Observability Projects Over Time',
    )
    .interactive()
)

cumulative_observability_chart_top15


In [8]:
# Count events per (calendar day, tag) pair. No explicit filter is applied:
# groupby drops rows whose tag is missing by default (dropna=True).
daily_watches_by_tag = (
    watch_events_df
    .groupby([pd.Grouper(key='created_at', freq='D'), 'tag'])
    .size()
    .rename('watch_count')
    .reset_index()
)

# One line per tag: date on the x axis, daily event count on the y axis
watches_by_tag_chart = (
    alt.Chart(daily_watches_by_tag)
    .mark_line()
    .encode(
        x=alt.X('created_at:T', title='Date'),
        y=alt.Y('watch_count:Q', title='Watch Events'),
        color=alt.Color('tag:N', title='Tag'),
        tooltip=['created_at', 'tag', 'watch_count'],
    )
    .properties(
        width=800,
        height=400,
        title='GitHub Watch Events by Tag Over Time',
    )
    .interactive()
)

watches_by_tag_chart
