# Dataset: Swift and First-Party Libraries (apple/swift-*, swift-server/*)

## Dataset: Size and Scope

In [None]:
from notebook_utils import *
# from arrow_utils import *

In [None]:
# Overview of ~/gharchive-swift: its directory listing, its total size on disk,
# and the size of each top-level entry whose name ends in "gz".
!echo "---"
!ls     ~/gharchive-swift
!echo "---"
# du flags: -h prints human-readable sizes, -s prints one summarized total per argument.
!du -hs ~/gharchive-swift
!echo "---"
!du -hs ~/gharchive-swift/*gz

In [None]:
# List the contents of the swift.all directory, then the summarized on-disk
# size of each entry directly inside it.
!echo "---"
!ls     ~/gharchive-swift/swift.all
!echo "---"
!du -hs ~/gharchive-swift/swift.all/*

### Dataset: Orgs and Repos

In [None]:
# each dataset has an org/repo directory structure.
# tree flags: -d lists directories only, -L 2 limits the depth to two levels
# (presumably org, then repo -- confirm against the output).
!tree -L 2 -d /Users/matt/gharchive-swift/swift.all/PullRequestEvent/

### Dataset Time Range: 2015 - 2024 (through Feb 2024)

In [None]:
# Show the first and last ten lines of the long listing for the apple/swift
# PullRequestEvent directory. Presumably the entry names sort chronologically,
# so the two ends bracket the dataset's time range -- confirm against the output.
!ls -lh /Users/matt/gharchive-swift/swift.all/PullRequestEvent/apple/swift | head -n 10
!echo "---"
!ls -lh /Users/matt/gharchive-swift/swift.all/PullRequestEvent/apple/swift | tail -n 10

## Notebook Setup, Helpers

In [None]:
# from pathlib import Path
# import os
import pyarrow.parquet as pq


# Root directory that holds one sub-directory per dataset.
DATASETS_ROOT_PATH = "/Users/matt/gharchive-swift/swift.all"
DATASETS_ROOT = Path(DATASETS_ROOT_PATH)

# Map each immediate sub-directory name to its full path; non-directories are skipped.
DATASET_PATHS = {}
for entry in DATASETS_ROOT.iterdir():
    if not entry.is_dir():
        continue
    DATASET_PATHS[entry.name] = entry

# Last expression of the cell, so the notebook displays the mapping.
DATASET_PATHS

In [None]:
# Load every discovered dataset, keyed by its directory name, and print a
# progress line before each load.
DATASETS = {}

for dataset_name in DATASET_PATHS:
    dataset_dir = DATASET_PATHS[dataset_name]
    print(f'Processing: {dataset_name} -> {dataset_dir}')
    DATASETS[dataset_name] = load_parquet_dataset(dataset_name, dataset_dir)

### Persist per-event Schema -> files

In [None]:
# Write each dataset's schema, as text, to "<name>.schema" in the current
# working directory.
# Mode 'w' (not 'a'): re-running this cell must replace the file; appending
# would add another copy of the same schema on every run.
for name, dataset in DATASETS.items():
    print(f'name: {name}')
    with open(f'{name}.schema', 'w', encoding='utf-8') as f:
        f.write(str(dataset.schema))

In [None]:
# Long listing of the *.schema files in the current working directory.
!ls -l *.schema

#### WatchEvent Schema (example 1)

In [None]:
# Print the contents of the WatchEvent schema file.
!cat WatchEvent.schema

#### ReleaseEvent Schema (example 2)

In [None]:
# Print the contents of the ReleaseEvent schema file.
!cat ReleaseEvent.schema

In [None]:
# Holder for per-dataset schema summaries; nothing in this cell fills it.
schema_summaries = {}

def dataset_schema_summary(dataset_path: str) -> pd.DataFrame:
    """
    Summarize the distinct schema fields found across a dataset's fragments.

    The physical schema of every fragment is read and one row per field is
    collected. Rows are then de-duplicated on (Field Name, Type, Nullable), so
    each distinct field definition appears once, attributed to the first
    fragment in which it was seen. The DataFrame shape is printed before and
    after de-duplication.

    Parameters:
    - dataset_path: Location of the Parquet dataset; passed unchanged to
      ``ds.dataset``.

    Returns:
    A pandas DataFrame with the columns "Fragment", "Field Name", "Type" and
    "Nullable". When the dataset has no fragments the frame is empty but still
    carries these columns.
    """
    columns = ["Fragment", "Field Name", "Type", "Nullable"]
    dataset = ds.dataset(dataset_path, format="parquet")  # Adjust format as needed
    summary = []

    for fragment in dataset.get_fragments():
        for field in fragment.physical_schema:
            summary.append({
                "Fragment": fragment.path,
                "Field Name": field.name,
                "Type": str(field.type),
                "Nullable": field.nullable,
            })

    # Naming the columns explicitly keeps them present when `summary` is empty;
    # without that, drop_duplicates(subset=...) raises KeyError on a frame
    # that has no columns.
    df = pd.DataFrame(summary, columns=columns)
    print(f'{df.shape}')
    df = df.drop_duplicates(subset=["Field Name", "Type", "Nullable"])
    print(f'{df.shape}')
    return df

## Construction Zone (WIP)

In [None]:
# TODO: after we create monthlies, too expensive to run now
# for name, path in DATASET_PATHS.items():
#     print(f'name: {name}')
#     df = dataset_schema_summary(path)
#     schema_summaries[name] = df
# schema_summaries

In [None]:
# Assuming `df` is your DataFrame loaded from the Parquet dataset.
# Count the rows in each (partition_key1, partition_key2) group.
# size() + reset_index(name='count') yields a single column literally named
# 'count', which is the column the chart helpers in this notebook read.
# agg('count') would instead return one non-null count per remaining column
# and no 'count' column at all.
summary_df = (
    df.groupby(['partition_key1', 'partition_key2'])
    .size()
    .reset_index(name='count')
)

In [None]:
import plotly.express as px

def generate_sunburst_chart(summary_df, filename="sunburst_chart.png"):
    """
    Build a sunburst figure from `summary_df` and hand it to
    `save_plotly_chart_as_png` together with `filename`.

    Parameters:
    - summary_df: Data passed to `px.sunburst`; the columns 'partition_key1',
      'partition_key2' and 'count' are referenced by name.
    - filename: Forwarded unchanged to `save_plotly_chart_as_png`.
    """
    sunburst_options = {
        "path": ['partition_key1', 'partition_key2'],  # Adjust based on your dataset
        "values": 'count',  # This should be your aggregate column
        "color": 'count',
        "title": 'Data Distribution Across Partitions',
        "color_continuous_scale": 'RdBu',
    }
    figure = px.sunburst(summary_df, **sunburst_options)

    # Zero out all four margins around the figure.
    no_margin = {"t": 0, "l": 0, "r": 0, "b": 0}
    figure.update_layout(margin=no_margin)

    save_plotly_chart_as_png(figure, filename)



In [None]:
import altair as alt

def generate_timeline_chart(summary_df, filename="timeline_chart.png"):
    """
    Build a line chart (with point markers) of 'count' over 'time' and hand it
    to `save_altair_chart_as_png` together with `filename`.

    Parameters:
    - summary_df: Data passed to `alt.Chart`; the fields 'time', 'count',
      'partition_key1' and 'partition_key2' are referenced by name.
    - filename: Forwarded unchanged to `save_altair_chart_as_png`.
    """
    encodings = {
        "x": 'time:T',  # Adjust for your time-related partition key
        "y": 'count:Q',
        "tooltip": ['partition_key1', 'partition_key2', 'count'],  # Adjust tooltips as needed
    }
    line_chart = alt.Chart(summary_df).mark_line(point=True).encode(**encodings)
    line_chart = line_chart.properties(
        width=800,
        height=400,
        title='Data Counts Over Time',
    )
    save_altair_chart_as_png(line_chart, filename)



In [None]:
def generate_scatterplot_over_time(summary_df, filename="scatterplot_over_time.png"):
    """
    Build a point chart of 'count' over 'time', mark it interactive, and hand
    it to `save_altair_chart_as_png` together with `filename`.

    Parameters:
    - summary_df: Data passed to `alt.Chart`; the fields 'time', 'count',
      'partition_key1' and 'partition_key2' are referenced by name.
    - filename: Forwarded unchanged to `save_altair_chart_as_png`.
    """
    point_marks = alt.Chart(summary_df).mark_point()
    encoded = point_marks.encode(
        x='time:T',  # Adjust for your time-related partition key
        y='count:Q',
        tooltip=['partition_key1', 'partition_key2', 'count'],  # Adjust tooltips as needed
    )
    sized = encoded.properties(
        title='Scatterplot of Data Over Time',
        width=800,
        height=400,
    )
    # interactive() enables panning and zooming
    save_altair_chart_as_png(sized.interactive(), filename)