In [6]:
# OneLake (abfss) location of the lakehouse's Tables/ folder.
URL = 'abfss://github_event@onelake.dfs.fabric.microsoft.com/github_event_lakehouse.Lakehouse/Tables/'

# URL already ends with "/", so interpolating it as f"{URL}/name/" produced
# ".../Tables//name/" with a doubled separator. Strip the trailing slash once
# and build every table path from that root. URL itself is left unchanged.
_tables_root = URL.rstrip("/")

# Folder of each Delta table under Tables/
adl_url_events = f"{_tables_root}/public_events/"
adl_url_push_events = f"{_tables_root}/public_push_events/"
adl_url_commits = f"{_tables_root}/public_commits/"
adl_url_pull_request_events = f"{_tables_root}/public_pull_request_events/"
adl_url_trending = f"{_tables_root}/public_trending/"

def _read_delta(path):
    """Return a DataFrame over the Delta table stored at *path*."""
    return spark.read.format("Delta").load(path)


# One DataFrame per source table, read from its lakehouse folder.
events_df = _read_delta(adl_url_events)
push_events_df = _read_delta(adl_url_push_events)
commits_df = _read_delta(adl_url_commits)
pull_request_events_df = _read_delta(adl_url_pull_request_events)
trending_df = _read_delta(adl_url_trending)

# Expose each DataFrame to Spark SQL under the view name the queries use.
for view_name, frame in (
    ("events", events_df),
    ("push_events", push_events_df),
    ("pull_requests", pull_request_events_df),
    ("trending", trending_df),
    ("commits", commits_df),
):
    frame.createOrReplaceTempView(view_name)

# Number of rows in `events` per calendar date of created_at, oldest first.
activity_query = """
    SELECT date(created_at) as event_date, COUNT(*) as event_count
    FROM events
    GROUP BY event_date
    ORDER BY event_date
"""
activity_df = spark.sql(activity_query)

# Number of rows in `events` per actor_login, busiest actors first.
contributor_activity_query = """
    SELECT actor_login, COUNT(*) as event_count
    FROM events
    GROUP BY actor_login
    ORDER BY event_count DESC
"""
contributor_activity_df = spark.sql(contributor_activity_query)

# Per repo_name: distinct push ids and distinct pull-request ids attached to
# that repo's events. Both joins are LEFT JOINs from `events`, so a repo with
# no matching push or pull-request rows still appears, with counts of 0.
repo_activity_query = """
    SELECT events.repo_name, 
           COUNT(DISTINCT push_events.id) AS push_count,
           COUNT(DISTINCT pull_requests.id) AS pull_request_count
    FROM events
    LEFT JOIN push_events ON events.id = push_events.event_id
    LEFT JOIN pull_requests ON events.id = pull_requests.event_id
    GROUP BY events.repo_name
    ORDER BY push_count DESC, pull_request_count DESC
"""
repo_activity_df = spark.sql(repo_activity_query)

# Per non-null language in `trending`: row count and summed stars, ordered by
# total stars descending.
# NOTE(review): repo_count is COUNT(*), i.e. rows, not distinct repositories.
# If `trending` can hold several rows for one repo, both columns are inflated
# - confirm against the table's schema.
language_popularity_query = """
    SELECT language, COUNT(*) as repo_count, SUM(stars) as total_stars
    FROM trending
    WHERE language IS NOT NULL
    GROUP BY language
    ORDER BY total_stars DESC
"""
language_popularity_df = spark.sql(language_popularity_query)

# Number of `trending` rows per repo_name (exposed as repo_count), most
# frequent first.
repo_popularity_query = """
    SELECT repo_name, COUNT(*) as repo_count
    FROM trending
    GROUP BY repo_name
    ORDER BY repo_count DESC
"""
repo_popularity_df = spark.sql(repo_popularity_query)

# Number of rows in `commits` per author_name, most prolific authors first.
commits_by_author_query = """
    SELECT author_name, COUNT(*) as commit_count
    FROM commits
    GROUP BY author_name
    ORDER BY commit_count DESC
"""
commits_by_author_df = spark.sql(commits_by_author_query)

# Persist every aggregate as a managed Delta table, replacing any previous
# contents of the table with the same name.
# NOTE(review): the source queries end in ORDER BY; a table read back later is
# not guaranteed to return rows in that order - confirm consumers sort on read.
result_tables = (
    (activity_df, "activity_count"),
    (contributor_activity_df, "contributor_activity"),
    (repo_activity_df, "repo_activity"),
    (language_popularity_df, "language_popularity"),
    (commits_by_author_df, "commits_by_author"),
    (repo_popularity_df, "repo_popularity"),
)
for result_df, table_name in result_tables:
    result_df.write.format("delta").mode("overwrite").saveAsTable(table_name)

StatementMeta(, , , Waiting, , Waiting)