# Event triggers

This notebook measures how prevalent each event trigger type is across all repositories.

In [None]:
# Load necessary libraries
import glob
import json
import os
import pandas as pd

In [None]:
# Resolve the input JSONL path: among the entries under data/processed that
# match code_search_*, take the one with the newest modification time and
# point at the aws_provider_repos.jsonl file inside its results/ subfolder.
PROCESSED_DATA_DIR = os.path.join(os.getcwd(), "..", "data", "processed")
latest_dir_pattern = os.path.join(PROCESSED_DATA_DIR, "code_search_*")
latest_dir = max(
    glob.glob(latest_dir_pattern),
    key=os.path.getmtime,
    default=None,  # no match -> None instead of a ValueError from max()
)

# Guard clause: without a matching directory there is nothing to analyse.
if not latest_dir:
    raise FileNotFoundError("No matching code_search_YYYYMMDD_hhmmss directory found.")

# The results/ folder is created on the fly if it does not exist yet.
RESULTS_DIR = os.path.join(latest_dir, "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

INPUT_FILENAME = os.path.join(RESULTS_DIR, "aws_provider_repos.jsonl")

In [None]:
# We load that JSONL file (one JSON document per line).
rows = []
with open(INPUT_FILENAME, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads
        if line.strip():
            rows.append(json.loads(line))


def _event_types(function_data):
    """Return the event labels declared by one function definition.

    Args:
        function_data: The value stored under a function name in the
            functions mapping. Expected to be a dict with an optional
            "events" entry, but any JSON value is tolerated.

    Returns:
        A list of labels:
        - ["N/A"] when ``function_data`` is not a dict, or when its "events"
          value is neither a list nor a dict;
        - for a list of events, every key of each dict entry, and ``str(entry)``
          for each non-dict entry (an empty list yields no labels);
        - for a dict of events, its keys.
    """
    # Anything that is not a mapping (None, a string, a list, a number, ...)
    # cannot carry an "events" entry.
    if not isinstance(function_data, dict):
        return ["N/A"]

    event_list = function_data.get("events", [])
    if isinstance(event_list, list):
        labels = []
        for event_entry in event_list:
            if isinstance(event_entry, dict):
                labels.extend(event_entry)  # iterating a dict yields its keys
            else:
                labels.append(str(event_entry))
        return labels
    if isinstance(event_list, dict):
        return list(event_list)
    return ["N/A"]


# And we extract the events from each function in each repo.
# Each row is [project_id, function_name, event].
events = []
for repo in rows:
    project_id = repo.get("repository")
    # serverless_config is a list of dicts, each with a 'config' key.
    # `or []` / `or {}` also cover keys that are present but null.
    for config_obj in repo.get("serverless_config") or []:
        if not isinstance(config_obj, dict):
            continue
        config = config_obj.get("config") or {}
        if not isinstance(config, dict):
            continue
        # NOTE(review): a non-empty top-level "events" value takes precedence
        # over "functions" here -- confirm this is intended.
        functions = config.get("events") or config.get("functions", {})
        if not isinstance(functions, dict):
            continue
        for function_name, function_data in functions.items():
            for event_type in _event_types(function_data):
                events.append([project_id, function_name, event_type])

# Create DataFrame
df = pd.DataFrame(events, columns=["project_id", "function_name", "event"])
df.head()

In [None]:
# Keep only the rows whose 'event' value is present (not NaN/None).
filtered_df = df[df["event"].notna()]
filtered_df

In [None]:
# Event types supported by the Serverless Framework; used as an allow-list
# when filtering the extracted events.
supported_events = [
    "httpApi",
    "http",
    "activemq",
    "alb",
    "alexaSkill",
    "alexaSmartHome",
    "cloudwatchEvent",
    "cloudwatchLog",
    "cloudFront",
    "cognitoUserPool",
    "eventBridge",
    "iot",
    "iotFleetProvisioning",
    "kafka",
    "stream",
    "msk",
    "rabbitmq",
    "s3",
    "schedule",
    "sns",
    "sqs",
    "websocket",
]
supported_events

In [None]:
# Restrict the rows to events that appear in the supported_events allow-list.
filtered_df = filtered_df.loc[filtered_df["event"].isin(supported_events)]
filtered_df

In [None]:
# Number of rows per event type. count() only tallies rows whose project_id
# is not missing.
grouped_df = (
    filtered_df.groupby("event")["project_id"]
    .count()
    .reset_index(name="count")
)

# Total number of event triggers across all event types
total_count = grouped_df["count"].sum()

# "occurrence": each event type's share of all event triggers, in percent
grouped_df["occurrence"] = (grouped_df["count"] / total_count) * 100

# Most frequent event types first, with a fresh 0..n-1 index
grouped_df = (
    grouped_df.sort_values(by="occurrence", ascending=False)
    .reset_index(drop=True)
)
grouped_df

In [None]:
# Number of event triggers per event type. count() only tallies rows whose
# project_id is not missing.
grouped_df = (
    filtered_df.groupby("event")["project_id"]
    .count()
    .reset_index(name="count")
)

# "occurrence": each event type's share of all event triggers, in percent
total_count = grouped_df["count"].sum()
grouped_df["occurrence"] = (grouped_df["count"] / total_count) * 100

# "repo_percentage": share of repositories that use each event type at least
# once. The denominator is the number of distinct project_id values present
# in filtered_df, i.e. only repositories that still have rows after filtering.
repo_counts = filtered_df.groupby("event")["project_id"].nunique()
total_repos = filtered_df["project_id"].nunique()
grouped_df["repo_percentage"] = grouped_df["event"].map(
    lambda event_name: (repo_counts[event_name] / total_repos) * 100
)

# Most frequent event types first, with a fresh 0..n-1 index
grouped_df = (
    grouped_df.sort_values(by="occurrence", ascending=False)
    .reset_index(drop=True)
)
grouped_df