# Event triggers

This notebook measures the prevalence of each event trigger type across all repos.

In [None]:
# Load necessary libraries
import glob
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

In [None]:
# Locate the most recently modified "code_search_*" entry under data/processed
# and derive the path of the JSONL file inside its "results" sub-directory.
PROCESSED_DATA_DIR = os.path.join(os.getcwd(), "..", "data", "processed")
latest_dir_pattern = os.path.join(PROCESSED_DATA_DIR, "code_search_*")
candidate_dirs = glob.glob(latest_dir_pattern)
# default=None keeps max() from raising when nothing matches the pattern
latest_dir = max(candidate_dirs, key=os.path.getmtime, default=None)

# Fail early when no code search output is available
if not latest_dir:
    raise FileNotFoundError("No matching code_search_YYYYMMDD_hhmmss directory found.")

# The results directory is created if it does not exist yet
RESULTS_DIR = os.path.join(latest_dir, "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

INPUT_FILENAME = os.path.join(RESULTS_DIR, "aws_provider_repos.jsonl")

In [None]:
# Load the JSONL file: one JSON document per line.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped so
# they do not abort the load, and the encoding is pinned to UTF-8 instead of
# relying on the platform default.
rows = []
with open(INPUT_FILENAME, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            rows.append(json.loads(line))


def _event_labels(function_data):
    """Return the event labels declared by a single function definition.

    Args:
        function_data: The value stored under a function name. Expected to be
            a dict with an optional "events" entry, but any other type
            (None, str, list, ...) is tolerated.

    Returns:
        A list of labels:
        - one label per key of every dict found in an "events" list,
        - ``str(item)`` for every non-dict item of an "events" list,
        - the keys of "events" when it is itself a dict,
        - ``["N/A"]`` when the definition is not a dict or "events" is
          neither a list nor a dict.
    """
    # Anything that is not a mapping cannot carry an "events" entry
    if not isinstance(function_data, dict):
        return ["N/A"]

    event_list = function_data.get("events", [])
    if isinstance(event_list, list):
        labels = []
        for event_entry in event_list:
            if isinstance(event_entry, dict):
                # Each key of the entry is recorded as an event type
                labels.extend(event_entry)
            else:
                labels.append(str(event_entry))
        return labels
    if isinstance(event_list, dict):
        return list(event_list)
    return ["N/A"]


# Extract one [project_id, function_name, event] row per declared event
events = []
for repo in rows:
    project_id = repo.get("repository")
    # serverless_config is a list of dicts, each with a 'config' key;
    # `or []` also covers a key that is present but null
    for config_obj in repo.get("serverless_config") or []:
        config = config_obj.get("config")
        if not isinstance(config, dict):
            # Missing, null or malformed config: nothing to extract
            continue
        # NOTE(review): a truthy top-level "events" entry takes precedence over
        # "functions" here — confirm this fallback order is intended.
        functions = config.get("events") or config.get("functions", {})
        if not isinstance(functions, dict):
            continue
        for function_name, function_data in functions.items():
            for event_type in _event_labels(function_data):
                events.append([project_id, function_name, event_type])

# Create DataFrame
df = pd.DataFrame(events, columns=["project_id", "function_name", "event"])
df.head()

In [None]:
# Keep only the rows whose 'event' value is not null
filtered_df = df.loc[df['event'].notna()]
filtered_df

In [None]:
# Event types supported by the Serverless Framework.
# Used as an allow-list when filtering the extracted events.
supported_events = [
    "httpApi",
    "http",
    "activemq",
    "alb",
    "alexaSkill",
    "alexaSmartHome",
    "cloudwatchEvent",
    "cloudwatchLog",
    "cloudFront",
    "cognitoUserPool",
    "eventBridge",
    "iot",
    "iotFleetProvisioning",
    "kafka",
    "stream",
    "msk",
    "rabbitmq",
    "s3",
    "schedule",
    "sns",
    "sqs",
    "websocket",
]
supported_events

In [None]:
# Restrict the rows to events that appear in the supported_events list
supported_mask = filtered_df['event'].isin(supported_events)
filtered_df = filtered_df.loc[supported_mask]
filtered_df

In [None]:
# Number of rows (with a non-null project_id) per event type
grouped_df = (
    filtered_df.groupby('event')['project_id']
    .count()
    .reset_index(name='count')
)

# Total number of event triggers over all event types
total_count = grouped_df['count'].sum()

# Share of each event type among all event triggers, as a percentage
grouped_df['occurrence'] = (grouped_df['count'] / total_count) * 100

# Most frequent event types first, renumbered from 0
grouped_df = (
    grouped_df.sort_values(by='occurrence', ascending=False)
    .reset_index(drop=True)
)
grouped_df

In [None]:
# Number of rows (with a non-null project_id) per event type
grouped_df = (
    filtered_df.groupby('event')['project_id']
    .count()
    .reset_index(name='count')
)

# Total number of event triggers over all event types
total_count = grouped_df['count'].sum()

# Share of each event type among all event triggers, as a percentage
grouped_df['occurrence'] = (grouped_df['count'] / total_count) * 100

# Percentage of repositories in which each event type appears.
# The denominator is the number of distinct project_id values in filtered_df,
# i.e. only repositories that still have rows after the filtering above.
repo_counts = filtered_df.groupby('event')['project_id'].nunique()
total_repos = filtered_df['project_id'].nunique()
grouped_df['repo_percentage'] = (grouped_df['event'].map(repo_counts) / total_repos) * 100

# Most frequent event types first, renumbered from 0
grouped_df = (
    grouped_df.sort_values(by='occurrence', ascending=False)
    .reset_index(drop=True)
)
grouped_df

In [None]:
# Count rows per (common_id, event) pair, where common_id is the project_id
# with a trailing "_<digits>" suffix removed.
# NOTE(review): this starts from the unfiltered df, so "N/A" and unsupported
# event labels are counted as well — confirm that is intended.
event_counts_df = (
    df[['project_id', 'event']]
    .assign(common_id=lambda frame: frame['project_id'].str.replace(r'_\d+$', '', regex=True))
    .groupby(['common_id', 'event'])
    .size()
    .reset_index(name='count')
)
event_counts_df

In [None]:
# Total count per common_id, summed over all of its event types
grouped_df = event_counts_df.groupby('common_id', as_index=False)['count'].sum()
grouped_df

In [None]:
# Descriptive statistics for the 'count' column of grouped_df
count_column = grouped_df['count']

# describe() summary plus the individual measures reported below
events_count_summary = count_column.describe()
max_value = count_column.max()
min_value = count_column.min()
range_value = max_value - min_value  # spread between the extremes
median_value = count_column.median()
variance_value = count_column.var()
std_deviation_value = count_column.std()

# Show floats with two decimals when pandas objects are printed
pd.set_option('display.float_format', '{:.2f}'.format)

# Report the results
print("Statistical analysis:")
print(events_count_summary)
print("\nMax.:", max_value)
for stat_label, stat_value in (
    ("Min.:", min_value),
    ("Range:", range_value),
    ("Median:", median_value),
    ("Variance:", variance_value),
    ("Standard deviation:", std_deviation_value),
):
    print(stat_label, stat_value)

In [None]:
# Values of the 'count' column of grouped_df, used for both figures
events_count_data = grouped_df['count']

# Quartiles and interquartile range (IQR)
Q1, Q3 = events_count_data.quantile(0.25), events_count_data.quantile(0.75)
IQR = Q3 - Q1

# Bounds placed 1.5 * IQR beyond the quartiles; the lower one is clamped at 0
iqr_lower_bound = max(0, Q1 - 1.5 * IQR)
iqr_upper_bound = Q3 + 1.5 * IQR

# Percentage of values lying between the two bounds (both ends inclusive)
percentage_in_iqr = events_count_data.between(iqr_lower_bound, iqr_upper_bound).mean() * 100

# Vertical marker lines shared by both figures: (x position, legend label)
bound_markers = (
    (iqr_lower_bound, 'IQR lower bound'),
    (iqr_upper_bound, 'IQR upper bound'),
)

# PDF plot: normalized histogram with the bounds and summary annotations
fig_pdf, ax_pdf = plt.subplots(figsize=(10, 6))
ax_pdf.hist(events_count_data, bins=20, density=True, alpha=0.7, color='blue', edgecolor='black')
ax_pdf.set_xlabel('Number of event triggers', fontsize=12)
ax_pdf.set_ylabel('Probability', fontsize=12)
ax_pdf.set_title('Probability density function (PDF) for the number of event triggers', fontsize=14)
for marker_x, marker_label in bound_markers:
    ax_pdf.axvline(marker_x, color='green', linestyle='--', label=marker_label)
ax_pdf.legend(loc='upper right', fontsize=10)
ax_pdf.tick_params(axis='both', labelsize=10)
ax_pdf.grid(axis='y', linestyle='--', alpha=0.7)
# Annotations in axes coordinates: (vertical position, text)
pdf_notes = (
    (0.7, f'Mean: {events_count_data.mean():.2f}'),
    (0.675, f'Standard deviation: {events_count_data.std():.2f}'),
    (0.650, f'Data in IQR: {percentage_in_iqr:.2f}%'),
)
for note_y, note_text in pdf_notes:
    ax_pdf.text(0.2, note_y, note_text, transform=ax_pdf.transAxes, fontsize=10, color='purple')
pdf_file = '../paper/figs/pdf_events_count_data.pdf'
fig_pdf.savefig(pdf_file, format='pdf', dpi=300)

# CDF plot: empirical cumulative distribution, drawn with larger fonts
fig_cdf, ax_cdf = plt.subplots(figsize=(10, 6))
sorted_data = np.sort(events_count_data)
# The i-th smallest value (1-based) is plotted at cumulative probability i / n
y = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
ax_cdf.plot(sorted_data, y, marker='.', linestyle='-', color='red', markersize=4, markeredgecolor='black')
ax_cdf.set_xlabel('Number of event triggers', fontsize=22)
ax_cdf.set_ylabel('Cumulative probability', fontsize=22)
ax_cdf.set_title('CDF of the number of event triggers per repo.', fontsize=22)
for marker_x, marker_label in bound_markers:
    ax_cdf.axvline(marker_x, color='blue', linestyle='--', label=marker_label)
ax_cdf.legend(loc='lower right', fontsize=16)
ax_cdf.tick_params(axis='both', labelsize=16)
ax_cdf.grid(axis='y', linestyle='--', alpha=0.7)
# Annotations in axes coordinates: (vertical position, text)
cdf_notes = (
    (0.6, f'25th percentile: {np.percentile(sorted_data, 25):.2f}'),
    (0.55, f'75th percentile: {np.percentile(sorted_data, 75):.2f}'),
    (0.5, f'Data in IQR: {percentage_in_iqr:.2f}%'),
)
for note_y, note_text in cdf_notes:
    ax_cdf.text(0.4, note_y, note_text, transform=ax_cdf.transAxes, fontsize=16, color='purple')
cdf_file = '../paper/figs/cdf_events_count_data.pdf'
fig_cdf.savefig(cdf_file, format='pdf', dpi=300)

# Display the plots
plt.show()