In [1]:
!pip3 install seaborn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import os
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
import difflib

In [3]:
# Websites left out of the analysis. Names are compared against the part of a
# report filename that precedes the first ".".
reports_to_exclude = [
    "amazon",
    "ubereats",
    "glassdoor",
    "ziprecruiter",
    "behance",
]

In [28]:
def get_files_in_directory(directory_path, exclude=None):
  """Return the names (text before the first ".") of the files in a directory.

  Args:
    directory_path: Directory to scan (not recursive; sub-directories are skipped).
    exclude: Optional iterable of names to leave out. Defaults to the
      notebook-level ``reports_to_exclude`` list when omitted.

  Returns:
    A list of strings, sorted by filename so the result does not depend on the
    arbitrary order in which ``os.listdir`` yields entries.
  """
  excluded_names = set(reports_to_exclude if exclude is None else exclude)

  file_list = []
  for filename in sorted(os.listdir(directory_path)):
    formatted_filename = filename.split(".")[0]

    # Dotfiles such as ".DS_Store" have an empty name before the first "." and
    # would otherwise be reported as an empty-string entry.
    if not formatted_filename or formatted_filename in excluded_names:
      continue

    if os.path.isfile(os.path.join(directory_path, filename)):
      file_list.append(formatted_filename)
  return file_list

In [29]:
# List the files in the original-reports folder (names in reports_to_exclude are
# skipped inside get_files_in_directory) and keep one row per remaining name.
path_to_original_reports = "./../dataset/lh-original-reports"
reports = get_files_in_directory(path_to_original_reports)
websites = pd.DataFrame(reports, columns=["website"])

# Summary of the website column (count / unique / top / freq).
websites.describe()

Unnamed: 0,website
count,15
unique,15
top,airbnb
freq,1


In [30]:
# Load the chunk files into one DataFrame, tagging every chunk with the website
# (the filename up to the first ".") it was read from.
path_to_original_chunks = "./../dataset/chunks"

original_data_list = []

# sorted(): os.listdir yields entries in arbitrary order; sorting keeps the row
# order of the resulting DataFrame stable between runs and machines.
for filename in sorted(os.listdir(path_to_original_chunks)):
    # Skip non-JSON entries and any file whose name contains an excluded website.
    is_excluded = any(exclude in filename for exclude in reports_to_exclude)
    if not filename.endswith(".json") or is_excluded:
        continue

    filepath = os.path.join(path_to_original_chunks, filename)
    cleaned_filename = filename.split(".")[0]

    # JSON is UTF-8 by specification; without an explicit encoding open() falls
    # back to the platform locale and can fail on non-ASCII content.
    with open(filepath, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # Each file is expected to hold a list of chunk objects (dicts).
    for obj in data:
        obj["website"] = cleaned_filename
        original_data_list.append(obj)

original_chunks_df = pd.DataFrame(original_data_list)
original_chunks_df.head()

Unnamed: 0,id,content,chunk_token_size,section,website
0,2a44c327-e28e-4100-b0c6-dd39a527b1b3,"\n<meta charset=""utf-8""></meta>\n<!-- Hi there...",3620.0,head,airbnb
1,82dce37b-d2f2-423c-a046-785485bd6ee0,"\n<div id=""site-skip-links"">\n<a class=""screen...",14606.0,body,airbnb
2,1fcae085-d44f-4b19-ac2b-8db1e9d954ed,"<path d=""M8 .25a7.77 7.77 0 0 1 7.75 7.78 7.75...",14566.0,body,airbnb
3,60c59c7b-eaf6-43b7-bb1e-71860b015b54,"<span class=""s15ewrxi atm_y_1x514to atm_12_q7p...",14606.0,body,airbnb
4,60c79bf1-9ef3-4724-a844-01fc6f30b8b1,"\n</div>\n<div class=""fb4nyux atm_da_cbdd7d di...",14463.0,body,airbnb


In [31]:
def is_valid_audit(audit):
  """Tell whether an audit entry should be kept.

  Args:
    audit: Mapping with a 'scoreDisplayMode' key; 'score' is only read for the
      'binary', 'metricSavings' and 'numeric' display modes.

  Returns:
    False when the display mode is 'notApplicable', 'informative', 'manual' or
    'error', or when a 'binary' / 'metricSavings' / 'numeric' audit has a score
    equal to 1. True in every other case.
  """
  mode = audit['scoreDisplayMode']

  # Display modes that are never kept, whatever the score is.
  if mode in ('notApplicable', 'informative', 'manual', 'error'):
    return False

  # Scored display modes: a score of exactly 1 is dropped.
  if mode in ('binary', 'metricSavings', 'numeric'):
    return not (audit['score'] == 1)

  return True

In [47]:
# Collect, from every original report, the audits that is_valid_audit keeps
# (i.e. the audits still to be resolved), tagged with their website.
original_audits_path = "./../dataset/lh-original-reports"

original_audits = []

# sorted(): os.listdir yields entries in arbitrary order; sorting keeps the row
# order of the resulting DataFrame stable between runs and machines.
for filename in sorted(os.listdir(original_audits_path)):
    # Skip non-JSON entries and any file whose name contains an excluded website.
    is_excluded = any(exclude in filename for exclude in reports_to_exclude)
    if not filename.endswith(".json") or is_excluded:
        continue

    filepath = os.path.join(original_audits_path, filename)
    cleaned_filename = filename.split(".")[0]

    # JSON is UTF-8 by specification; without an explicit encoding open() falls
    # back to the platform locale and can fail on non-ASCII content.
    with open(filepath, 'r', encoding="utf-8") as f:
        data = json.load(f)

    # data["audits"] maps audit id -> audit object; only the objects are needed.
    for audit in data["audits"].values():
        audit["website"] = cleaned_filename
        if is_valid_audit(audit):
            original_audits.append(audit)

original_audits_df = pd.DataFrame(original_audits)
original_audits_df.head()

Unnamed: 0,id,title,description,score,scoreDisplayMode,numericValue,numericUnit,displayValue,scoringOptions,website,metricSavings,details,guidanceLevel,warnings,explanation
0,first-contentful-paint,First Contentful Paint,First Contentful Paint marks the time at which...,0.0,numeric,14200.1936,millisecond,14.2 s,"{'p10': 1800, 'median': 3000}",airbnb,,,,,
1,largest-contentful-paint,Largest Contentful Paint,Largest Contentful Paint marks the time at whi...,0.0,numeric,18493.7327,millisecond,18.5 s,"{'p10': 2500, 'median': 4000}",airbnb,,,,,
2,speed-index,Speed Index,Speed Index shows how quickly the contents of ...,0.01,numeric,14200.1936,millisecond,14.2 s,"{'p10': 3387, 'median': 5800}",airbnb,,,,,
3,total-blocking-time,Total Blocking Time,Sum of all time periods between FCP and Time t...,0.83,numeric,261.5,millisecond,260 ms,"{'p10': 200, 'median': 600}",airbnb,,,,,
4,max-potential-fid,Max Potential First Input Delay,The maximum potential First Input Delay that y...,0.72,numeric,185.0,millisecond,190 ms,,airbnb,,,,,


In [33]:
# Number of audits per website. original_audits_df only contains audits for
# which is_valid_audit returned True when the frame was built, so no further
# filtering is applied here.
audits_per_website = original_audits_df.groupby("website").size().reset_index(name="count")
audits_per_website.head()

Unnamed: 0,website,count
0,airbnb,19
1,aliexpress,27
2,ebay,24
3,facebook,6
4,github,17


In [48]:
# How many distinct audit ids appear across all websites?
unique_audits = original_audits_df["id"].nunique()
print(f"Unique audits: {unique_audits}")

Unique audits: 38


In [49]:
# Load the lookup table that assigns each audit (audit_name) to a category.
audit_groupings_path = "./../dataset/audit_groupings.csv"
audit_groupings = pd.read_csv(audit_groupings_path)
audit_groupings.head()

Unnamed: 0,audit_name,category
0,first-contentful-paint,initial load performance
1,speed-index,initial load performance
2,total-blocking-time,interactivity performance
3,max-potential-fid,interactivity performance
4,interactive,interactivity performance


In [50]:
# Attach each audit's category to original_audits_df (audit_groupings.audit_name
# matches original_audits_df.id), then count audits per category.
grouping_columns = ["audit_name", "category"]

# The merge result is assigned back to original_audits_df, so re-running this
# cell would merge onto a frame that already has these columns, producing
# suffixed duplicates and breaking groupby("category"). Dropping any earlier
# copies first makes the cell safe to re-run.
original_audits_df = (
    original_audits_df
    .drop(columns=grouping_columns, errors="ignore")
    .merge(audit_groupings[grouping_columns], left_on="id", right_on="audit_name", how="left")
)

# Audits without a matching audit_name get a NaN category and are left out of
# the per-category counts by groupby.
audits_per_category = original_audits_df.groupby("category").size().reset_index(name="count")
audits_per_category.head()

Unnamed: 0,category,count
0,initial load performance,73
1,interactivity performance,44
2,network optimization,36
3,resource optimization,60
4,runtime performance,23


In [51]:
# Number of audits for every (website, category) pair; relies on the category
# column already being present on original_audits_df.
audits_per_website_per_category = original_audits_df.groupby(["website", "category"]).size().reset_index(name="count")
audits_per_website_per_category.head()

Unnamed: 0,website,category,count
0,airbnb,initial load performance,5
1,airbnb,interactivity performance,3
2,airbnb,network optimization,2
3,airbnb,resource optimization,6
4,airbnb,runtime performance,2


In [53]:
# Audit incidence ratio: for each audit id, the number of distinct websites on
# which the audit appears divided by the number of distinct websites in
# original_audits_df.
total_websites = original_audits_df["website"].nunique()
print(f"Total websites: {total_websites}")

audit_counts = (
    original_audits_df
    .groupby("id")["website"]
    .nunique()
    .reset_index(name="count")
)
audit_counts["incidence_ratio"] = audit_counts["count"] / total_websites

# Highest incidence first, with the category looked up from audit_groupings;
# the join key audit_name duplicates id and is dropped again afterwards.
sorted_audits = (
    audit_counts
    .sort_values(by="incidence_ratio", ascending=False)
    .merge(audit_groupings[["audit_name", "category"]], left_on="id", right_on="audit_name", how="left")
    .drop(columns=["audit_name"])
)

# Mean incidence ratio of the audits in each category, highest first.
average_incidence_ratio_per_category = (
    sorted_audits
    .groupby("category")["incidence_ratio"]
    .mean()
    .reset_index(name="average_incidence_ratio")
    .sort_values(by="average_incidence_ratio", ascending=False)
)

print(average_incidence_ratio_per_category.head(10))

Total websites: 15
                    category  average_incidence_ratio
1  interactivity performance                 0.733333
0   initial load performance                 0.608333
3      resource optimization                 0.444444
2       network optimization                 0.400000
4        runtime performance                 0.383333
5           visual stability                 0.288889


In [54]:
# Save the summary tables as CSV files under ./../results/summaries.
output_path = "./../results/summaries"

# exist_ok=True replaces the separate "exists?" check, which could race with
# another process creating the directory in between.
os.makedirs(output_path, exist_ok=True)

# Each table is written to "<name>.csv", without the DataFrame index.
summary_tables = {
    "average_incidence_ratio_per_category": average_incidence_ratio_per_category,
    "sorted_audits": sorted_audits,
    "audits_per_category": audits_per_category,
    "audits_per_website": audits_per_website,
    "audits_per_website_per_category": audits_per_website_per_category,
}
for table_name, table in summary_tables.items():
    table.to_csv(os.path.join(output_path, f"{table_name}.csv"), index=False)

In [62]:
# Compare how often each audit is flagged in the original reports with how often
# it is still flagged in the reports produced after one model's modifications.
# Alternative input location, kept for reference:
# path_to_modified_chunks = "../final-dataset/dataset/lh-reports-modified-single"
path_to_modified_chunks = "./../dataset/lh-modified-reports/llama3.3-70b"
output_path = "./../results/summaries/llama3.3-70b"
os.makedirs(output_path, exist_ok=True)

# how many original audits are there?
original_audits_count = len(original_audits_df)
print(f"Original audits count: {original_audits_count}")

# Read every modified report exactly once and keep only its "audits" mapping.
# The set of reports does not depend on the audit being counted, so parsing the
# files inside the per-audit loop would repeat the same work for every audit.
# Sub-directories (e.g. "summaries") and non-JSON entries are skipped.
modified_report_audits = []
for report_filename in sorted(os.listdir(path_to_modified_chunks)):
    filepath = os.path.join(path_to_modified_chunks, report_filename)
    if not (report_filename.endswith(".json") and os.path.isfile(filepath)):
        continue
    with open(filepath, 'r', encoding="utf-8") as f:
        modified_report_audits.append(json.load(f)["audits"])

file_count = len(modified_report_audits)
print(f"Total files: {file_count}")

if file_count == 0:
    raise FileNotFoundError(f"No modified reports (*.json) found in {path_to_modified_chunks}")
if file_count != total_websites:
    # The modified ratios below divide by total_websites, so a different number
    # of modified reports skews them; make that visible instead of silent.
    print(
        f"Warning: {file_count} modified reports found but total_websites is "
        f"{total_websites}; modified incidence ratios still divide by total_websites."
    )

# For each audit seen in the original reports, count the modified reports in
# which it is still present and still kept by is_valid_audit.
audit_data_list = []
for _, audit in sorted_audits.iterrows():
    audit_name = audit["id"]
    count_of_audit = sum(
        1
        for report_audits in modified_report_audits
        if audit_name in report_audits and is_valid_audit(report_audits[audit_name])
    )

    audit_data_list.append({
        "audit_name": audit_name,
        "original_incidence_ratio": audit["incidence_ratio"],
        "modified_incidence_ratio": count_of_audit / total_websites,
        "initial_count": audit["count"],
        "modified_count": count_of_audit,
        "total_websites": total_websites,
        "category": audit["category"]
    })

audit_data_mf_df = pd.DataFrame(audit_data_list)

# Per-audit relative change of the incidence ratio, in percent of the original
# ratio (negative = flagged on fewer pages after modification).
audit_data_mf_df["percentage_change"] = (
    (audit_data_mf_df["modified_incidence_ratio"] - audit_data_mf_df["original_incidence_ratio"])
    / audit_data_mf_df["original_incidence_ratio"]
) * 100
audit_data_mf_df = audit_data_mf_df.sort_values(by="percentage_change", ascending=True)

# Written once, without the index, like the other summary CSVs.
audit_data_mf_df.to_csv(os.path.join(output_path, "audits_comparison.csv"), index=False)

# Per category: mean original / modified incidence ratio and summed counts,
# computed in a single aggregation so every column belongs to the same row
# without relying on index alignment between separately built frames.
average_incidence_ratio_per_category_2 = audit_data_mf_df.groupby("category").agg(
    original_incidence_ratio=("original_incidence_ratio", "mean"),
    modified_incidence_ratio=("modified_incidence_ratio", "mean"),
    initial_count=("initial_count", "sum"),
    modified_count=("modified_count", "sum"),
).reset_index()

# Category-level change is computed from the category mean ratios.
average_incidence_ratio_per_category_2["percentage_change"] = (
    (average_incidence_ratio_per_category_2["modified_incidence_ratio"]
     - average_incidence_ratio_per_category_2["original_incidence_ratio"])
    / average_incidence_ratio_per_category_2["original_incidence_ratio"]
) * 100

# sort by percentage change (largest reduction first)
average_incidence_ratio_per_category_2 = average_incidence_ratio_per_category_2.sort_values(
    by="percentage_change", ascending=True
)

print(average_incidence_ratio_per_category_2.head(10))

Original audits count: 269
Total files: 15
                    category  original_incidence_ratio  \
4        runtime performance                  0.383333   
5           visual stability                  0.288889   
1  interactivity performance                  0.733333   
2       network optimization                  0.400000   
0   initial load performance                  0.608333   
3      resource optimization                  0.444444   

   modified_incidence_ratio  initial_count  modified_count  percentage_change  
4                  0.066667             23               4         -82.608696  
5                  0.144444             26              13         -50.000000  
1                  0.400000             44              24         -45.454545  
2                  0.266667             36              24         -33.333333  
0                  0.416667             73              50         -31.506849  
3                  0.325926             60              44         -26

In [23]:
# Per-audit relative change of the incidence ratio, in percent of the original
# ratio (negative = flagged on fewer pages after modification), sorted so the
# largest reductions come first.
audit_data_mf_df["percentage_change"] = (
    (audit_data_mf_df["modified_incidence_ratio"] - audit_data_mf_df["original_incidence_ratio"])
    / audit_data_mf_df["original_incidence_ratio"]
) * 100

audit_data_mf_df = audit_data_mf_df.sort_values(by="percentage_change", ascending=True)

# index=False: after sorting, the index is only the pre-sort row position and
# would otherwise be written as an unnamed extra column, unlike the other
# summary CSVs in this notebook.
audit_data_mf_df.to_csv(os.path.join(output_path, "audits_comparison.csv"), index=False)

In [3]:
models = [
    "gpt-4o-mini",
    "claude-3-7-sonnet-20250219",
    "claude-3-7-sonnet-20250219-non-reasoning",
    "deepseek-r1",
    "deepseek-v3-0324",
    "gpt-4.1",
    "o4-mini",
    "qwen2.5-32b-instruct",
    "llama3.3-70b",
]

# Read each model's per-audit comparison CSV and tag its rows with the model
# name. Frames are collected in a list and concatenated once, rather than
# growing a DataFrame with pd.concat inside the loop.
comparison_frames = []
for model in models:
    path_to_comparison = f"./../results/summaries/{model}/audits_comparison.csv"
    if os.path.exists(path_to_comparison):
        comparison_frames.append(pd.read_csv(path_to_comparison).assign(model=model))
    else:
        print(f"File {path_to_comparison} does not exist.")

# Without any input the groupby below would fail with an opaque KeyError.
if not comparison_frames:
    raise FileNotFoundError(
        "No audits_comparison.csv file was found for any model under ./../results/summaries"
    )

per_audit_comparisons = pd.concat(comparison_frames, ignore_index=True)

# One row per (category, model): the mean of the per-audit percentage_change
# values, rounded to 2 decimal places.
all_audits_df = (
    per_audit_comparisons
    .groupby(["category", "model"])
    .agg(percentage_change=("percentage_change", "mean"))
    .reset_index()
    [["category", "model", "percentage_change"]]
    .round({"percentage_change": 2})
)

# save the all_audits_df to a csv file
all_audits_df.to_csv(os.path.join("./../results/summaries", "all_audits_comparison.csv"), index=False)