In [6]:
import json
import numpy as np
from scipy.stats import pearsonr

# Size categories as (lower bound inclusive, upper bound exclusive) in bytes.
# NOTE(review): pages of 5242880 bytes or more (and negative sizes) match no
# category; they are left out of the per-category table but still take part
# in the correlation. Confirm whether a catch-all category is wanted.
size_categories = [
    (0, 500),             # 0 bytes to 500 bytes
    (500, 1024),          # 500 bytes to 1 KB
    (1024, 5120),         # 1 KB to 5 KB
    (5120, 25600),        # 5 KB to 25 KB
    (25600, 128000),      # 25 KB to 125 KB
    (128000, 512000),     # 125 KB to 500 KB
    (512000, 1048576),    # 500 KB to 1 MB
    (1048576, 5242880)    # 1 MB to 5 MB
]


def format_two_decimals(value):
    """Return *value* with two decimals, or "N/A" when it is None."""
    return "N/A" if value is None else f"{value:.2f}"


def group_load_times(entries, categories):
    """Sort page load times into size categories.

    Args:
        entries: iterable of mappings with 'size' and 'load_time' keys.
            Entries where either value is None are skipped entirely.
        categories: sequence of (low, high) tuples; a size belongs to the
            first category with low <= size < high.

    Returns:
        A tuple (load_times, sizes, load_times_all): the load times grouped
        per category, then every usable size and load time in input order
        (including pages that matched no category).

    Raises:
        KeyError: if an entry lacks the 'size' or 'load_time' key.
    """
    load_times = {category: [] for category in categories}
    sizes = []
    load_times_all = []
    for entry in entries:
        size = entry['size']
        load_time = entry['load_time']
        if size is None or load_time is None:
            continue
        sizes.append(size)
        load_times_all.append(load_time)
        for category in categories:
            if category[0] <= size < category[1]:
                load_times[category].append(load_time)
                break
    return load_times, sizes, load_times_all


def average_by_category(load_times):
    """Return the mean load time per category, or None for empty categories."""
    return {
        category: float(np.mean(times)) if times else None
        for category, times in load_times.items()
    }


def compute_correlation(sizes, load_times_all):
    """Return the Pearson correlation of sizes and load times, or None.

    None is returned when the coefficient is undefined: fewer than two
    samples (pearsonr would raise) or a constant input (pearsonr would
    yield NaN, which json.dump writes as the non-standard token NaN).
    """
    if len(sizes) < 2:
        return None
    x = np.asarray(sizes, dtype=float)
    y = np.asarray(load_times_all, dtype=float)
    if np.all(x == x[0]) or np.all(y == y[0]):
        return None
    statistic, _ = pearsonr(x, y)
    statistic = float(statistic)
    return None if np.isnan(statistic) else statistic


def format_category_line(category, avg_load_time, count):
    """Build the console summary line for one size category."""
    if avg_load_time is None:
        # Empty category: there is no average to format with ':.2f'.
        average_text = "N/A"
    else:
        average_text = f"{avg_load_time:.2f} seconds"
    return (
        f"{category[0]} bytes to {category[1]} bytes: "
        f"Average Load Time = {average_text}, Page Count = {count}"
    )


def build_html_report(categories, average_load_times, counts, correlation_coefficient):
    """Render the per-category table and the correlation as an HTML page."""
    parts = [
        "<html><head><title>URL Sizes and Load Times Analysis</title></head><body>",
        "<h1>URL Sizes and Load Times Analysis</h1>",
        "<h2>Average Load Times and Page Counts by Size Category</h2>",
        "<table border='1'><tr><th>Size Category</th><th>Average Load Time (s)</th><th>Page Count</th></tr>",
    ]
    for category in categories:
        size_range = f"{category[0]} bytes to {category[1]} bytes"
        avg_load_time_str = format_two_decimals(average_load_times[category])
        parts.append(
            f"<tr><td>{size_range}</td><td>{avg_load_time_str}</td><td>{counts[category]}</td></tr>"
        )
    parts.append("</table>")
    parts.append(
        f"<h2>Pearson Correlation Coefficient</h2><p>{format_two_decimals(correlation_coefficient)}</p>"
    )
    parts.append("</body></html>")
    return "".join(parts)


# Run as a script or notebook cell (__name__ is "__main__" in both cases), so
# every name assigned below is still a module-level global afterwards.
# Importing this code no longer triggers the file reads and writes.
if __name__ == "__main__":
    # load data from JSON file
    with open('pagesize_load_time.json', 'r', encoding='utf-8') as file:
        data = json.load(file)

    # put pages into size categories and record load times
    load_times, sizes, load_times_all = group_load_times(data, size_categories)
    counts = {category: len(times) for category, times in load_times.items()}

    # get average load times for each category
    average_load_times = average_by_category(load_times)

    # results print
    print("Average Load Times and Page Counts by Size Category:")
    for category in size_categories:
        print(format_category_line(category, average_load_times[category], counts[category]))

    # calculate correlation coefficient (None when it is undefined)
    correlation_coefficient = compute_correlation(sizes, load_times_all)
    print(f"\nPearson correlation coefficient between page size and load time: {format_two_decimals(correlation_coefficient)}")

    # save results to JSON file
    analysis_results = {
        'average_load_times': {f"{category[0]}-{category[1]}": avg_load_time for category, avg_load_time in average_load_times.items()},
        'page_counts': {f"{category[0]}-{category[1]}": count for category, count in counts.items()},
        'correlation_coefficient': correlation_coefficient
    }

    with open('pagesize_load_correlation_analysis.json', 'w', encoding='utf-8') as json_file:
        json.dump(analysis_results, json_file, indent=4)

    print("Analysis results saved to pagesize_load_correlation_analysis.json")

    # save results to HTML file
    html_content = build_html_report(size_categories, average_load_times, counts, correlation_coefficient)

    with open('pagesize_load_correlation_analysis.html', 'w', encoding='utf-8') as html_file:
        html_file.write(html_content)

    print("Analysis results saved to pagesize_load_correlation_analysis.html")


Average Load Times and Page Counts by Size Category:
0 bytes to 500 bytes: Average Load Time = 0.10 seconds, Page Count = 6
500 bytes to 1024 bytes: Average Load Time = 0.02 seconds, Page Count = 3
1024 bytes to 5120 bytes: Average Load Time = 0.31 seconds, Page Count = 6
5120 bytes to 25600 bytes: Average Load Time = 0.84 seconds, Page Count = 1
25600 bytes to 128000 bytes: Average Load Time = 1.02 seconds, Page Count = 255
128000 bytes to 512000 bytes: Average Load Time = 1.06 seconds, Page Count = 247
512000 bytes to 1048576 bytes: Average Load Time = 1.23 seconds, Page Count = 10
1048576 bytes to 5242880 bytes: Average Load Time = 2.33 seconds, Page Count = 7

Pearson correlation coefficient between page size and load time: 0.23
Analysis results saved to pagesize_load_correlation_analysis.json
Analysis results saved to pagesize_load_correlation_analysis.html
