In [None]:
import os
import pandas as pd


In [None]:
def generate_merged_files(branch):
    """Merge every non-empty CSV of one branch into a single CSV file.

    Reads all ``*.csv`` files from ``[<branch>]2-datapoints`` (relative to the
    current working directory), prints a preview of each file that contains
    data rows, concatenates them in file-name order and writes the result to
    ``3-merged-results/merged_<branch in lower case>.csv``.

    Args:
        branch: Label used to build the input folder and output file names.
            The notebook calls this with "JOB" and "LIGHT".

    Raises:
        ValueError: If the folder holds no CSV file with at least one data row.
    """
    folder_path = "[{}]2-datapoints".format(branch)

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the merged file reproducible across machines and runs.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

    filtered_dataframes = {}
    for file in csv_files:
        try:
            df = pd.read_csv(os.path.join(folder_path, file))
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; treat it like any
            # other empty file instead of aborting the whole merge.
            continue

        if df.empty:
            # Header-only file: nothing to contribute.
            continue

        filtered_dataframes[file] = df

    for file_name, df in filtered_dataframes.items():
        print(f"\n=== {file_name} (Filtered) ===")
        print(df.head())

    if not filtered_dataframes:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(
            "No non-empty CSV files found in '{}'".format(folder_path)
        )

    output_folder = "3-merged-results"
    os.makedirs(output_folder, exist_ok=True)

    output_name = "merged_{}.csv".format(branch.lower())
    final_merged_data = pd.concat(list(filtered_dataframes.values()), ignore_index=True)
    final_merged_data.to_csv(os.path.join(output_folder, output_name), index=False)
    print("✅ {}".format(output_name))

In [None]:
# Merge the non-empty CSVs under "[JOB]2-datapoints" into 3-merged-results/merged_job.csv.
generate_merged_files("JOB")

In [None]:
# Merge the non-empty CSVs under "[LIGHT]2-datapoints" into 3-merged-results/merged_light.csv.
generate_merged_files("LIGHT")

# 2. Compute R and R0

In [None]:
import os
import pandas as pd

# Paths of the merged per-branch CSVs, using the folder and file names that
# generate_merged_files writes.
folder_path = "3-merged-results"
job_file = os.path.join(folder_path, "merged_job.csv")
light_file = os.path.join(folder_path, "merged_light.csv")

def conformal_score_lambda(cost, latency, scale=110):
    """Return the conformal score ``|cost / scale - latency|``.

    Works element-wise on anything that supports division, subtraction and
    ``abs`` (plain numbers, NumPy arrays, pandas Series).

    Args:
        cost: Cost value(s).
        latency: Latency value(s), compared against the rescaled cost.
        scale: Divisor applied to ``cost`` before the comparison. Defaults to
            110, the constant previously hard-coded here.
            NOTE(review): presumably a cost-to-runtime conversion factor —
            confirm where 110 comes from.

    Returns:
        The absolute difference, with the same shape/type as the inputs.
    """
    return abs(cost / scale - latency)

def extract_last_column(file_path):
    """Load a merged CSV and return its conformal scores as a list.

    Despite the name, this does not read the file's last column: it computes
    a ``ConformalScore`` column from the ``cost`` and ``runtime`` columns via
    ``conformal_score_lambda`` and returns that column's values.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one score per row, or an empty list when ``file_path``
        does not exist.
    """
    # Missing file: report "no scores" rather than raising.
    if not os.path.exists(file_path):
        return []

    scored = pd.read_csv(file_path).assign(
        ConformalScore=lambda frame: conformal_score_lambda(frame['cost'], frame['runtime'])
    )
    return scored['ConformalScore'].tolist()


# Conformal scores per branch; a list is empty when its merged CSV does not exist.
job_values = extract_last_column(job_file)
light_values = extract_last_column(light_file)


In [None]:
# Copy from e-computation => Alignment
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

try:
    from scipy.integrate import simpson
except ImportError:  # SciPy < 1.6 only ships the old `simps` name
    from scipy.integrate import simps as simpson
simps = simpson  # `simps` was removed in SciPy 1.14; keep the legacy name bound

def compute_tv(R, R0, threshold=100000, plot=True):
    """Estimate the total variation distance between two score samples.

    Both samples are first restricted to values strictly below ``threshold``.
    A Gaussian KDE (Scott bandwidth) is fitted to each, both densities are
    evaluated on 1000 evenly spaced points spanning the combined data range,
    and ``0.5 * integral |q(x) - p(x)| dx`` is computed with Simpson's rule.

    The integration interval stops at the smallest and largest retained
    value, so any KDE mass outside that interval is not counted.

    Args:
        R: Iterable of scores for the first distribution, p(x).
        R0: Iterable of scores for the second distribution, q(x).
        threshold: Values greater than or equal to this are discarded.
        plot: When True (default), draw both densities and the area between
            them. Pass False to compute the distance without plotting.

    Returns:
        The estimated total variation distance as a float.

    Raises:
        ValueError: If fewer than two values of ``R`` or ``R0`` remain after
            the threshold filter (a KDE cannot be fitted).
    """
    print("threshold: ", threshold)

    # 1. Drop outliers at or above the threshold
    R = np.asarray([val for val in R if val < threshold], dtype=float)
    R0 = np.asarray([val for val in R0 if val < threshold], dtype=float)
    for label, sample in (("R", R), ("R0", R0)):
        if sample.size < 2:
            # Fail early with context instead of an opaque error from gaussian_kde.
            raise ValueError(
                "{} has {} value(s) below threshold {}; at least 2 are needed "
                "to fit a KDE".format(label, sample.size, threshold)
            )

    # 2. Calculate two KDEs (probability density functions PDF)
    kde_p = gaussian_kde(R, bw_method='scott')   # p(x)
    kde_q = gaussian_kde(R0, bw_method='scott')  # q(x)

    # 3. Define the integration interval
    x_vals = np.linspace(min(R.min(), R0.min()), max(R.max(), R0.max()), 1000)
    pdf_p = kde_p(x_vals)  # Compute p(x)
    pdf_q = kde_q(x_vals)  # Compute q(x)

    # 4. Compute TVD
    integrand = np.abs(pdf_q - pdf_p)  # Compute |q(x) - p(x)|
    # `x` is passed by keyword so the call works across SciPy versions.
    TV_distance = 0.5 * simpson(integrand, x=x_vals)  # Integrate using Simpson's rule

    print(f"Total Variation Distance (TVD) = {TV_distance:.5f}")

    if plot:
        _, ax = plt.subplots(figsize=(8, 5))
        ax.plot(x_vals, pdf_p, label="p(x) (KDE of R - LIGHT)", color="blue", linewidth=2)
        ax.plot(x_vals, pdf_q, label="q(x) (KDE of R0 - JOB)", color="red", linewidth=2)
        ax.fill_between(x_vals, pdf_p, pdf_q, color="gray", alpha=0.3, label="|q(x) - p(x)|")
        ax.set_xlabel("X values")
        ax.set_ylabel("Density")
        ax.set_title("Kernel Density Estimation (KDE) and TV Distance")
        ax.legend()
        plt.show()

    return TV_distance

In [None]:
# TV distance with the LIGHT scores passed as R and the JOB scores as R0,
# using compute_tv's default threshold.
tv = compute_tv(light_values, job_values)

In [None]:
# Same comparison, keeping only scores below 10000.
tv = compute_tv(light_values, job_values, 10000)

In [None]:
# Same comparison, keeping only scores below 4000.
tv = compute_tv(light_values, job_values, 4000)