# Analysis of SQR Results Across Tau Levels with TikZ Plotting
Analysis of `results_sampled_10k_taus` directory with visualization using TikZ for LaTeX embedding. This notebook aggregates results across multiple quantile levels (τ) and generates publication-quality plots with dual y-axes.

In [16]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configuration
RESULTS_DIR = 'results_sampled_10k_taus/'
OUTPUT_DIR = 'tikz_plots/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Results directory: {RESULTS_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

Results directory: results_sampled_10k_taus/
Output directory: tikz_plots/


In [None]:
data_list = [
    "1027_ESL",
    "1028_SWD",
    "1029_LEV",
    "1030_ERA",
    "1089_USCrime",
    "1096_FacultySalaries",
    "1191_BNG_pbc",
    "1193_BNG_lowbwt",
    "1196_BNG_pharynx",
    "1199_BNG_echoMonths",
    "1201_BNG_breastTumor",
    "192_vineyard",
    "195_auto_price",
    "197_cpu_act",
    "201_pol",
    "207_autoPrice",
    "210_cloud",
    "215_2dplanes",
    "218_house_8L",
    "225_puma8NH",
    "227_cpu_small",
    "228_elusage",
    "229_pwLinear",
    "230_machine_cpu",
    "294_satellite_image",
    "344_mv",
    "4544_GeographicalOriginalofMusic",
    "485_analcatdata_vehicle",
    "503_wind",
    "505_tecator",
    "519_vinnie",
    "522_pm10",
    "523_analcatdata_neavote",
    "527_analcatdata_election2000",
    "529_pollen",
    "537_houses",
    "542_pollution",
    "547_no2",
    "556_analcatdata_apnea2",
    "557_analcatdata_apnea1",
    "560_bodyfat",
    "561_cpu",
    "562_cpu_small",
    "564_fried",
    "573_cpu_act",
    "574_house_16H",
    "579_fri_c0_250_5",
    "581_fri_c3_500_25",
    "582_fri_c1_500_25",
    "583_fri_c1_1000_50",
    "584_fri_c4_500_25",
    "586_fri_c3_1000_25",
    "588_fri_c4_1000_100",
    "589_fri_c2_1000_25",
    "590_fri_c0_1000_50",
    "591_fri_c1_100_10",
    "592_fri_c4_1000_25",
    "593_fri_c1_1000_10",
    "594_fri_c2_100_5",
    "595_fri_c0_1000_10",
    "596_fri_c2_250_5",
    "597_fri_c2_500_5",
    "598_fri_c0_1000_25",
    "599_fri_c2_1000_5",
    "601_fri_c1_250_5",
    "602_fri_c3_250_10",
    "603_fri_c0_250_50",
    "604_fri_c4_500_10",
    "605_fri_c2_250_25",
    "606_fri_c2_1000_10",
    "607_fri_c4_1000_50",
    "608_fri_c3_1000_10",
    "609_fri_c0_1000_5",
    "611_fri_c3_100_5",
    "612_fri_c1_1000_5",
    "613_fri_c3_250_5",
    "615_fri_c4_250_10",
    "616_fri_c4_500_50",
    "617_fri_c3_500_5",
    "618_fri_c3_1000_50",
    "620_fri_c1_1000_25",
    "621_fri_c0_100_10",
    "622_fri_c2_1000_50",
    "623_fri_c4_1000_10",
    "624_fri_c0_100_5",
    "626_fri_c2_500_50",
    "627_fri_c2_500_10",
    "628_fri_c3_1000_5",
    "631_fri_c1_500_5",
    "633_fri_c0_500_25",
    "634_fri_c2_100_10",
    "635_fri_c0_250_10",
    "637_fri_c1_500_50",
    "641_fri_c1_500_10",
    "643_fri_c2_500_25",
    "644_fri_c4_250_25",
    "645_fri_c3_500_50",
    "646_fri_c3_500_10",
    "647_fri_c1_250_10",
    "648_fri_c1_250_50",
    "649_fri_c0_500_5",
    "650_fri_c0_500_50",
    "651_fri_c0_100_25",
    "653_fri_c0_250_25",
    "654_fri_c0_500_10",
    "656_fri_c1_100_5",
    "657_fri_c2_250_10",
    "658_fri_c3_250_25",
    "659_sleuth_ex1714",
    "663_rabe_266",
    "665_sleuth_case2002",
    "666_rmftsa_ladata",
    "678_visualizing_environmental",
    "687_sleuth_ex1605",
    "690_visualizing_galaxy",
    "695_chatfield_4",
    "706_sleuth_case1202",
    "712_chscase_geyser1",
    "banana",
    "titanic"
]
TAUS = [0.5, 0.6, 0.7, 0.8, 0.9]

In [46]:
# Check whether results are complete
for tau in TAUS:
    for ds_name in data_list:
        # check whether file exists, print if that's not the case
        file_path = Path(RESULTS_DIR) / f"results{tau}_{ds_name}_tau{tau}.json"
        if not file_path.exists():
            print(f"File not found: {file_path} for tau={tau} and dataset={ds_name}")

File not found: results_sampled_10k_taus/results0.1_1027_ESL_tau0.1.json for tau=0.1 and dataset=1027_ESL
File not found: results_sampled_10k_taus/results0.1_1028_SWD_tau0.1.json for tau=0.1 and dataset=1028_SWD
File not found: results_sampled_10k_taus/results0.1_1029_LEV_tau0.1.json for tau=0.1 and dataset=1029_LEV
File not found: results_sampled_10k_taus/results0.1_1030_ERA_tau0.1.json for tau=0.1 and dataset=1030_ERA
File not found: results_sampled_10k_taus/results0.1_1089_USCrime_tau0.1.json for tau=0.1 and dataset=1089_USCrime
File not found: results_sampled_10k_taus/results0.1_1096_FacultySalaries_tau0.1.json for tau=0.1 and dataset=1096_FacultySalaries
File not found: results_sampled_10k_taus/results0.1_1191_BNG_pbc_tau0.1.json for tau=0.1 and dataset=1191_BNG_pbc
File not found: results_sampled_10k_taus/results0.1_1193_BNG_lowbwt_tau0.1.json for tau=0.1 and dataset=1193_BNG_lowbwt
File not found: results_sampled_10k_taus/results0.1_1196_BNG_pharynx_tau0.1.json for tau=0.1 and d

## Section 1: Load and Aggregate Results by Tau
Load JSON files from results_sampled_10k_taus directory and create a long-format DataFrame with tau as a separate dimension.

In [39]:
def load_tau_jsons_to_long_df(directory):
    """Load JSON files from tau results directory into long-format DataFrame."""
    records = []
    
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, 'r') as f:
                    data = json.load(f)
                
                for model, model_data in data.items():
                    tau = model_data.get('tau')
                    for metric, metric_data in model_data.items():
                        if metric == 'tau' or metric == 'sizes':
                            continue
                        for dataset, values in metric_data.items():
                            for run_idx, value in enumerate(values):
                                records.append({
                                    'filename': filename,
                                    'model': model,
                                    'tau': tau,
                                    'metric': metric,
                                    'dataset': dataset,
                                    'run': run_idx,
                                    'value': value
                                })
            except Exception as e:
                print(f"Error loading {filename}: {e}")
    
    df = pd.DataFrame.from_records(records)
    return df

# Load results
df_long = load_tau_jsons_to_long_df(RESULTS_DIR)
print(f"Loaded {len(df_long)} records from {RESULTS_DIR}")
print(f"Unique tau values: {sorted(df_long['tau'].unique())}")
print(f"Unique models: {sorted(df_long['model'].unique())}")
print(f"Unique metrics: {sorted(df_long['metric'].unique())}")
print(f"\\nDataFrame shape: {df_long.shape}")
df_long.head(10)

Error loading results0.6_503_wind_tau0.6.json: Expecting value: line 1 column 1 (char 0)
Loaded 56905 records from results_sampled_10k_taus/
Unique tau values: [np.float64(0.5), np.float64(0.6), np.float64(0.7), np.float64(0.8), np.float64(0.9)]
Unique models: ['DecisionTree', 'LightGBM', 'LinearQuantile', 'SQR']
Unique metrics: ['complexity', 'coverage', 'losses', 'time_all', 'time_fit']
\nDataFrame shape: (56905, 7)


Unnamed: 0,filename,model,tau,metric,dataset,run,value
0,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,0,0.035077
1,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,1,0.044006
2,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,2,0.034439
3,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,3,0.034149
4,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,4,0.038015
5,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,0,0.367347
6,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,1,0.112245
7,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,2,0.397959
8,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,3,0.283505
9,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,4,0.273196


## Section 2: Calculate Summary Statistics
Group data by tau, model, and metric to compute mean, median, and standard deviation.

In [40]:
# Calculate summary statistics grouped by tau, model, and metric
summary_stats = df_long.groupby(['tau', 'model', 'metric'])['value'].agg([
    ('mean', 'mean'),
    ('std', 'std'),
    ('median', 'median'),
    ('min', 'min'),
    ('max', 'max'),
    ('count', 'count')
]).reset_index()

print("Summary statistics shape:", summary_stats.shape)
print("\\nSample statistics:")
print(summary_stats.head(15))

# Pivot to get metrics as columns
stats_pivot = summary_stats.pivot_table(
    index=['tau', 'model'],
    columns='metric',
    values='mean'
).reset_index()

print("\\nPivoted statistics (means):")
print(stats_pivot)

Summary statistics shape: (95, 9)
\nSample statistics:
    tau           model      metric         mean          std     median  \
0   0.5    DecisionTree  complexity  1526.940000  9135.109392  79.000000   
1   0.5    DecisionTree    coverage     0.083617     0.099913   0.046872   
2   0.5    DecisionTree      losses     0.038717     0.017254   0.038650   
3   0.5    DecisionTree    time_all     7.856714    47.333635   0.096175   
4   0.5    DecisionTree    time_fit     0.526921     3.229699   0.014585   
5   0.5        LightGBM    coverage     0.058731     0.062022   0.040000   
6   0.5        LightGBM      losses     0.042091     0.021733   0.036913   
7   0.5        LightGBM    time_all     0.603711     1.052114   0.255252   
8   0.5        LightGBM    time_fit     0.117287     0.215799   0.053976   
9   0.5  LinearQuantile  complexity    19.891667    20.600256  11.000000   
10  0.5  LinearQuantile    coverage     0.059011     0.073643   0.040000   
11  0.5  LinearQuantile      loss

## Section 3: Prepare Data for Plotting
Organize metrics into groups for dual-axis visualization.

In [41]:
# Create tau-ordered data for each model and metric
models = sorted(df_long['model'].unique())
metrics = sorted(df_long['metric'].unique())
taus = sorted(df_long['tau'].unique())

print(f"Models: {models}")
print(f"Metrics: {metrics}")
print(f"Tau values: {taus}")

# Create a dictionary of DataFrames organized by metric
metric_data = {}
for metric in metrics:
    metric_subset = summary_stats[summary_stats['metric'] == metric].copy()
    metric_subset = metric_subset.sort_values('tau')
    metric_data[metric] = metric_subset

print("\\nExample: Losses across tau levels")
print(metric_data['losses'][['tau', 'model', 'mean', 'std']])

Models: ['DecisionTree', 'LightGBM', 'LinearQuantile', 'SQR']
Metrics: ['complexity', 'coverage', 'losses', 'time_all', 'time_fit']
Tau values: [np.float64(0.5), np.float64(0.6), np.float64(0.7), np.float64(0.8), np.float64(0.9)]
\nExample: Losses across tau levels
    tau           model      mean       std
2   0.5    DecisionTree  0.038717  0.017254
6   0.5        LightGBM  0.042091  0.021733
11  0.5  LinearQuantile  0.057968  0.029411
16  0.5             SQR  0.032786  0.023446
21  0.6    DecisionTree  0.038134  0.017316
25  0.6        LightGBM  0.043783  0.029528
30  0.6  LinearQuantile  0.058978  0.032384
35  0.6             SQR  0.031723  0.022169
54  0.7             SQR  0.029593  0.022415
49  0.7  LinearQuantile  0.058438  0.035775
44  0.7        LightGBM  0.041879  0.027761
40  0.7    DecisionTree  0.035276  0.016283
59  0.8    DecisionTree  0.029495  0.013844
63  0.8        LightGBM  0.036403  0.022113
68  0.8  LinearQuantile  0.056426  0.038815
73  0.8             SQR  0.027

## Section 4: Generate TikZ Plots with Dual Axes
Create TikZ code with dual y-axes (left for one metric, right for another).

In [20]:
def generate_tikz_dual_axis_plot(metric_data, left_metrics, right_metrics, models, taus, title="", y_label_left="", y_label_right=""):
    """Generate TikZ code for a dual-axis plot with tau on x-axis."""
    
    colors = {'SQR': 'red', 'LightGBM': 'blue', 'DecisionTree': 'green', 'LinearQuantile': 'purple'}
    line_styles = {'SQR': 'solid', 'LightGBM': 'dashed', 'DecisionTree': 'dotted', 'LinearQuantile': 'dashdotted'}
    mark_styles = {'SQR': '*', 'LightGBM': 'o', 'DecisionTree': 'square', 'LinearQuantile': 'diamond'}
    
    tikz_code = r"\begin{tikzpicture}" + "\\n"
    tikz_code += r"\begin{axis}[" + "\\n"
    tikz_code += f"    title={{{title}}},\\n"
    tikz_code += f"    xlabel={{$\\\\tau$}},\\n"
    tikz_code += f"    ylabel={{{y_label_left}}},\\n"
    tikz_code += f"    width=0.9\\\\textwidth,\\n"
    tikz_code += f"    height=6cm,\\n"
    tikz_code += f"    legend pos=outer north east,\\n"
    tikz_code += f"    xtick={{{','.join(map(str, taus))}}},\\n"
    tikz_code += f"]\\n"
    
    # Plot left axis metrics
    for metric in left_metrics:
        if metric not in metric_data:
            continue
        metric_df = metric_data[metric]
        
        for model in models:
            model_subset = metric_df[metric_df['model'] == model].sort_values('tau')
            if len(model_subset) == 0:
                continue
            
            x_vals = model_subset['tau'].tolist()
            y_vals = model_subset['mean'].tolist()
            coords = ' '.join([f"({x},{y:.4f})" for x, y in zip(x_vals, y_vals)])
            
            tikz_code += f"\\\\addplot[color={colors.get(model, 'black')}, {line_styles.get(model, 'solid')}, mark={mark_styles.get(model, 'o')}, mark size=2pt] coordinates {{{coords}}};\\n"
            tikz_code += f"\\\\addlegendentry{{{model} - {metric}}}\\n"
    
    tikz_code += r"\end{axis}" + "\\n"
    tikz_code += r"\end{tikzpicture}" + "\\n"
    
    return tikz_code

test_tikz = generate_tikz_dual_axis_plot(
    metric_data,
    left_metrics=['losses'],
    right_metrics=[],
    models=models,
    taus=taus,
    title="Model Performance vs Tau",
    y_label_left="Normalized Pinball Loss"
)

print("Generated TikZ code (first 400 chars):")
print(test_tikz[:400])

Generated TikZ code (first 400 chars):
\begin{tikzpicture}\n\begin{axis}[\n    title={Model Performance vs Tau},\n    xlabel={$\\tau$},\n    ylabel={Normalized Pinball Loss},\n    width=0.9\\textwidth,\n    height=6cm,\n    legend pos=outer north east,\n    xtick={0.5,0.6,0.7,0.8,0.9},\n]\n\\addplot[color=green, dotted, mark=square, mark size=2pt] coordinates {(0.5,0.0387) (0.6,0.0381) (0.7,0.0353) (0.8,0.0295) (0.9,0.0196)};\n\\addleg


## Section 5: Export TikZ Figures for LaTeX
Save TikZ files ready for inclusion in LaTeX documents.

In [21]:
def save_tikz_standalone(tikz_code, filename, directory='tikz_plots/'):
    """Save TikZ code as a file for LaTeX inclusion."""
    os.makedirs(directory, exist_ok=True)
    
    tikz_path = os.path.join(directory, f"{filename}.tikz")
    with open(tikz_path, 'w') as f:
        f.write(tikz_code)
    print(f"Saved TikZ: {tikz_path}")

# Generate and save plots
plot_configs = [
    {'name': 'losses', 'left_metrics': ['losses'], 'right_metrics': [], 'title': 'Loss vs Tau'},
    {'name': 'coverage', 'left_metrics': ['coverage'], 'right_metrics': [], 'title': 'Coverage Error vs Tau'},
    {'name': 'timing', 'left_metrics': ['time_all'], 'right_metrics': [], 'title': 'Computation Time vs Tau'}
]

for config in plot_configs:
    tikz_code = generate_tikz_dual_axis_plot(
        metric_data,
        left_metrics=config['left_metrics'],
        right_metrics=config['right_metrics'],
        models=models,
        taus=taus,
        title=config['title'],
        y_label_left=config['name']
    )
    save_tikz_standalone(tikz_code, config['name'])

print(f"\\n✓ Generated TikZ plots in {OUTPUT_DIR}/")

Saved TikZ: tikz_plots/losses.tikz
Saved TikZ: tikz_plots/coverage.tikz
Saved TikZ: tikz_plots/timing.tikz
\n✓ Generated TikZ plots in tikz_plots//


## Summary: Key Results
Export summary statistics and model rankings.

In [22]:
# Export summary statistics to CSV
summary_csv_path = os.path.join(OUTPUT_DIR, 'summary_statistics.csv')
summary_stats.to_csv(summary_csv_path, index=False)
print(f"Exported: {summary_csv_path}")

# Best models by metric and tau
print("\\n" + "="*60)
print("BEST MODELS BY METRIC AND TAU")
print("="*60)

for metric in ['losses', 'coverage']:
    print(f"\\n{metric.upper()}:")
    metric_data_full = summary_stats[summary_stats['metric'] == metric]
    
    for tau in taus:
        tau_metric = metric_data_full[metric_data_full['tau'] == tau]
        best_idx = tau_metric['mean'].idxmin()
        best_model = tau_metric.loc[best_idx, 'model']
        best_value = tau_metric.loc[best_idx, 'mean']
        print(f"  τ={tau}: {best_model:15s} (loss={best_value:.5f})")

print(f"\\n✓ Analysis complete. Results in {OUTPUT_DIR}/")

Exported: tikz_plots/summary_statistics.csv
BEST MODELS BY METRIC AND TAU
\nLOSSES:
  τ=0.5: SQR             (loss=0.03279)
  τ=0.6: SQR             (loss=0.03172)
  τ=0.7: SQR             (loss=0.02959)
  τ=0.8: SQR             (loss=0.02721)
  τ=0.9: DecisionTree    (loss=0.01962)
\nCOVERAGE:
  τ=0.5: LightGBM        (loss=0.05873)
  τ=0.6: LightGBM        (loss=0.06200)
  τ=0.7: LightGBM        (loss=0.05998)
  τ=0.8: LightGBM        (loss=0.05629)
  τ=0.9: LightGBM        (loss=0.03827)
\n✓ Analysis complete. Results in tikz_plots//


In [23]:
def generate_pgfplots_dual_axis(metric_data, left_metrics, right_metrics, models, taus,
                                  title="", y_label_left="", y_label_right="",
                                  include_std=False, grid=True):
    """
    Generate more advanced PGFPlots code with better styling options.
    """
    
    colors = {
        'SQR': 'red',
        'LightGBM': 'blue',
        'DecisionTree': 'green',
        'LinearQuantile': 'purple'
    }
    
    styles = {
        'SQR': 'solid',
        'LightGBM': 'dashed',
        'DecisionTree': 'dotted',
        'LinearQuantile': 'dashdotted'
    }
    
    code = r"\begin{tikzpicture}[scale=1.0]" + "\n"
    code += r"\begin{groupplot}[" + "\n"
    code += f"    group style={{group size=1 by 1}},\n"
    code += f"    title={{{title}}},\n"
    code += f"]\n"
    code += r"\nextgroupplot[" + "\n"
    code += f"    xlabel={{$\\tau$}},\n"
    code += f"    ylabel={{{y_label_left}}},\n"
    code += f"    width=12cm,\n"
    code += f"    height=6cm,\n"
    code += f"    legend pos=outer north east,\n"
    code += f"    legend columns=2,\n"
    code += f"    xtick={{{','.join(map(str, taus))}}},\n"
    
    if grid:
        code += f"    grid=major,\n"
    
    code += f"]\n"
    
    # Left axis plots
    for metric in left_metrics:
        if metric not in metric_data:
            continue
        metric_df = metric_data[metric]
        
        for model in models:
            model_subset = metric_df[metric_df['model'] == model].sort_values('tau')
            
            if len(model_subset) == 0:
                continue
            
            x_vals = model_subset['tau'].tolist()
            y_vals = model_subset['mean'].tolist()
            
            coords = ', '.join([f"({x:.1f},{y:.4f})" for x, y in zip(x_vals, y_vals)])
            
            code += f"\\addplot[color={colors.get(model, 'black')}, "
            code += f"{styles.get(model, 'solid')}, thick, "
            code += f"mark=*, mark size=3pt] coordinates {{{coords}}};\n"
            code += f"\\addlegendentry{{{model}}}\n"
    
    code += r"\end{groupplot}" + "\n"
    code += r"\end{tikzpicture}" + "\n"
    
    return code


# Generate advanced plots
advanced_config = {
    'name': 'advanced_performance',
    'left_metrics': ['losses'],
    'right_metrics': [],
    'title': 'Normalized Pinball Loss vs Quantile Level (τ)',
    'y_label_left': 'Normalized Loss',
    'y_label_right': ''
}

advanced_tikz = generate_pgfplots_dual_axis(
    metric_data,
    left_metrics=advanced_config['left_metrics'],
    right_metrics=advanced_config['right_metrics'],
    models=models,
    taus=taus,
    title=advanced_config['title'],
    y_label_left=advanced_config['y_label_left'],
    grid=True
)

save_tikz_standalone(advanced_tikz, 'pgfplots_losses')

print("✓ Generated advanced PGFPlots visualization")
print(f"  File: {OUTPUT_DIR}/pgfplots_losses.tikz")

# Create a comparison table showing tau effect on each model
print("\n" + "=" * 80)
print("LOSS TREND ANALYSIS ACROSS TAU")
print("=" * 80)

losses_pivot = summary_stats[summary_stats['metric'] == 'losses'].pivot_table(
    index='model',
    columns='tau',
    values='mean'
)

print("\nNormalized Pinball Loss by Model and Tau:")
print(losses_pivot.round(4))

# Calculate trend (improvement/degradation)
if len(taus) > 1:
    trend = pd.DataFrame()
    trend['tau_min'] = losses_pivot[taus[0]]
    trend['tau_max'] = losses_pivot[taus[-1]]
    trend['delta'] = trend['tau_max'] - trend['tau_min']
    trend['percent_change'] = (trend['delta'] / trend['tau_min'] * 100).round(2)
    
    print("\nTrend Analysis (from τ=" + str(taus[0]) + " to τ=" + str(taus[-1]) + "):")
    print(trend)


Saved TikZ: tikz_plots/pgfplots_losses.tikz
✓ Generated advanced PGFPlots visualization
  File: tikz_plots//pgfplots_losses.tikz

LOSS TREND ANALYSIS ACROSS TAU

Normalized Pinball Loss by Model and Tau:
tau                0.5     0.6     0.7     0.8     0.9
model                                                 
DecisionTree    0.0387  0.0381  0.0353  0.0295  0.0196
LightGBM        0.0421  0.0438  0.0419  0.0364  0.0244
LinearQuantile  0.0580  0.0590  0.0584  0.0564  0.0520
SQR             0.0328  0.0317  0.0296  0.0272     inf

Trend Analysis (from τ=0.5 to τ=0.9):
                 tau_min   tau_max     delta  percent_change
model                                                       
DecisionTree    0.038717  0.019622 -0.019095          -49.32
LightGBM        0.042091  0.024369 -0.017721          -42.10
LinearQuantile  0.057968  0.052025 -0.005943          -10.25
SQR             0.032786       inf       inf             inf


## Advanced: Custom Plot Generation
Generate custom TikZ plots with specific metric combinations and styling options.


In [24]:
# Generate summary statistics table
print("=" * 80)
print("SUMMARY STATISTICS ACROSS TAU LEVELS")
print("=" * 80)

for tau in taus:
    print(f"\n--- TAU = {tau} ---")
    tau_data = summary_stats[summary_stats['tau'] == tau]
    
    for metric in ['losses', 'coverage', 'time_all']:
        if metric not in tau_data['metric'].values:
            continue
        
        metric_data_tau = tau_data[tau_data['metric'] == metric][['model', 'mean', 'std']]
        if len(metric_data_tau) > 0:
            print(f"\n{metric.upper()}:")
            print(metric_data_tau.to_string(index=False))

# Find best and worst performing models
print("\n" + "=" * 80)
print("BEST MODEL BY METRIC AND TAU")
print("=" * 80)

for metric in ['losses', 'coverage']:
    print(f"\n{metric.upper()}:")
    metric_summary = summary_stats[summary_stats['metric'] == metric]
    
    for tau in taus:
        tau_metric = metric_summary[metric_summary['tau'] == tau]
        best_idx = tau_metric['mean'].idxmin()
        best_model = tau_metric.loc[best_idx, 'model']
        best_value = tau_metric.loc[best_idx, 'mean']
        print(f"  τ={tau}: {best_model:15s} (mean={best_value:.4f})")

# Export summary table to CSV
summary_csv_path = os.path.join(OUTPUT_DIR, 'summary_statistics.csv')
summary_stats.to_csv(summary_csv_path, index=False)
print(f"\n✓ Exported summary statistics to: {summary_csv_path}")

# Export model rankings
rankings_path = os.path.join(OUTPUT_DIR, 'model_rankings.csv')
ranking_data = []

for tau in taus:
    for metric in ['losses', 'coverage', 'time_all']:
        metric_data_tau = summary_stats[
            (summary_stats['tau'] == tau) & 
            (summary_stats['metric'] == metric)
        ].sort_values('mean')
        
        for rank, (idx, row) in enumerate(metric_data_tau.iterrows(), 1):
            ranking_data.append({
                'tau': tau,
                'metric': metric,
                'rank': rank,
                'model': row['model'],
                'mean': row['mean'],
                'std': row['std']
            })

rankings_df = pd.DataFrame(ranking_data)
rankings_df.to_csv(rankings_path, index=False)
print(f"✓ Exported model rankings to: {rankings_path}")

print("\n" + "=" * 80)
print("ANALYSIS COMPLETE")
print("=" * 80)
print(f"TikZ plots generated in: {OUTPUT_DIR}")
print(f"Supporting files:")
print(f"  - {summary_csv_path}")
print(f"  - {rankings_path}")


SUMMARY STATISTICS ACROSS TAU LEVELS

--- TAU = 0.5 ---

LOSSES:
         model     mean      std
  DecisionTree 0.038717 0.017254
      LightGBM 0.042091 0.021733
LinearQuantile 0.057968 0.029411
           SQR 0.032786 0.023446

COVERAGE:
         model     mean      std
  DecisionTree 0.083617 0.099913
      LightGBM 0.058731 0.062022
LinearQuantile 0.059011 0.073643
           SQR 0.081545 0.086229

TIME_ALL:
         model       mean        std
  DecisionTree   7.856714  47.333635
      LightGBM   0.603711   1.052114
LinearQuantile  71.279094 495.958995
           SQR 165.855329 170.784553

--- TAU = 0.6 ---

LOSSES:
         model     mean      std
  DecisionTree 0.038134 0.017316
      LightGBM 0.043783 0.029528
LinearQuantile 0.058978 0.032384
           SQR 0.031723 0.022169

COVERAGE:
         model     mean      std
  DecisionTree 0.082674 0.082266
      LightGBM 0.062003 0.070576
LinearQuantile 0.106894 0.082974
           SQR 0.079169 0.080322

TIME_ALL:
         model    

## Summary: Results Overview
Display key statistics and insights from the tau analysis.


In [25]:
def save_tikz_standalone(tikz_code, filename, directory='tikz_plots/'):
    """
    Save TikZ code as a standalone file that can be included in LaTeX.
    
    Parameters:
    -----------
    tikz_code : str
        TikZ code to save
    filename : str
        Name of output file (without extension)
    directory : str
        Output directory
    """
    os.makedirs(directory, exist_ok=True)
    
    # Option 1: Save as raw TikZ for use with \\input{}
    tikz_path = os.path.join(directory, f"{filename}.tikz")
    with open(tikz_path, 'w') as f:
        f.write(tikz_code)
    print(f"Saved TikZ plot: {tikz_path}")
    
    # Option 2: Save as standalone LaTeX document
    standalone_path = os.path.join(directory, f"{filename}_standalone.tex")
    standalone_doc = r"""\documentclass[tikz,border=10pt]{standalone}
\usepackage{tikz}
\usepackage{pgfplots}
\pgfplotsset{compat=1.16}
\usepgfplotslibrary{groupplots}

\begin{document}
""" + tikz_code + r"""
\end{document}"""
    
    with open(standalone_path, 'w') as f:
        f.write(standalone_doc)
    print(f"Saved standalone LaTeX: {standalone_path}")

# Generate and save multiple plot configurations
plot_configs = [
    {
        'name': 'performance_vs_time',
        'left_metrics': ['losses', 'coverage'],
        'right_metrics': ['time_all'],
        'title': 'Model Performance vs Computation Time Across Quantile Levels',
        'y_label_left': 'Normalized Loss / Coverage Error',
        'y_label_right': 'Total Time (s)'
    },
    {
        'name': 'complexity_vs_time',
        'left_metrics': ['complexity'],
        'right_metrics': ['time_fit'],
        'title': 'Model Complexity vs Fitting Time Across Quantile Levels',
        'y_label_left': 'Expression/Tree Complexity',
        'y_label_right': 'Fitting Time (s)'
    },
    {
        'name': 'all_metrics',
        'left_metrics': ['losses'],
        'right_metrics': ['time_all'],
        'title': 'All Metrics Across Quantile Levels',
        'y_label_left': 'Loss (Normalized Pinball)',
        'y_label_right': 'Computation Time (s)'
    }
]

# Generate plots for each configuration
for config in plot_configs:
    tikz_code = generate_tikz_dual_axis_plot(
        metric_data,
        left_metrics=config['left_metrics'],
        right_metrics=config['right_metrics'],
        models=models,
        taus=taus,
        title=config['title'],
        y_label_left=config['y_label_left'],
        y_label_right=config['y_label_right']
    )
    save_tikz_standalone(tikz_code, config['name'])

print(f"\nGenerated {len(plot_configs)} TikZ plots in {OUTPUT_DIR}/")
print("\nTo use in LaTeX, add to your document:")
print(r"  \input{tikz_plots/performance_vs_time.tikz}")


Saved TikZ plot: tikz_plots/performance_vs_time.tikz
Saved standalone LaTeX: tikz_plots/performance_vs_time_standalone.tex
Saved TikZ plot: tikz_plots/complexity_vs_time.tikz
Saved standalone LaTeX: tikz_plots/complexity_vs_time_standalone.tex
Saved TikZ plot: tikz_plots/all_metrics.tikz
Saved standalone LaTeX: tikz_plots/all_metrics_standalone.tex

Generated 3 TikZ plots in tikz_plots//

To use in LaTeX, add to your document:
  \input{tikz_plots/performance_vs_time.tikz}


## Section 5: Export TikZ Figures for LaTeX
Generate standalone TikZ files that can be included in LaTeX documents.


In [26]:
def generate_tikz_dual_axis_plot(metric_data, left_metrics, right_metrics, models, taus, 
                                  title="", y_label_left="", y_label_right=""):
    """
    Generate TikZ code for a dual-axis plot.
    
    Parameters:
    -----------
    metric_data : dict
        Dictionary with metric names as keys and DataFrames as values
    left_metrics : list
        Metrics to plot on left y-axis
    right_metrics : list
        Metrics to plot on right y-axis
    models : list
        Model names to include
    taus : list
        Tau values
    title, y_label_left, y_label_right : str
        Labels for the plot
    
    Returns:
    --------
    str : TikZ code as string
    """
    
    # Define line styles for models
    line_styles = {
        'SQR': 'solid',
        'LightGBM': 'dashed',
        'DecisionTree': 'dotted',
        'LinearQuantile': 'dashdotted'
    }
    
    mark_styles = {
        'SQR': '*',
        'LightGBM': 'o',
        'DecisionTree': 'square',
        'LinearQuantile': 'diamond'
    }
    
    colors = {
        'SQR': 'red',
        'LightGBM': 'blue',
        'DecisionTree': 'green',
        'LinearQuantile': 'purple'
    }
    
    # Start TikZ code
    tikz_code = r"\begin{tikzpicture}" + "\n"
    tikz_code += r"\begin{axis}[" + "\n"
    tikz_code += f"    title={{{title}}},\n"
    tikz_code += f"    xlabel={{$\\tau$}},\n"
    tikz_code += f"    ylabel={{{y_label_left}}},\n"
    tikz_code += f"    width=0.9\\textwidth,\n"
    tikz_code += f"    height=6cm,\n"
    tikz_code += f"    legend pos=outer north east,\n"
    tikz_code += f"    xtick={{{','.join(map(str, taus))}}},\n"
    tikz_code += f"]\n"
    
    # Plot left axis metrics
    for metric in left_metrics:
        if metric not in metric_data:
            continue
        metric_df = metric_data[metric]
        
        for model in models:
            model_data_subset = metric_df[metric_df['model'] == model].sort_values('tau')
            
            if len(model_data_subset) == 0:
                continue
            
            # Extract data
            x_vals = model_data_subset['tau'].tolist()
            y_vals = model_data_subset['mean'].tolist()
            
            # Create coordinate string
            coords = ' '.join([f"({x},{y:.4f})" for x, y in zip(x_vals, y_vals)])
            
            # Add plot
            tikz_code += f"\\addplot[color={colors.get(model, 'black')}, "
            tikz_code += f"{line_styles.get(model, 'solid')}, "
            tikz_code += f"mark={mark_styles.get(model, 'o')}, "
            tikz_code += f"mark size=2pt] coordinates {{{coords}}};\n"
            tikz_code += f"\\addlegendentry{{{model} - {metric}}}\n"
    
    tikz_code += r"\end{axis}" + "\n"
    
    # Right axis if needed
    if right_metrics:
        tikz_code += r"\begin{axis}[" + "\n"
        tikz_code += f"    at=(current axis.right of origin),\n"
        tikz_code += f"    axis y line=right,\n"
        tikz_code += f"    ylabel={{{y_label_right}}},\n"
        tikz_code += f"    legend pos=outer north east,\n"
        tikz_code += f"]\n"
        
        for metric in right_metrics:
            if metric not in metric_data:
                continue
            metric_df = metric_data[metric]
            
            for model in models:
                model_data_subset = metric_df[metric_df['model'] == model].sort_values('tau')
                
                if len(model_data_subset) == 0:
                    continue
                
                x_vals = model_data_subset['tau'].tolist()
                y_vals = model_data_subset['mean'].tolist()
                coords = ' '.join([f"({x},{y:.4f})" for x, y in zip(x_vals, y_vals)])
                
                # Add plot with different dash pattern to distinguish from left axis
                tikz_code += f"\\addplot[color={colors.get(model, 'black')}, "
                tikz_code += f"{line_styles.get(model, 'solid')}, "
                tikz_code += f"mark={mark_styles.get(model, 'o')}, "
                tikz_code += f"mark size=2pt, line width=1.5pt] coordinates {{{coords}}};\n"
                tikz_code += f"\\addlegendentry{{{model} - {metric}}}\n"
        
        tikz_code += r"\end{axis}" + "\n"
    
    tikz_code += r"\end{tikzpicture}" + "\n"
    
    return tikz_code

# Test the function with a simple example
test_tikz = generate_tikz_dual_axis_plot(
    metric_data,
    left_metrics=['losses'],
    right_metrics=['time_all'],
    models=['SQR', 'LightGBM', 'DecisionTree', 'LinearQuantile'],
    taus=taus,
    title="Model Performance vs Tau",
    y_label_left="Normalized Pinball Loss",
    y_label_right="Computation Time (s)"
)

print("Generated TikZ code (first 500 chars):")
print(test_tikz[:500])
print(f"\nTotal length: {len(test_tikz)} characters")


Generated TikZ code (first 500 chars):
\begin{tikzpicture}
\begin{axis}[
    title={Model Performance vs Tau},
    xlabel={$\tau$},
    ylabel={Normalized Pinball Loss},
    width=0.9\textwidth,
    height=6cm,
    legend pos=outer north east,
    xtick={0.5,0.6,0.7,0.8,0.9},
]
\addplot[color=red, solid, mark=*, mark size=2pt] coordinates {(0.5,0.0328) (0.6,0.0317) (0.7,0.0296) (0.8,0.0272) (0.9,inf)};
\addlegendentry{SQR - losses}
\addplot[color=blue, dashed, mark=o, mark size=2pt] coordinates {(0.5,0.0421) (0.6,0.0438) (0.7,0.0419)

Total length: 1896 characters


## Section 4: Generate TikZ Plots with Dual Axes
Create TikZ plot objects with dual y-axes (left for one metric group, right for another).


In [27]:
# Define metric groupings for dual-axis plots
# Typical arrangement: performance metrics (loss, coverage) on left; complexity/time on right
metric_groups = {
    'Performance': {
        'left': ['losses', 'coverage'],
        'right': ['time_all']
    },
    'Complexity': {
        'left': ['complexity'],
        'right': ['time_fit']
    },
    'All Metrics': {
        'left': ['losses', 'coverage', 'complexity'],
        'right': ['time_all', 'time_fit']
    }
}

# Create tau-ordered data for each model and metric
models = sorted(df_long['model'].unique())
metrics = sorted(df_long['metric'].unique())
taus = sorted(df_long['tau'].unique())

print(f"Models: {models}")
print(f"Metrics: {metrics}")
print(f"Tau values: {taus}")

# Create a dictionary of DataFrames organized by metric for easy access
metric_data = {}
for metric in metrics:
    metric_subset = summary_stats[summary_stats['metric'] == metric].copy()
    metric_subset = metric_subset.sort_values('tau')
    metric_data[metric] = metric_subset

# Display example data for losses metric
print("\nExample: Losses across tau levels")
print(metric_data['losses'][['tau', 'model', 'mean', 'std']])


Models: ['DecisionTree', 'LightGBM', 'LinearQuantile', 'SQR']
Metrics: ['complexity', 'coverage', 'losses', 'time_all', 'time_fit']
Tau values: [np.float64(0.5), np.float64(0.6), np.float64(0.7), np.float64(0.8), np.float64(0.9)]

Example: Losses across tau levels
    tau           model      mean       std
2   0.5    DecisionTree  0.038717  0.017254
6   0.5        LightGBM  0.042091  0.021733
11  0.5  LinearQuantile  0.057968  0.029411
16  0.5             SQR  0.032786  0.023446
21  0.6    DecisionTree  0.038134  0.017316
25  0.6        LightGBM  0.043783  0.029528
30  0.6  LinearQuantile  0.058978  0.032384
35  0.6             SQR  0.031723  0.022169
54  0.7             SQR  0.029593  0.022415
49  0.7  LinearQuantile  0.058438  0.035775
44  0.7        LightGBM  0.041879  0.027761
40  0.7    DecisionTree  0.035276  0.016283
59  0.8    DecisionTree  0.029495  0.013844
63  0.8        LightGBM  0.036403  0.022113
68  0.8  LinearQuantile  0.056426  0.038815
73  0.8             SQR  0.0272

## Section 3: Prepare Data for Plotting
Reshape aggregated data and organize metrics into groups for dual-axis visualization.


In [28]:
# Calculate summary statistics grouped by tau, model, and metric
# Compute across all datasets and runs for each combination
summary_stats = df_long.groupby(['tau', 'model', 'metric'])['value'].agg([
    ('mean', 'mean'),
    ('std', 'std'),
    ('median', 'median'),
    ('min', 'min'),
    ('max', 'max'),
    ('count', 'count')
]).reset_index()

print("Summary statistics shape:", summary_stats.shape)
print("\nSample statistics:")
print(summary_stats.head(15))

# Pivot to get metrics as columns for easier access
stats_pivot = summary_stats.pivot_table(
    index=['tau', 'model'],
    columns='metric',
    values='mean'
).reset_index()

print("\nPivoted statistics (means):")
print(stats_pivot)


Summary statistics shape: (95, 9)

Sample statistics:
    tau           model      metric         mean          std     median  \
0   0.5    DecisionTree  complexity  1526.940000  9135.109392  79.000000   
1   0.5    DecisionTree    coverage     0.083617     0.099913   0.046872   
2   0.5    DecisionTree      losses     0.038717     0.017254   0.038650   
3   0.5    DecisionTree    time_all     7.856714    47.333635   0.096175   
4   0.5    DecisionTree    time_fit     0.526921     3.229699   0.014585   
5   0.5        LightGBM    coverage     0.058731     0.062022   0.040000   
6   0.5        LightGBM      losses     0.042091     0.021733   0.036913   
7   0.5        LightGBM    time_all     0.603711     1.052114   0.255252   
8   0.5        LightGBM    time_fit     0.117287     0.215799   0.053976   
9   0.5  LinearQuantile  complexity    19.891667    20.600256  11.000000   
10  0.5  LinearQuantile    coverage     0.059011     0.073643   0.040000   
11  0.5  LinearQuantile      losse

## Section 2: Calculate Summary Statistics
Group data by tau, model, and metric to compute mean, median, and standard deviation.


In [29]:
def load_tau_jsons_to_long_df(directory):
    """Load JSON files from tau results directory into long-format DataFrame."""
    records = []
    
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.json'):
            filepath = os.path.join(directory, filename)
            try:
                with open(filepath, 'r') as f:
                    data = json.load(f)
                
                for model, model_data in data.items():
                    tau = model_data.get('tau')
                    for metric, metric_data in model_data.items():
                        if metric == 'tau' or metric == 'sizes':
                            continue
                        for dataset, values in metric_data.items():
                            for run_idx, value in enumerate(values):
                                records.append({
                                    'filename': filename,
                                    'model': model,
                                    'tau': tau,
                                    'metric': metric,
                                    'dataset': dataset,
                                    'run': run_idx,
                                    'value': value
                                })
            except Exception as e:
                print(f"Error loading {filename}: {e}")
    
    df = pd.DataFrame.from_records(records)
    return df

# Load results
df_long = load_tau_jsons_to_long_df(RESULTS_DIR)
print(f"Loaded {len(df_long)} records from {RESULTS_DIR}")
print(f"Unique tau values: {sorted(df_long['tau'].unique())}")
print(f"Unique models: {sorted(df_long['model'].unique())}")
print(f"Unique metrics: {sorted(df_long['metric'].unique())}")
print(f"\nDataFrame shape: {df_long.shape}")
df_long.head(10)


Error loading results0.6_503_wind_tau0.6.json: Expecting value: line 1 column 1 (char 0)
Loaded 56905 records from results_sampled_10k_taus/
Unique tau values: [np.float64(0.5), np.float64(0.6), np.float64(0.7), np.float64(0.8), np.float64(0.9)]
Unique models: ['DecisionTree', 'LightGBM', 'LinearQuantile', 'SQR']
Unique metrics: ['complexity', 'coverage', 'losses', 'time_all', 'time_fit']

DataFrame shape: (56905, 7)


Unnamed: 0,filename,model,tau,metric,dataset,run,value
0,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,0,0.035077
1,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,1,0.044006
2,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,2,0.034439
3,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,3,0.034149
4,results0.5_1027_ESL_tau0.5.json,SQR,0.5,losses,1027_ESL,4,0.038015
5,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,0,0.367347
6,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,1,0.112245
7,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,2,0.397959
8,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,3,0.283505
9,results0.5_1027_ESL_tau0.5.json,SQR,0.5,coverage,1027_ESL,4,0.273196


## Section 1: Load and Aggregate Results by Tau
Load JSON files from results_sampled_10k_taus directory and create a long-format DataFrame with tau as a separate dimension.


In [30]:
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Configuration
RESULTS_DIR = 'results_sampled_10k_taus/'
OUTPUT_DIR = 'tikz_plots/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Results directory: {RESULTS_DIR}")
print(f"Output directory: {OUTPUT_DIR}")


Results directory: results_sampled_10k_taus/
Output directory: tikz_plots/


# Analysis of SQR Results Across Tau Levels with TikZ Plotting
Analysis of `results_sampled_10k_taus` directory with visualization using TikZ for LaTeX embedding. This notebook aggregates results across multiple quantile levels (τ) and generates publication-quality plots with dual y-axes.
