## Get data info

In [1]:
import os
import json
import numpy as np

def get_dataset_info(base_path):
    """
    Collect per-dataset metadata from the sub-directories of `base_path`.

    A sub-directory counts as a dataset only if it contains an `info.json`
    file that can be parsed and that has a `task_type` entry; anything else
    is skipped entirely. Every dataset that is kept contributes exactly one
    entry to each returned list, so the lists are always index-aligned and
    safe to `zip` together.

    Args:
        base_path (str): Directory whose sub-directories hold one dataset each.

    Returns:
        tuple: Seven index-aligned lists, in this order:
            datasets         - sub-directory names (sorted alphabetically)
            train_sizes      - `train_size` from info.json (None if absent)
            num_classes      - `n_classes` for 'multiclass' tasks, else None
            num_features     - `n_num_features` from info.json (None if absent)
            cat_features     - `n_cat_features` from info.json (None if absent)
            num_tot_features - sum of the two feature counts, or None when
                               either count is missing
            variances        - variance of `y_test.npy` for 'regression' tasks,
                               None for other tasks or when the labels cannot
                               be loaded
    """
    datasets = []
    train_sizes = []
    num_features = []
    cat_features = []
    num_tot_features = []
    variances = []
    num_classes = []

    # Sorted so the output order does not depend on filesystem listing order.
    for dirname in sorted(os.listdir(base_path)):
        dir_path = os.path.join(base_path, dirname)
        json_path = os.path.join(dir_path, 'info.json')

        # Only directories that actually ship an info.json are datasets.
        if not (os.path.isdir(dir_path) and os.path.exists(json_path)):
            continue

        # Read everything needed from info.json up front; nothing is appended
        # until the whole record is known, so a failure cannot leave the
        # output lists with different lengths.
        try:
            with open(json_path, 'r') as f:
                info = json.load(f)

            task_type = info['task_type']
            train_size = info.get('train_size')
            n_num = info.get('n_num_features')
            n_cat = info.get('n_cat_features')
            n_classes = info.get('n_classes') if task_type == 'multiclass' else None
            if n_num is None or n_cat is None:
                n_total = None
            else:
                n_total = n_num + n_cat
        except Exception as e:
            print(f"Error processing {dirname}: {str(e)}")
            continue

        # Label variance is only computed for regression tasks.
        variance = None
        if task_type == 'regression':
            try:
                file_path = os.path.join(dir_path, 'y_test.npy')
                # NOTE(review): allow_pickle=True can execute arbitrary code
                # when loading an untrusted file; only use on trusted datasets.
                labels = np.load(file_path, allow_pickle=True)

                # Object arrays of lists/tuples are converted to a float array
                if isinstance(labels[0], (list, tuple)):
                    labels = np.array(labels, dtype=float)

                variance = np.var(labels)
            except Exception as e:
                # Keep the dataset, but record that its variance is unknown.
                print(f"Error processing {dirname}: {str(e)}")
                variance = None

        datasets.append(dirname)
        train_sizes.append(train_size)
        num_features.append(n_num)
        cat_features.append(n_cat)
        num_tot_features.append(n_total)
        num_classes.append(n_classes)
        variances.append(variance)

    return datasets, train_sizes, num_classes, num_features, cat_features, num_tot_features, variances

# Usage: gather the metadata of every dataset stored under ./datasets
base_path = './datasets'
(
    datasets,
    train_sizes,
    num_classes,
    num_features,
    cat_features,
    num_tot_features,
    variances,
) = get_dataset_info(base_path)

# Dataset name -> label variance, paired positionally from the two lists
var_dict = dict(zip(datasets, variances))

## Analyze results

In [2]:
import pandas as pd

# Load the three result tables (regression, binary, multi-class), one CSV each
reg_df, bin_df, multi_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'results_regression.csv',
        'results_binary_classification.csv',
        'results_multi-class_classification.csv',
    )
)

In [3]:
def add_results(df, metric_stats, method='rfm', tag=''):
    """
    Write one method's mean metric per dataset into a results table.

    The values go into the column named `method + tag`. The column is created
    (filled with NaN) only if it does not exist yet, so existing values for
    datasets that are absent from `metric_stats` are left untouched.

    Args:
        df (pd.DataFrame): Results table with a 'Dataset/Model' column.
            Modified in place.
        metric_stats (dict): Maps '<dataset>-<method>' keys to a dict of
            metric name -> {'mean': ..., 'std': ...}.
        method (str): Method name; the trailing '-<method>' is stripped from
            each key to recover the dataset name.
        tag (str): Optional suffix appended to `method` to form the column name.

    Returns:
        pd.DataFrame: The same `df` object, for call chaining.
    """
    method_name = f'{method}{tag}'
    # Check the column that is actually written (method + tag), not `method`.
    if method_name not in df.columns:
        df[method_name] = float('nan')

    suffix = f'-{method}'
    known_datasets = set(df['Dataset/Model'].values)

    for dataset_name, metrics in metric_stats.items():
        # Strip only a trailing '-<method>'; leave the rest of the name intact.
        if dataset_name.endswith(suffix):
            base_name = dataset_name[:-len(suffix)]
        else:
            base_name = dataset_name

        if base_name not in known_datasets:
            continue

        # 'Accuracy' marks a classification run; otherwise fall back to RMSE.
        if 'Accuracy' in metrics:
            stats = metrics['Accuracy']
        elif 'RMSE' in metrics:
            stats = metrics['RMSE']
        else:
            # Runs without usable metrics (e.g. an empty stats dict) are skipped
            # instead of raising and aborting the whole merge.
            continue

        metric_value = stats.get('mean')
        if metric_value is None:
            continue

        mask = df['Dataset/Model'] == base_name
        df.loc[mask, method_name] = metric_value

    return df


In [4]:
import os
import pickle
from pathlib import Path
from statistics import mean, stdev
from typing import Dict, List, Tuple

import os
import pickle
from pathlib import Path

def load_results(results_dir='rfm_results'):
    """
    Read every ``*.pkl`` file directly inside `results_dir`.

    Each pickle is expected to be a mapping with the keys 'info', 'args',
    'results', 'time' and 'metric_name'; only those five entries are kept.
    Files that fail to load or lack one of the keys are reported on stdout
    and left out of the result.

    Note: unpickling executes code embedded in the file, so this should only
    be pointed at trusted result directories.

    Args:
        results_dir (str): Directory containing the pickle files

    Returns:
        dict: File stem (name without extension) -> dict with the five entries

    Raises:
        FileNotFoundError: If `results_dir` does not exist
    """
    root = Path(results_dir)
    if not root.exists():
        raise FileNotFoundError(f"Directory {results_dir} not found")

    kept_keys = ('info', 'args', 'results', 'time', 'metric_name')
    loaded = {}

    for file_path in root.glob('*.pkl'):
        try:
            with open(file_path, 'rb') as handle:
                payload = pickle.load(handle)

            # Keyed by file stem so the dataset-model pairing stays visible
            loaded[file_path.stem] = {key: payload[key] for key in kept_keys}
        except Exception as e:
            print(f"Error loading {file_path}: {str(e)}")

    return loaded

def calculate_metric_stats(results_dict: Dict) -> Dict[str, Dict[str, Dict[str, float]]]:
    """
    Summarise each run's metrics as mean and standard deviation.

    Every entry of `results_dict` holds a sequence of per-repetition metric
    tuples under 'results'. For each tuple position the values across all
    repetitions are averaged. The position is labelled with the matching entry
    of 'metric_name' when that is a list or tuple, and with 'metric_<i>'
    otherwise.

    Args:
        results_dict (dict): Loaded results, keyed by dataset-model name

    Returns:
        dict: dataset-model name -> metric label -> {'mean': ..., 'std': ...}.
        An entry with no results maps to an empty dict; a position whose
        statistics could not be computed maps 'metric_<i>' to None values.
    """
    summary = {}

    for filename, data in results_dict.items():
        per_metric = {}
        summary[filename] = per_metric

        runs = data['results']
        if not runs:
            print(f"Warning: No results found for {filename}")
            continue

        # The first repetition defines how many metrics each tuple carries
        for metric_idx in range(len(runs[0])):
            try:
                values = [run[metric_idx] for run in runs]

                names = data.get('metric_name')
                if names and isinstance(data['metric_name'], (list, tuple)):
                    label = data['metric_name'][metric_idx]
                else:
                    label = f"metric_{metric_idx}"

                # A single repetition has no spread, so its std is reported as 0
                per_metric[label] = {
                    'mean': mean(values),
                    'std': stdev(values) if len(values) > 1 else 0
                }
            except Exception as e:
                print(f"Error calculating statistics for metric {metric_idx} in {filename}: {str(e)}")
                per_metric[f"metric_{metric_idx}"] = {'mean': None, 'std': None}

    return summary

def print_metric_summary(metric_stats: Dict[str, Dict[str, Dict[str, float]]]) -> None:
    """
    Write a plain-text report of the per-run metric statistics to stdout.

    Each dataset-model entry gets its own section listing the mean and
    standard deviation of every metric to four decimals. A metric whose mean
    or std is None is reported as an error instead.

    Args:
        metric_stats (dict): dataset-model name -> metric name ->
            {'mean': ..., 'std': ...}
    """
    wide_rule = "-" * 60
    narrow_rule = "-" * 40

    print("\nMetric Statistics Summary:")
    print(wide_rule)

    for filename in metric_stats:
        print(f"\nDataset-Model: {filename}")
        print(narrow_rule)

        for metric_name, stats in metric_stats[filename].items():
            print(f"\n{metric_name}:")

            if stats['mean'] is None or stats['std'] is None:
                print("  Error calculating statistics")
                continue

            print(f"  Mean: {stats['mean']:.4f}")
            print(f"  Std:  {stats['std']:.4f}")
    
# Example usage
if __name__ == "__main__":
    # (results directory, method name, column tag) for every method whose
    # pickled results are merged into the three result tables.
    result_sources = [
        ('rfm_results', 'rfm', ''),
        # Disabled: enable to add the power-kernel RFM run as column 'rfm_power'
        # ('rfm_results_power_3_23', 'rfm', '_power'),
        ('pfn-v2-results', 'PFN-v2', ''),
        ('kernel_results', 'kernel', ''),
    ]

    for results_dir, method, tag in result_sources:
        # Each source is handled on its own so that one missing or broken
        # results directory does not prevent the remaining methods from
        # being merged.
        try:
            results_dict = load_results(results_dir)
            metric_stats = calculate_metric_stats(results_dict)

            reg_df = add_results(reg_df, metric_stats, method=method, tag=tag)
            bin_df = add_results(bin_df, metric_stats, method=method, tag=tag)
            multi_df = add_results(multi_df, metric_stats, method=method, tag=tag)
        except Exception as e:
            print(f"Error: {str(e)}")

In [5]:
from scipy.stats import gmean 

In [6]:
# Columns that describe a dataset rather than hold a method's score. They must
# never take part in ranking or score normalisation.
_NON_METHOD_COLUMNS = frozenset(
    {'Dataset/Model', 'tabpfn', 'dummy', 'train_size', 'num_classes'}
)


def _summarize_task(df, methods, task_type, metric, higher_is_better, average):
    """Build one summary record per requested method for a single task type."""
    label = task_type.lower()
    records = []

    all_method_cols = [col for col in df.columns if col not in _NON_METHOD_COLUMNS]
    method_cols = methods if methods else all_method_cols
    # Silently drop requested methods that this table does not contain
    method_cols = [col for col in method_cols if col in all_method_cols]

    try:
        # Normalisation bounds come from the selected methods only ...
        min_scores = df[method_cols].min(axis=1)
        max_scores = df[method_cols].max(axis=1)
        # ... while ranks are computed against every method in the table.
        ranks = df[all_method_cols].rank(axis=1, ascending=not higher_is_better)

        for method in method_cols:
            try:
                records.append({
                    'Type': task_type,
                    'Method': method,
                    'Datasets': len(df),
                    'Metric': metric,
                    'Average_Score': average(df[method]),
                    'Average_Rank': ranks[method].mean(),
                    'Average_Normalized_Score':
                        ((df[method] - min_scores) / (max_scores - min_scores)).mean(),
                })
            except KeyError:
                print(f"Warning: Method '{method}' not found in {label} data")
                continue
    except Exception as e:
        print(f"Error processing {label} data: {str(e)}")

    return records


def analyze_performance(reg_df, bin_df, multi_df, methods=None):
    """
    Analyze performance metrics for specified methods across regression and classification tasks.

    For each non-empty table and each selected method this reports:
      * Average_Rank: mean per-dataset rank among ALL method columns of the
        table (rank 1 = lowest RMSE for regression, highest accuracy for
        classification).
      * Average_Score: geometric mean of RMSE for regression, arithmetic mean
        of accuracy for classification.
      * Average_Normalized_Score: mean of (score - min) / (max - min), where
        min/max are taken per dataset over the selected methods. For
        regression 0 is therefore the best value, for classification 1 is.

    The columns 'Dataset/Model', 'tabpfn', 'dummy', 'train_size' and
    'num_classes' are treated as metadata and never ranked.

    Parameters:
    reg_df, bin_df, multi_df: DataFrames containing performance data
    methods: Optional list of method names to analyze. If None, analyzes all methods.

    Returns a DataFrame with average scores and ranks for each method, sorted
    by task type and average rank (an empty DataFrame if nothing was produced).
    Skips methods that aren't found in the data without raising an error.
    """
    task_specs = [
        (reg_df, 'Regression', 'RMSE', False, lambda scores: gmean(scores.values)),
        (bin_df, 'Binary Classification', 'Accuracy', True, lambda scores: scores.mean()),
        (multi_df, 'Multiclass Classification', 'Accuracy', True, lambda scores: scores.mean()),
    ]

    results = []
    for df, task_type, metric, higher_is_better, average in task_specs:
        if df.empty:
            continue
        results.extend(
            _summarize_task(df, methods, task_type, metric, higher_is_better, average)
        )

    # Convert results to DataFrame and sort by Type and Average_Rank
    if not results:
        print("Warning: No valid results were generated")
        return pd.DataFrame()

    results_df = pd.DataFrame(results)
    return results_df.sort_values(['Type', 'Average_Rank'])

def print_performance_summary(results_df):
    """
    Print a formatted summary of the performance analysis results with clear visual separation
    between different task types.

    An empty `results_df` (which `analyze_performance` returns when it could
    not produce any result) prints a short notice instead of raising.

    Parameters:
    results_df: DataFrame containing performance analysis results, with the
        columns 'Type', 'Method', 'Datasets', 'Metric', 'Average_Score',
        'Average_Rank' and 'Average_Normalized_Score'
    """
    # Define some formatting constants
    SECTION_WIDTH = 120
    DOUBLE_LINE = "=" * SECTION_WIDTH
    SINGLE_LINE = "-" * SECTION_WIDTH

    print("\nPERFORMANCE ANALYSIS SUMMARY")
    print(DOUBLE_LINE)

    # Nothing to report: an empty frame has no 'Type' column to group by
    if results_df is None or results_df.empty:
        print("\nNo results to summarize.")
        print(DOUBLE_LINE)
        return

    task_types = results_df['Type'].unique()
    total_datasets = 0
    for task_type in task_types:
        task_results = results_df[results_df['Type'] == task_type]

        # Every row of one task type carries the same dataset count and metric
        n_datasets = task_results['Datasets'].iloc[0]
        total_datasets += n_datasets

        # Print section header
        print(f"\n{task_type.upper()}")
        print(SINGLE_LINE)

        # Print metadata
        print(f"Datasets analyzed: {n_datasets}")
        print(f"Evaluation metric: {task_results['Metric'].iloc[0]}\n")

        # Print column headers
        print(f"{'Method':<25} {'Avg Rank':<15} {'Avg Score':<15} {'Avg Norm Score':<15}")
        print(SINGLE_LINE)

        # Print results for each method
        for _, row in task_results.iterrows():
            print(f"{row['Method']:<25} {row['Average_Rank']:<15.2f} {row['Average_Score']:<15.4f} {row['Average_Normalized_Score']:<15.4f}")

        print(SINGLE_LINE)

    # Count distinct methods; the frame has one row per (task type, method)
    n_methods = results_df['Method'].nunique()
    print(f"\nAnalysis complete. {len(task_types)} task types evaluated. {n_methods} methods evaluated. {total_datasets} datasets evaluated.")
    print(DOUBLE_LINE)

In [7]:
# Regression datasets that still have no rfm score
reg_df.loc[reg_df['rfm'].isna()]

Unnamed: 0,Dataset/Model,dummy,LinearRegression,knn,svm,xgboost,catboost,RandomForest,lightgbm,mlp,...,grownet,tabr,modernNCA,mlp_plr,realmlp,excelformer,dnnr,rfm,PFN-v2,kernel


In [8]:
# Binary-classification datasets that still have no rfm score
bin_df.loc[bin_df['rfm'].isna()]

Unnamed: 0,Dataset/Model,dummy,LogReg,NCM,NaiveBayes,knn,svm,xgboost,catboost,RandomForest,...,ptarl,grownet,tabr,modernNCA,mlp_plr,realmlp,excelformer,rfm,PFN-v2,kernel
68,customer_satisfaction_in_airline,0.547313,0.903257,0.794079,0.812827,0.934324,0.902282,0.960733,0.963053,0.933603,...,0.95771,0.950503,0.963438,0.961685,0.960579,0.960848,0.959463,,0.949651,0.956498


In [9]:
# Multi-class datasets that still have no rfm score
multi_df.loc[multi_df['rfm'].isna()]

Unnamed: 0,Dataset/Model,dummy,LogReg,NCM,NaiveBayes,knn,svm,xgboost,catboost,RandomForest,...,ptarl,grownet,tabr,modernNCA,mlp_plr,realmlp,excelformer,rfm,PFN-v2,kernel
3,Credit_c,0.53175,0.73355,0.5928,0.43325,0.77865,0.63107,0.780113,0.797217,0.585353,...,0.77542,0.6423,0.820073,0.7986,0.789123,0.78501,0.784137,,0.701387,0.76535
10,Rain_in_Australia,0.758387,0.835453,0.717998,0.798776,0.833116,0.822116,0.852835,0.868922,0.765454,...,0.859508,0.817778,0.860901,,0.862693,0.869579,0.850541,,0.840373,0.857349
14,accelerometer,0.333322,0.339499,0.339956,0.499493,0.73295,0.353039,0.734745,0.723796,0.676224,...,0.736425,0.570025,0.744326,0.746222,0.738603,0.739623,0.73603,,0.567385,0.736054
72,walking-activity,0.147253,0.271035,0.275521,0.355007,0.630964,0.245488,0.651941,0.661738,0.615948,...,0.643312,0.410667,0.671296,0.677437,0.65002,0.659211,0.651935,,,0.639468


In [10]:
# Keep only the datasets for which an rfm score is available
reg_df, bin_df, multi_df = [
    table.dropna(subset=['rfm'])
    for table in (reg_df, bin_df, multi_df)
]


In [11]:

# try:
#     reg_df = reg_df.dropna(subset=['rfm_power'])
#     bin_df = bin_df.dropna(subset=['rfm_power'])
#     multi_df = multi_df.dropna(subset=['rfm_power'])
# except:
#     pass

# For every table, fill the remaining gaps in the numeric columns with the
# mean of the numeric values found in the same row
for frame in (reg_df, bin_df, multi_df):
    numeric_cols = frame.select_dtypes(include=['float32', 'float64', 'int64']).columns
    frame[numeric_cols] = frame[numeric_cols].apply(
        lambda row: row.fillna(row.mean()),
        axis=1,
    )

In [15]:
# Dataset name -> train size lookup
train_size_dict = {name: size for name, size in zip(datasets, train_sizes)}

# Attach the train size to each of the three result tables
for df in (reg_df, bin_df, multi_df):
    df['train_size'] = df['Dataset/Model'].map(train_size_dict)

# Dataset name -> number of classes lookup
num_classes_dict = {name: count for name, count in zip(datasets, num_classes)}

# Only the multi-class table gets a num_classes column
for df in (multi_df,):
    df['num_classes'] = df['Dataset/Model'].map(num_classes_dict)

# Either analyse everything, or only datasets whose train_size exceeds 10,000
view_large_datasets = True
if view_large_datasets:
    reg_df_final = reg_df[reg_df['train_size'] > 10_000]
    bin_df_final = bin_df[bin_df['train_size'] > 10_000]
    multi_df_final = multi_df[multi_df['train_size'] > 10_000]
else:
    reg_df_final = reg_df
    bin_df_final = bin_df
    multi_df_final = multi_df

# view_small_datasets = False
# if view_small_datasets:
#     reg_df_final = reg_df[reg_df['train_size'] < 10_000]
#     bin_df_final = bin_df[bin_df['train_size'] < 10_000]
#     multi_df_final = multi_df[multi_df['train_size'] < 10_000]

# multi_df_final = multi_df[multi_df['num_classes'] > 10]


In [16]:
# Example usage:
# methods = ['dummy', 'LogReg', 'NCM', 'NaiveBayes', 'knn', 'svm',
#        'xgboost', 'catboost', 'RandomForest', 'lightgbm', 'tabpfn', 'mlp',
#        'resnet', 'node', 'switchtab', 'tabnet', 'tabcaps', 'tangos', 'danets',
#        'ftt', 'autoint', 'dcn2', 'snn', 'tabtransformer', 'ptarl', 'grownet',
#        'tabr', 'modernNCA', 'mlp_plr', 'realmlp', 'excelformer', 'rfm']
# Methods included in the comparison ('rfm_power' is currently left out)
methods = [
    'PFN-v2',
    'catboost',
    'xgboost',
    'rfm',
    'mlp',
    'realmlp',
    'lightgbm',
    'svm',
    'RandomForest',
    'kernel',
]
results_df = analyze_performance(
    reg_df_final,
    bin_df_final,
    multi_df_final,
    methods=methods,
)


In [17]:
# Print the per-task-type summary tables for the analysis results in results_df
print_performance_summary(results_df)


PERFORMANCE ANALYSIS SUMMARY

BINARY CLASSIFICATION
------------------------------------------------------------------------------------------------------------------------
Datasets analyzed: 26
Evaluation metric: Accuracy

Method                    Avg Rank        Avg Score       Avg Norm Score 
------------------------------------------------------------------------------------------------------------------------
rfm                       5.92            0.8377          0.9182         
realmlp                   7.50            0.8346          0.8091         
PFN-v2                    10.44           0.8292          0.8034         
catboost                  10.50           0.8215          0.8059         
lightgbm                  10.52           0.8213          0.7725         
xgboost                   12.12           0.8192          0.7432         
mlp                       12.96           0.8278          0.7114         
kernel                    14.37           0.8261          0.70