In [4]:
import pandas as pd
import os
import logging

# Configure the root logger for this notebook: records at INFO level and above
# are emitted, each prefixed with a timestamp and the level name.
# DEBUG records (for example the column-rename notice in load_results) are
# therefore not shown with this setting.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
# Root folder that holds every experiment's result files.
BASE_RESULT_DIR = os.path.join("..", "result")

# Per-dataset input folders located directly under the result root.
BOOK_REVIEW_RESULT_DIR, FINANCIAL_NEWS_RESULT_DIR = (
    os.path.join(BASE_RESULT_DIR, subdir)
    for subdir in ("book_reviews", "financial_news")
)

# Input file names, one (ML, DL, LLM) triple per dataset.
# Edit these if the upstream scripts wrote their results under other names.
ML_RESULTS_FN_BR, DL_RESULTS_FN_BR, LLM_RESULTS_FN_BR = (
    "Book_Review_ml_tfidf_tuned_results.csv",
    "Book_Review_dl_pytorch_results.csv",
    "Book_Review_llm_transformers_results.csv",
)
ML_RESULTS_FN_FN, DL_RESULTS_FN_FN, LLM_RESULTS_FN_FN = (
    "Financial_News_ml_tfidf_tuned_results.csv",
    "Financial_News_dl_pytorch_results.csv",
    "Financial_News_llm_transformers_results.csv",
)

# Output files: one comparison per dataset plus one covering all datasets.
OUTPUT_COMPARISON_BR, OUTPUT_COMPARISON_FN, OUTPUT_COMPARISON_ALL = (
    os.path.join(BASE_RESULT_DIR, out_name)
    for out_name in (
        "comparison_book_reviews.csv",
        "comparison_financial_news.csv",
        "comparison_all_datasets.csv",
    )
)

# Columns kept in the comparison tables, in output order. Columns that a given
# result file does not provide are simply left out by the aggregation code.
# The DL results use "Train Time (Epoch, s)"; load_results renames it to
# "Train Time (s)". Script-specific extras such as "Best Params" (ML script)
# are intentionally not listed here.
EXPECTED_COLUMNS = [
    # identification
    "Dataset",
    "Model",
    # overall quality
    "Accuracy",
    # macro-averaged metrics
    "F1 (Macro)",
    "Precision (Macro)",
    "Recall (Macro)",
    # support-weighted metrics
    "F1 (Weighted)",
    "Precision (Weighted)",
    "Recall (Weighted)",
    # timing
    "Train Time (s)",
    "Eval Time (s)",
]

# Metric used to rank models (best first) in the comparison tables.
SORT_BY_COLUMN = "F1 (Macro)"

In [6]:
# --- Helper Function to Load Results ---
def load_results(file_path):
    """Read one result CSV and return it as a DataFrame.

    A missing file, or any error raised while reading it, is logged and
    reported to the caller as ``None`` instead of raising. When the file has a
    "Train Time (Epoch, s)" column but no "Train Time (s)" column, the former
    is renamed to the latter so all result files share one column name.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` if the file is absent or unreadable.
    """
    if os.path.exists(file_path):
        try:
            results = pd.read_csv(file_path)
            logging.info(f"Successfully loaded: {file_path} ({len(results)} rows)")

            # Harmonise the per-epoch training-time header, but never clobber
            # an existing "Train Time (s)" column.
            has_epoch_time = "Train Time (Epoch, s)" in results.columns
            has_plain_time = "Train Time (s)" in results.columns
            if has_epoch_time and not has_plain_time:
                results = results.rename(columns={"Train Time (Epoch, s)": "Train Time (s)"})
                logging.debug(f"Renamed 'Train Time (Epoch, s)' to 'Train Time (s)' in {os.path.basename(file_path)}")
            return results
        except Exception as e:
            logging.error(f"Error loading {file_path}: {e}")
            return None

    logging.warning(f"Result file not found, skipping: {file_path}")
    return None

In [7]:
logging.info("Starting result aggregation script...")


def _build_comparison(frames, keep_extra_columns=False, sort_within_dataset=False):
    """Concatenate result frames, order their columns and rank the rows.

    Args:
        frames: Non-empty list of DataFrames to stack.
        keep_extra_columns: When False only the columns named in
            EXPECTED_COLUMNS are kept. When True any other column present
            (e.g. "Best Params") is appended after them.
        sort_within_dataset: When False rows are sorted by SORT_BY_COLUMN,
            best first. When True rows are grouped by "Dataset" (ascending)
            and ranked by SORT_BY_COLUMN inside each dataset; without a
            "Dataset" column no sorting is applied.

    Returns:
        The combined DataFrame.
    """
    combined = pd.concat(frames, ignore_index=True)

    # Keep the configured column order, tolerating columns a file did not provide.
    ordered_cols = [col for col in EXPECTED_COLUMNS if col in combined.columns]
    if keep_extra_columns:
        ordered_cols += [col for col in combined.columns if col not in ordered_cols]
    combined = combined[ordered_cols]

    has_metric = SORT_BY_COLUMN in combined.columns
    if sort_within_dataset:
        if "Dataset" in combined.columns:
            if has_metric:
                combined = combined.sort_values(by=["Dataset", SORT_BY_COLUMN], ascending=[True, False])
            else:
                combined = combined.sort_values(by=["Dataset"])
    elif has_metric:
        combined = combined.sort_values(by=SORT_BY_COLUMN, ascending=False)
    return combined


def _save_comparison(frame, output_path, label, failure_label=None):
    """Write ``frame`` to ``output_path`` as CSV and log the outcome.

    A failed write is logged rather than raised so the remaining comparisons
    are still produced. ``failure_label`` only exists to keep the wording of
    the failure message identical to earlier versions of this script.
    """
    try:
        frame.to_csv(output_path, index=False)
        logging.info(f"{label} comparison saved to: {output_path}")
    except Exception as e:
        logging.error(f"Failed to save {failure_label or label} comparison: {e}")


# --- Load Book Review Results ---
br_ml_df = load_results(os.path.join(BOOK_REVIEW_RESULT_DIR, ML_RESULTS_FN_BR))
br_dl_df = load_results(os.path.join(BOOK_REVIEW_RESULT_DIR, DL_RESULTS_FN_BR))
br_llm_df = load_results(os.path.join(BOOK_REVIEW_RESULT_DIR, LLM_RESULTS_FN_BR))

# --- Load Financial News Results ---
fn_ml_df = load_results(os.path.join(FINANCIAL_NEWS_RESULT_DIR, ML_RESULTS_FN_FN))
fn_dl_df = load_results(os.path.join(FINANCIAL_NEWS_RESULT_DIR, DL_RESULTS_FN_FN))
fn_llm_df = load_results(os.path.join(FINANCIAL_NEWS_RESULT_DIR, LLM_RESULTS_FN_FN))

# load_results returns None for missing/unreadable files; drop those here.
book_review_dfs = [df for df in [br_ml_df, br_dl_df, br_llm_df] if df is not None]
financial_news_dfs = [df for df in [fn_ml_df, fn_dl_df, fn_llm_df] if df is not None]
all_dfs = book_review_dfs + financial_news_dfs

if not all_dfs:
    logging.error("No result files were loaded successfully. Exiting.")
    # exit() is the interactive-shell helper (and shuts down a Jupyter kernel);
    # raising SystemExit stops this cell and gives scripts a non-zero status.
    raise SystemExit(1)

# --- Combine and Save Book Review Comparison ---
if book_review_dfs:
    logging.info(f"Combining {len(book_review_dfs)} result files for Book Reviews...")
    combined_br_df = _build_comparison(book_review_dfs)
    _save_comparison(combined_br_df, OUTPUT_COMPARISON_BR, "Book Review")
else:
    logging.warning("No Book Review result files found to combine.")

# --- Combine and Save Financial News Comparison ---
if financial_news_dfs:
    logging.info(f"Combining {len(financial_news_dfs)} result files for Financial News...")
    combined_fn_df = _build_comparison(financial_news_dfs)
    _save_comparison(combined_fn_df, OUTPUT_COMPARISON_FN, "Financial News")
else:
    logging.warning("No Financial News result files found to combine.")

# --- Combine and Save Overall Comparison ---
# The overall table keeps script-specific extra columns and ranks models
# within each dataset rather than across datasets.
logging.info(f"Combining all {len(all_dfs)} loaded result files...")
combined_all_df = _build_comparison(all_dfs, keep_extra_columns=True, sort_within_dataset=True)
_save_comparison(combined_all_df, OUTPUT_COMPARISON_ALL, "Overall", failure_label="overall")

logging.info("Result aggregation finished.")

2025-05-10 12:22:54,877 - INFO - Starting result aggregation script...
2025-05-10 12:22:54,880 - INFO - Successfully loaded: ..\result\book_reviews\Book_Review_ml_tfidf_tuned_results.csv (5 rows)
2025-05-10 12:22:54,882 - INFO - Successfully loaded: ..\result\book_reviews\Book_Review_dl_pytorch_results.csv (10 rows)
2025-05-10 12:22:54,885 - INFO - Successfully loaded: ..\result\book_reviews\Book_Review_llm_transformers_results.csv (7 rows)
2025-05-10 12:22:54,895 - INFO - Successfully loaded: ..\result\financial_news\Financial_News_ml_tfidf_tuned_results.csv (5 rows)
2025-05-10 12:22:54,902 - INFO - Successfully loaded: ..\result\financial_news\Financial_News_dl_pytorch_results.csv (10 rows)
2025-05-10 12:22:54,904 - INFO - Successfully loaded: ..\result\financial_news\Financial_News_llm_transformers_results.csv (2 rows)
2025-05-10 12:22:54,905 - INFO - Combining 3 result files for Book Reviews...
2025-05-10 12:22:54,911 - INFO - Book Review comparison saved to: ..\result\comparison_b

In [8]:
# -*- coding: utf-8 -*-
"""
Aggregate Confusion Matrices from Multiple Experiments (Revised for Specific CSV Format)

This script reads individual confusion matrix CSV files where the first row contains
predicted labels and the first column (after header) contains actual labels.
It parses domain and model info from filenames, standardizes their format,
and combines them into a single comprehensive CSV.
"""

import pandas as pd
import numpy as np
import os
import re
import warnings

# Ignore warnings for cleaner output (optional).
# NOTE(review): no category or module filter is given, so this suppresses every
# warning issued from here on, including library deprecation notices. Consider
# narrowing it (e.g. to a specific category) if warnings need to stay visible.
warnings.filterwarnings('ignore')

# %% [markdown]
# # 1. Configuration

# %%
# --- Paths ---
# Root of the results tree; the aggregation loop walks it recursively looking
# for confusion-matrix CSV files.
BASE_RESULTS_DIR = "../result"
# Folder that receives the aggregated output. The aggregation loop skips any
# directory named "comparison", so the output is not re-read as an input.
OUTPUT_DIR = os.path.join(BASE_RESULTS_DIR, "comparison")
# Single long-format CSV holding every confusion matrix that was parsed.
AGGREGATED_CM_FILE = os.path.join(OUTPUT_DIR, "all_confusion_matrices_aggregated.csv") # New output

# --- Domain Mapping ---
# Filename prefix -> display name of the domain. Prefixes are compared
# case-insensitively and must be followed by an underscore in the filename;
# when several prefixes match, the longest one wins.
DOMAIN_PREFIX_MAP = {
    "book_review": "Book Reviews",
    "book_reviews": "Book Reviews",
    "financial_news": "Financial News",
    # Add more if you have other domain prefixes
}

# --- Expected Labels (Order is important for mapping if CSV uses numeric indices) ---
# This also defines the order of labels in the output, assuming a 3x3 matrix.
# Matrices whose shape matches this list are relabelled with these names.
EXPECTED_LABELS_IN_ORDER = ['negative', 'neutral', 'positive']

# Create output directory (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# %% [markdown]
# # 2. Helper Function to Parse Filename for Domain and Model

# %%
def parse_filename_for_domain_and_model(filename, domain_prefix_map=None):
    """Split a confusion-matrix filename into (domain, model configuration).

    Example: "Book_Review_BERT_Full_FT_confusion_matrix.csv" yields
    ("Book Reviews", "BERT_Full_FT").

    Args:
        filename: Base name of the CSV file (no directory part).
        domain_prefix_map: Optional mapping of filename prefix -> domain
            display name. Defaults to the module-level DOMAIN_PREFIX_MAP.
            Prefixes are matched case-insensitively and must be followed by
            an underscore; the longest matching prefix wins.

    Returns:
        (domain_name, model_config_name), or (None, None) when no known domain
        prefix matches or when the file is a summary results file rather than
        a single model's confusion matrix.
    """
    if domain_prefix_map is None:
        domain_prefix_map = DOMAIN_PREFIX_MAP

    filename_lower = filename.lower()
    extracted_domain = None
    remaining_filename_part = filename
    longest_match_len = 0

    for prefix_key, domain_name_val in domain_prefix_map.items():
        prefix_with_sep = prefix_key.lower() + "_"
        # Prefer the longest prefix so overlapping keys resolve predictably.
        if filename_lower.startswith(prefix_with_sep) and len(prefix_key) > longest_match_len:
            extracted_domain = domain_name_val
            remaining_filename_part = filename[len(prefix_key) + 1:]
            longest_match_len = len(prefix_key)

    if not extracted_domain:
        # No inference is attempted: only prefixes listed in the map are accepted.
        print(f"    Warning: Could not determine domain from prefix for {filename}. Skipping.")
        return None, None

    suffix_to_remove = "_confusion_matrix.csv"
    remaining_lower = remaining_filename_part.lower()
    if remaining_lower.endswith(suffix_to_remove):
        model_config_name = remaining_filename_part[:-len(suffix_to_remove)]
    else:
        summary_markers = ("dl_pytorch_results", "ml_tfidf_tuned_results", "llm_transformers_results")
        if any(marker in filename_lower for marker in summary_markers):
            return None, None  # Not a specific model's CM
        # Drop only the trailing extension; ".csv" inside the name is kept.
        if remaining_lower.endswith(".csv"):
            model_config_name = remaining_filename_part[:-len(".csv")]
        else:
            model_config_name = remaining_filename_part

    # Restore readable separators that were flattened to underscores in the
    # filename. "_+_" is handled first so no stray underscore is left behind.
    model_config_name = model_config_name.replace("_+_", " + ").replace("_+", " + ")
    model_config_name = model_config_name.replace("_(", " (").replace(")_", ") ")
    return extracted_domain, model_config_name.strip()

# %% [markdown]
# # 3. Main Aggregation Loop

# %%
import traceback


def _read_confusion_matrix(file_path):
    """Load one confusion-matrix CSV.

    The header row is taken as the predicted labels. The actual labels come
    from the first column when it is named like a label column
    ('True Label', 'Actual', or pandas' 'Unnamed: 0'); otherwise the first
    column is read as the index and the matrix must be square.

    Returns:
        (actual_labels, predicted_labels, counts) where the labels are lists of
        str and counts is a 2-D array, or None when the layout cannot be
        interpreted or the cells are not numeric.
    """
    cm_df_raw = pd.read_csv(file_path)

    if cm_df_raw.columns[0].lower() in ['true label', 'actual', 'unnamed: 0']:
        actual_labels = cm_df_raw.iloc[:, 0].astype(str).tolist()
        counts_df = cm_df_raw.iloc[:, 1:]
    else:
        counts_df = pd.read_csv(file_path, index_col=0)
        if counts_df.shape[0] != counts_df.shape[1]:
            return None
        actual_labels = counts_df.index.astype(str).tolist()

    # Column dtypes are checked instead of element-wise np.isreal, which does
    # not reliably reject string/object cells.
    if not all(pd.api.types.is_numeric_dtype(dtype) for dtype in counts_df.dtypes):
        return None

    predicted_labels = counts_df.columns.astype(str).tolist()
    return actual_labels, predicted_labels, counts_df.values


def _align_labels(file_labels, expected_labels):
    """Choose the label names for one axis of a correctly sized matrix.

    If the labels found in the file are (case-insensitively) exactly the
    expected labels in some order, the file's own order is honoured so a
    permuted matrix is not mislabelled. Otherwise (e.g. numeric class indices)
    the axis is assumed to follow ``expected_labels`` positionally.
    """
    canonical = {label.lower(): label for label in expected_labels}
    normalized = [str(label).strip().lower() for label in file_labels]
    if sorted(normalized) == sorted(canonical):
        return [canonical[name] for name in normalized]
    return list(expected_labels)


all_matrices_data = []

print("Starting confusion matrix aggregation...")

for root_dir, dir_names, files in os.walk(BASE_RESULTS_DIR):
    # Sort in place so traversal (and therefore output row order) is stable.
    dir_names.sort()

    # Never re-read previously aggregated output.
    if os.path.basename(root_dir).lower() == "comparison":
        continue

    for filename in sorted(files):
        if not (filename.endswith(".csv") and "confusion_matrix" in filename.lower()):
            continue

        file_path = os.path.join(root_dir, filename)
        domain_name, model_config_name = parse_filename_for_domain_and_model(filename)

        if domain_name is None or model_config_name is None:
            # Unparsable name or a summary file: nothing to aggregate.
            continue

        print(f"  Reading: {filename} (Domain: {domain_name}, Model: {model_config_name})")

        try:
            parsed = _read_confusion_matrix(file_path)
            if parsed is None:
                print(f"    Warning: Could not reliably parse label structure for {filename}. Skipping.")
                continue
            actual_labels_from_file, predicted_labels_from_file, counts_matrix = parsed

            expected_size = len(EXPECTED_LABELS_IN_ORDER)
            if counts_matrix.shape != (expected_size, expected_size):
                print(f"    Warning: Matrix dimensions ({counts_matrix.shape}) for {filename} do not match expected "
                      f"({len(EXPECTED_LABELS_IN_ORDER)}x{len(EXPECTED_LABELS_IN_ORDER)}). "
                      f"Attempting to map based on found labels, but be cautious.")
                # Unexpected size: report the matrix with the labels it came with.
                current_actual_labels = actual_labels_from_file
                current_predicted_labels = predicted_labels_from_file
            else:
                current_actual_labels = _align_labels(actual_labels_from_file, EXPECTED_LABELS_IN_ORDER)
                current_predicted_labels = _align_labels(predicted_labels_from_file, EXPECTED_LABELS_IN_ORDER)

            # Slicing guards against label lists longer than the matrix axes.
            n_rows, n_cols = counts_matrix.shape
            for i, actual_label in enumerate(current_actual_labels[:n_rows]):
                for j, predicted_label in enumerate(current_predicted_labels[:n_cols]):
                    all_matrices_data.append({
                        "Domain": domain_name,
                        "Model_Configuration": model_config_name,
                        "Actual_Label": actual_label,
                        "Predicted_Label": predicted_label,
                        "Count": counts_matrix[i, j]
                    })
        except Exception as e:
            # One bad file must not abort the whole aggregation.
            print(f"    Error processing file {filename}: {e}")
            traceback.print_exc()


# %% [markdown]
# # 4. Create and Save Aggregated DataFrame

# %%
if not all_matrices_data:
    print("\nNo confusion matrices were processed. Output file not created.")
else:
    aggregated_df = pd.DataFrame(all_matrices_data)

    # Estimate how many matrices were collected: total rows divided by the
    # cell count of the most common matrix size (classes squared).
    num_aggregated_matrices = 0
    if not aggregated_df.empty:
        num_actual_labels = (
            aggregated_df
            .groupby(['Domain', 'Model_Configuration'])['Actual_Label']
            .nunique()
        )
        if num_actual_labels.empty:
            common_num_classes = len(EXPECTED_LABELS_IN_ORDER)
        else:
            common_num_classes = num_actual_labels.mode()[0]
        num_aggregated_matrices = len(aggregated_df) // (common_num_classes**2)

    print(f"\nSuccessfully aggregated approximately {num_aggregated_matrices} confusion matrices.")
    print(f"Total rows in aggregated file: {len(aggregated_df)}")

    # Show a small random sample (at most 10 rows) as a sanity check.
    print("\nSample of aggregated data:")
    sample_size = min(10, len(aggregated_df))
    print(aggregated_df.sample(sample_size).to_string())

    # A failed write is reported but does not raise.
    try:
        aggregated_df.to_csv(AGGREGATED_CM_FILE, index=False)
        print(f"\nAggregated confusion matrices saved to: {AGGREGATED_CM_FILE}")
    except Exception as e:
        print(f"\nError saving aggregated file: {e}")

print("\nAggregation script finished.")

Starting confusion matrix aggregation...
  Reading: Book_Review_BERT_Feature_Extractor__LR_confusion_matrix.csv (Domain: Book Reviews, Model: BERT_Feature_Extractor__LR)
  Reading: Book_Review_BERT_Full_FT_confusion_matrix.csv (Domain: Book Reviews, Model: BERT_Full_FT)
  Reading: Book_Review_BERT_LoRA_FT_confusion_matrix.csv (Domain: Book Reviews, Model: BERT_LoRA_FT)
  Reading: Book_Review_BiLSTM_(GloVe_Emb)_confusion_matrix.csv (Domain: Book Reviews, Model: BiLSTM (GloVe_Emb))
  Reading: Book_Review_BiLSTM_(Learned_Emb)_confusion_matrix.csv (Domain: Book Reviews, Model: BiLSTM (Learned_Emb))
  Reading: Book_Review_CNN-LSTM_(GloVe_Emb)_confusion_matrix.csv (Domain: Book Reviews, Model: CNN-LSTM (GloVe_Emb))
  Reading: Book_Review_CNN_(GloVe_Emb)_confusion_matrix.csv (Domain: Book Reviews, Model: CNN (GloVe_Emb))
  Reading: Book_Review_CNN_(Learned_Emb)_confusion_matrix.csv (Domain: Book Reviews, Model: CNN (Learned_Emb))
  Reading: Book_Review_DistilBERT_Feature_Extractor__LR_confusi

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# --- Configuration ---
# Combined results table read by main(); presumably the file written by the
# result-aggregation cell (comparison_all_datasets.csv) -- confirm if paths change.
CSV_FILE_PATH = "..//result//comparison_all_datasets.csv"
# Folder that receives the generated PNG plots.
# NOTE(review): OUTPUT_DIR is also assigned by other cells in this notebook, so
# its value depends on which cell ran last.
OUTPUT_DIR = "..//result//result_visualizations"
# Metrics drawn in the "Performance Metrics" plot; the first one sets bar order.
PRIMARY_METRICS = ["F1 (Macro)", "Accuracy", "Precision (Macro)", "Recall (Macro)"]
# Metrics drawn in the "Time Metrics" plot.
TIME_METRICS = ["Train Time (s)", "Eval Time (s)"]
# Column holding the model name (x axis of every plot).
MODEL_COLUMN = "Model"
# Column used to split the table into one pair of plots per dataset.
DATASET_COLUMN = "Dataset"

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

def plot_metrics_for_dataset(df_dataset, dataset_name, metrics_to_plot, y_label, title_suffix, output_dir):
    """Draw and save a grouped bar chart of several metrics for one dataset.

    One group of bars is drawn per model (MODEL_COLUMN), one bar per metric.
    Models are ordered by the first available metric, best first. Requested
    metrics that are not columns of ``df_dataset`` are reported and left out;
    if none is available nothing is plotted.

    Args:
        df_dataset: Rows of the results table belonging to one dataset.
        dataset_name: Dataset name, used in the title and the file name.
        metrics_to_plot: Column names to plot.
        y_label: Label of the y axis.
        title_suffix: Text appended to the title and the file name.
        output_dir: Existing directory the PNG is written to.

    Returns:
        None. The plot is written to
        ``<output_dir>/<dataset_name>_<title_suffix>.png`` (spaces replaced by
        underscores).
    """
    if df_dataset.empty:
        print(f"No data to plot for {dataset_name} - {title_suffix}.")
        return

    # Actually drop missing metrics: sorting or melting on an absent column
    # would raise KeyError.
    available_metrics = []
    for metric in metrics_to_plot:
        if metric in df_dataset.columns:
            available_metrics.append(metric)
        else:
            print(f"Warning: Metric '{metric}' not found in data for dataset '{dataset_name}'. Skipping this metric.")

    if not available_metrics:
        print(f"No requested metrics available for {dataset_name} - {title_suffix}. Nothing plotted.")
        return

    num_metrics = len(available_metrics)
    num_models = len(df_dataset[MODEL_COLUMN].unique())

    # Order models by the first available metric for a consistent layout.
    df_dataset_sorted = df_dataset.sort_values(by=available_metrics[0], ascending=False)

    # Long format (one row per model/metric pair) so seaborn can group by hue.
    df_melted = df_dataset_sorted.melt(
        id_vars=[MODEL_COLUMN],
        value_vars=available_metrics,
        var_name='Metric',
        value_name='Score',
    )

    # Widen the figure with the number of models, heighten it with the metrics.
    fig, ax = plt.subplots(figsize=(max(12, num_models * 0.8), 6 + num_metrics * 1))
    try:
        sns.barplot(x=MODEL_COLUMN, y='Score', hue='Metric', data=df_melted, palette='viridis', ax=ax)

        ax.set_title(f'{dataset_name} - {title_suffix}', fontsize=16)
        ax.set_xlabel(MODEL_COLUMN, fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
        ax.tick_params(axis='y', labelsize=10)
        ax.legend(title='Metric')
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()

        plot_filename = f"{dataset_name.replace(' ', '_')}_{title_suffix.replace(' ', '_')}.png"
        fig.savefig(os.path.join(output_dir, plot_filename))
        print(f"Saved plot: {os.path.join(output_dir, plot_filename)}")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

def main():
    """Load the combined results table and save two bar charts per dataset.

    Reads CSV_FILE_PATH, coerces the metric columns to numbers, then for every
    distinct value of DATASET_COLUMN plots the performance metrics and the
    time metrics via plot_metrics_for_dataset. Problems (unreadable file,
    missing dataset/model column) are printed and end the run early.
    """
    # --- Load Data ---
    try:
        df_all_results = pd.read_csv(CSV_FILE_PATH)
    except FileNotFoundError:
        print(f"Error: {CSV_FILE_PATH} not found. Please ensure the file exists at the correct path.")
        return
    except Exception as e:
        print(f"Error loading {CSV_FILE_PATH}: {e}")
        return
    else:
        print(f"Successfully loaded {CSV_FILE_PATH} with {len(df_all_results)} rows.")

    # Force metric columns to numeric; unparsable cells become NaN.
    for col in PRIMARY_METRICS + TIME_METRICS:
        if col not in df_all_results.columns:
            print(f"Warning: Column '{col}' not found in the CSV. It will be skipped.")
            continue
        df_all_results[col] = pd.to_numeric(df_all_results[col], errors='coerce')

    # Both key columns are mandatory for splitting and labelling the plots.
    if DATASET_COLUMN not in df_all_results.columns:
        print(f"Error: Dataset column '{DATASET_COLUMN}' not found in the CSV. Cannot proceed.")
        return
    if MODEL_COLUMN not in df_all_results.columns:
        print(f"Error: Model column '{MODEL_COLUMN}' not found in the CSV. Cannot proceed.")
        return

    # (metrics, y-axis label, title suffix) for each chart drawn per dataset.
    plot_plan = (
        (PRIMARY_METRICS, 'Score', 'Performance Metrics'),
        (TIME_METRICS, 'Time (seconds)', 'Time Metrics'),
    )

    for dataset_name in df_all_results[DATASET_COLUMN].unique():
        print(f"\nProcessing dataset: {dataset_name}")
        is_current = df_all_results[DATASET_COLUMN] == dataset_name
        df_dataset = df_all_results[is_current].copy()

        for metrics, y_label, title_suffix in plot_plan:
            plot_metrics_for_dataset(df_dataset, dataset_name, metrics, y_label, title_suffix, OUTPUT_DIR)

    print(f"\nAll visualizations saved to: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()

Successfully loaded ..//result//comparison_all_datasets.csv with 45 rows.

Processing dataset: Book Review
Saved plot: ..//result//result_visualizations\Book_Review_Performance_Metrics.png
Saved plot: ..//result//result_visualizations\Book_Review_Time_Metrics.png

Processing dataset: Financial News
Saved plot: ..//result//result_visualizations\Financial_News_Performance_Metrics.png
Saved plot: ..//result//result_visualizations\Financial_News_Time_Metrics.png

All visualizations saved to: ..//result//result_visualizations


In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

# --- Configuration ---
# Long-format confusion-matrix table read by main(); presumably the file written
# by the confusion-matrix aggregation cell -- confirm if paths change.
FILE_PATH = "../result/comparison/all_confusion_matrices_aggregated.csv"
# Folder that receives the summary CSV and the per-domain heatmaps.
# NOTE(review): OUTPUT_DIR is also assigned by other cells in this notebook, so
# its value depends on which cell ran last.
OUTPUT_DIR = "../result/misclassification_analysis"
# Class labels analysed, in heatmap row/column order. Labels in the data are
# compared against these case-insensitively.
CLASSES = ['negative', 'neutral', 'positive']  # Expected class labels

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

def analyze_misclassifications(df_subset, domain, model, classes=None):
    """
    Analyze misclassification patterns in one long-format confusion matrix.

    Args:
        df_subset: DataFrame with 'Actual_Label', 'Predicted_Label' and 'Count'
            columns holding the confusion matrix of one domain/model pair.
            Labels are compared case-insensitively and may be non-string
            (e.g. numeric class indices).
        domain: The domain name (copied into the result).
        model: The model configuration name (copied into the result).
        classes: Optional iterable of class labels to analyze. Defaults to the
            module-level CLASSES.

    Returns:
        A dictionary with the keys:
            "Domain", "Model": the values passed in.
            "ClassCounts": actual class -> total count of that class.
            "CorrectClassificationRates": actual class -> percentage predicted
                correctly.
            "MisclassificationRates": "<actual>_as_<predicted>" -> percentage
                of the actual class predicted as the other class.
        Classes with no rows in ``df_subset`` are omitted from all three
        mappings. Rates are 0 when a class has a total count of zero.
    """
    if classes is None:
        classes = CLASSES

    results = {
        "Domain": domain,
        "Model": model,
        "ClassCounts": {},
        "MisclassificationRates": {},
        "CorrectClassificationRates": {}
    }

    # Normalize once; astype(str) keeps numeric label columns from breaking .str.
    actual_labels = df_subset['Actual_Label'].astype(str).str.lower()
    predicted_labels = df_subset['Predicted_Label'].astype(str).str.lower()
    counts = df_subset['Count']

    for actual_class in classes:
        actual_key = actual_class.lower()
        in_actual_class = actual_labels == actual_key
        if not in_actual_class.any():
            continue

        total_count = counts[in_actual_class].sum()
        results["ClassCounts"][actual_class] = total_count

        for predicted_class in classes:
            predicted_key = predicted_class.lower()
            matched_count = counts[in_actual_class & (predicted_labels == predicted_key)].sum()
            # Guard against an all-zero row of the matrix.
            rate = (matched_count / total_count) * 100 if total_count > 0 else 0

            if predicted_key == actual_key:
                results["CorrectClassificationRates"][actual_class] = rate
            else:
                results["MisclassificationRates"][f"{actual_class}_as_{predicted_class}"] = rate

    return results

def _flatten_misclassification_result(result):
    """Turn one analyze_misclassifications() result into a flat row dict."""
    row = {
        "Domain": result["Domain"],
        "Model": result["Model"]
    }
    for cls, count in result["ClassCounts"].items():
        row[f"{cls}_count"] = count
    for cls, rate in result["CorrectClassificationRates"].items():
        row[f"{cls}_correct_rate"] = rate
    for key, rate in result["MisclassificationRates"].items():
        row[key] = rate
    return row


def _mean_rate(domain_df, column):
    """Mean of ``column`` in ``domain_df``; NaN when the column does not exist."""
    if column not in domain_df.columns:
        return float("nan")
    return domain_df[column].mean()


def _save_domain_heatmap(domain, domain_df, misclass_columns):
    """Save a heatmap of the domain's average classification rates; return its path."""
    confusion_data = np.zeros((len(CLASSES), len(CLASSES)))
    class_indices = {cls: i for i, cls in enumerate(CLASSES)}

    # Diagonal: average correct-classification rate per class.
    for cls, i in class_indices.items():
        confusion_data[i, i] = _mean_rate(domain_df, f"{cls}_correct_rate")

    # Off-diagonal: average misclassification rate per (actual, predicted) pair.
    for col in misclass_columns:
        actual_class, predicted_class = col.split('_as_')
        if actual_class in class_indices and predicted_class in class_indices:
            confusion_data[class_indices[actual_class], class_indices[predicted_class]] = domain_df[col].mean()

    heatmap_file = os.path.join(OUTPUT_DIR, f"{domain.replace(' ', '_')}_avg_classification_heatmap.png")
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(confusion_data, annot=True, fmt=".2f", cmap="YlGnBu",
                    xticklabels=CLASSES, yticklabels=CLASSES, ax=ax)
        ax.set_title(f"Average Classification Rates (%) for {domain}")
        ax.set_xlabel("Predicted Class")
        ax.set_ylabel("Actual Class")
        fig.tight_layout()
        fig.savefig(heatmap_file)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)
    return heatmap_file


def main():
    """Run the misclassification analysis on the aggregated confusion matrices.

    Reads FILE_PATH, computes per-(domain, model) classification and
    misclassification rates with analyze_misclassifications, writes them to
    "misclassification_analysis_summary.csv" in OUTPUT_DIR, then prints
    domain-level averages, saves one heatmap per domain and reports how the
    'neutral' class exchanges samples with the other classes.

    Problems with the input (missing file, missing columns, no data) are
    printed and end the run early instead of raising.
    """
    print("Starting misclassification analysis...")

    # Load the aggregated confusion matrix data
    try:
        cm_df = pd.read_csv(FILE_PATH)
        print(f"Successfully loaded {FILE_PATH} with {len(cm_df)} rows.")
    except FileNotFoundError:
        print(f"Error: {FILE_PATH} not found. Please ensure the file exists at the correct path.")
        return
    except Exception as e:
        print(f"Error loading {FILE_PATH}: {e}")
        return

    required_columns = ['Domain', 'Model_Configuration', 'Actual_Label', 'Predicted_Label', 'Count']
    missing_columns = [col for col in required_columns if col not in cm_df.columns]
    if missing_columns:
        print(f"Error: {FILE_PATH} is missing required column(s): {missing_columns}")
        return

    # Normalize class labels (lowercase); astype(str) tolerates numeric labels.
    cm_df['Actual_Label'] = cm_df['Actual_Label'].astype(str).str.lower()
    cm_df['Predicted_Label'] = cm_df['Predicted_Label'].astype(str).str.lower()

    # Analyze each domain-model combination, in order of first appearance.
    domain_model_pairs = cm_df[['Domain', 'Model_Configuration']].drop_duplicates()
    all_results = []

    for _, row in domain_model_pairs.iterrows():
        domain = row['Domain']
        model = row['Model_Configuration']

        print(f"\nAnalyzing: {domain} - {model}")

        subset = cm_df[
            (cm_df['Domain'] == domain) &
            (cm_df['Model_Configuration'] == model)
        ]

        if subset.empty:
            print(f"  No data found for {domain} - {model}")
            continue

        results = analyze_misclassifications(subset, domain, model)
        all_results.append(results)

        # Print summary statistics for this model
        print(f"  Class counts: {results['ClassCounts']}")

        print(f"  Correct classification rates:")
        for cls, rate in results['CorrectClassificationRates'].items():
            print(f"    {rate:.2f}% of actual {cls} reviews were correctly classified.")

        print(f"  Misclassification rates:")
        for key, rate in results['MisclassificationRates'].items():
            actual_class, predicted_class = key.split('_as_')
            print(f"    {rate:.2f}% of actual {actual_class} reviews were predicted as {predicted_class}.")

    # Without any result there is nothing to save or average.
    if not all_results:
        print("\nNo domain/model combinations could be analyzed. Nothing saved.")
        return

    results_df = pd.DataFrame([_flatten_misclassification_result(result) for result in all_results])

    # Save the consolidated results
    output_file = os.path.join(OUTPUT_DIR, "misclassification_analysis_summary.csv")
    results_df.to_csv(output_file, index=False)
    print(f"\nSaved misclassification analysis to: {output_file}")

    # Calculate domain-level average misclassification rates
    print("\n--- Domain-Level Misclassification Analysis ---")
    for domain in results_df['Domain'].unique():
        domain_df = results_df[results_df['Domain'] == domain]
        print(f"\n{domain} (averaged across {len(domain_df)} models):")

        misclass_columns = [col for col in domain_df.columns if '_as_' in col]
        for col in misclass_columns:
            avg_rate = domain_df[col].mean()
            actual_class, predicted_class = col.split('_as_')
            print(f"  {avg_rate:.2f}% of actual {actual_class} reviews were predicted as {predicted_class}.")

        heatmap_file = _save_domain_heatmap(domain, domain_df, misclass_columns)
        print(f"  Saved heatmap visualization to: {heatmap_file}")

        # Find the most common misclassification per actual class. Averages
        # that are NaN (class absent in this domain) are ignored; on ties the
        # first column wins.
        highest_misclass = {}
        for cls in CLASSES:
            candidates = [
                (col, domain_df[col].mean())
                for col in misclass_columns
                if col.startswith(f"{cls}_as_")
            ]
            candidates = [(col, rate) for col, rate in candidates if not np.isnan(rate)]
            if candidates:
                highest_misclass[cls] = max(candidates, key=lambda pair: pair[1])

        print("\n  Most common misclassifications:")
        for actual_class, (misclass_col, rate) in highest_misclass.items():
            _, predicted_class = misclass_col.split('_as_')
            print(f"    {actual_class} → {predicted_class}: {rate:.2f}%")

    # Check if neutral is getting "eaten" by other classes
    print("\n--- 'Neutral' Class Analysis ---")
    for domain in results_df['Domain'].unique():
        domain_df = results_df[results_df['Domain'] == domain]
        neutral_as_neg = _mean_rate(domain_df, 'neutral_as_negative')
        neutral_as_pos = _mean_rate(domain_df, 'neutral_as_positive')
        neg_as_neutral = _mean_rate(domain_df, 'negative_as_neutral')
        pos_as_neutral = _mean_rate(domain_df, 'positive_as_neutral')

        print(f"\n{domain}:")
        if np.isnan([neutral_as_neg, neutral_as_pos, neg_as_neutral, pos_as_neutral]).any():
            print("  Not enough data for the neutral-class analysis (a required rate is missing).")
            continue

        print(f"  Neutral class is misclassified as:")
        print(f"    Negative: {neutral_as_neg:.2f}%")
        print(f"    Positive: {neutral_as_pos:.2f}%")
        print(f"  Other classes misclassified as Neutral:")
        print(f"    Negative → Neutral: {neg_as_neutral:.2f}%")
        print(f"    Positive → Neutral: {pos_as_neutral:.2f}%")

        # NOTE(review): outflow is a share of neutral samples while inflow sums
        # shares of negative and positive samples, so the "net" figure compares
        # percentages with different bases; treat it as a rough indicator.
        neutral_outflow = neutral_as_neg + neutral_as_pos
        neutral_inflow = neg_as_neutral + pos_as_neutral

        if neutral_outflow > neutral_inflow:
            print(f"  Neutral class is being 'eaten' by other classes (Net loss: {neutral_outflow - neutral_inflow:.2f}%)")
        else:
            print(f"  Neutral class is 'eating' other classes (Net gain: {neutral_inflow - neutral_outflow:.2f}%)")

if __name__ == "__main__":
    main()

Starting misclassification analysis...
Successfully loaded ../result/comparison/all_confusion_matrices_aggregated.csv with 405 rows.

Analyzing: Book Reviews - BERT_Feature_Extractor__LR
  Class counts: {'negative': 5292, 'neutral': 3814, 'positive': 35894}
  Correct classification rates:
    71.26% of actual negative reviews were correctly classified.
    54.01% of actual neutral reviews were correctly classified.
    74.32% of actual positive reviews were correctly classified.
  Misclassification rates:
    21.71% of actual negative reviews were predicted as neutral.
    7.03% of actual negative reviews were predicted as positive.
    24.15% of actual neutral reviews were predicted as negative.
    21.84% of actual neutral reviews were predicted as positive.
    7.99% of actual positive reviews were predicted as negative.
    17.69% of actual positive reviews were predicted as neutral.

Analyzing: Book Reviews - BERT_Full_FT
  Class counts: {'negative': 5292, 'neutral': 3814, 'positi