In [None]:
import pandas as pd
import os
import logging

# Configure the root logger once for the whole notebook: show INFO and above,
# each record prefixed with a timestamp and its level name.
# Note: basicConfig() leaves the configuration untouched when the root logger
# already has handlers attached.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
# Root folder holding every result file ("../result" relative to the working dir).
BASE_RESULT_DIR = os.path.join("..", "result")

# ---- Book reviews: input folder and per-approach result files ----
# (Change the filenames below if your generated files are named differently.)
BOOK_REVIEW_RESULT_DIR = os.path.join(BASE_RESULT_DIR, "book_reviews")
ML_RESULTS_FN_BR = "book_reviews_ml_tfidf_tuned_results.csv"
DL_RESULTS_FN_BR = "book_reviews_dl_pytorch_results.csv"
LLM_RESULTS_FN_BR = "book_reviews_llm_transformers_results.csv"

# ---- Financial news: input folder and per-approach result files ----
FINANCIAL_NEWS_RESULT_DIR = os.path.join(BASE_RESULT_DIR, "financial_news")
ML_RESULTS_FN_FN = "financial_news_ml_tfidf_tuned_results.csv"
DL_RESULTS_FN_FN = "financial_news_dl_pytorch_results.csv"
LLM_RESULTS_FN_FN = "financial_news_llm_transformers_results.csv"

# ---- Outputs: one comparison CSV per dataset plus one across all datasets ----
OUTPUT_COMPARISON_BR = os.path.join(BASE_RESULT_DIR, "comparison_book_reviews.csv")
OUTPUT_COMPARISON_FN = os.path.join(BASE_RESULT_DIR, "comparison_financial_news.csv")
OUTPUT_COMPARISON_ALL = os.path.join(BASE_RESULT_DIR, "comparison_all_datasets.csv")

# ---- Columns kept in the comparisons, in display order ----
# Adjust if the generated files use different headers. Script-specific columns
# that are deliberately not listed here:
#   "Best Params"            (written by the ML script)
#   "Train Time (Epoch, s)"  (written by the DL script in place of "Train Time (s)")
EXPECTED_COLUMNS = [
    # identification
    "Dataset",
    "Model",
    # headline metric
    "Accuracy",
    # macro-averaged metrics
    "F1 (Macro)",
    "Precision (Macro)",
    "Recall (Macro)",
    # weighted-averaged metrics
    "F1 (Weighted)",
    "Precision (Weighted)",
    "Recall (Weighted)",
    # timings
    "Train Time (s)",
    "Eval Time (s)",
]

# Metric used to rank the models in each comparison.
SORT_BY_COLUMN = "F1 (Macro)"

In [None]:
# --- Helper Function to Load Results ---
def load_results(file_path):
    """Read one CSV result file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the path does not exist or anything
        goes wrong while reading it (a warning or error is logged respectively).
        If the file has a "Train Time (Epoch, s)" column but no
        "Train Time (s)" column, the former is renamed to the latter.
    """
    epoch_col, plain_col = "Train Time (Epoch, s)", "Train Time (s)"

    if os.path.exists(file_path):
        try:
            results = pd.read_csv(file_path)
            logging.info(f"Successfully loaded: {file_path} ({len(results)} rows)")

            # Normalise the training-time header, but never clobber an
            # existing "Train Time (s)" column.
            only_epoch_header = (
                epoch_col in results.columns and plain_col not in results.columns
            )
            if only_epoch_header:
                results = results.rename(columns={epoch_col: plain_col})
                logging.debug(f"Renamed 'Train Time (Epoch, s)' to 'Train Time (s)' in {os.path.basename(file_path)}")
        except Exception as err:
            # Best-effort loader: a broken file is reported and then skipped.
            logging.error(f"Error loading {file_path}: {err}")
            results = None
        return results

    logging.warning(f"Result file not found, skipping: {file_path}")
    return None

In [None]:
def _combine_results(frames, keep_extra_columns=False):
    """Concatenate result frames and arrange their columns.

    Args:
        frames: Non-empty list of DataFrames to stack row-wise.
        keep_extra_columns: When False, columns not listed in EXPECTED_COLUMNS
            are dropped. When True they are kept, appended after the expected
            ones in the order they appear in the concatenated frame.

    Returns:
        A new DataFrame whose leading columns follow EXPECTED_COLUMNS order
        (expected columns missing from the data are simply skipped).
    """
    combined = pd.concat(frames, ignore_index=True)
    ordered = [col for col in EXPECTED_COLUMNS if col in combined.columns]
    if keep_extra_columns:
        extras = [col for col in combined.columns if col not in ordered]
        ordered = ordered + extras
    return combined[ordered]


def _write_dataset_comparison(frames, output_path, singular, plural):
    """Combine one dataset's result frames, rank them and write the CSV.

    Args:
        frames: Already filtered list of loaded DataFrames (may be empty).
        output_path: Destination CSV path.
        singular: Dataset label used in the "saved"/"failed"/"no files" messages.
        plural: Dataset label used in the "Combining ..." message.

    Returns:
        The combined DataFrame, or None when there was nothing to combine.
        A failed write is logged, not raised.
    """
    if not frames:
        logging.warning(f"No {singular} result files found to combine.")
        return None

    logging.info(f"Combining {len(frames)} result files for {plural}...")
    combined = _combine_results(frames)

    # Best model first; skipped when the ranking metric is not in the data.
    if SORT_BY_COLUMN in combined.columns:
        combined = combined.sort_values(by=SORT_BY_COLUMN, ascending=False)

    try:
        combined.to_csv(output_path, index=False)
        logging.info(f"{singular} comparison saved to: {output_path}")
    except Exception as e:
        logging.error(f"Failed to save {singular} comparison: {e}")
    return combined


logging.info("Starting result aggregation script...")

# --- Load Book Review Results ---
br_ml_df = load_results(os.path.join(BOOK_REVIEW_RESULT_DIR, ML_RESULTS_FN_BR))
br_dl_df = load_results(os.path.join(BOOK_REVIEW_RESULT_DIR, DL_RESULTS_FN_BR))
br_llm_df = load_results(os.path.join(BOOK_REVIEW_RESULT_DIR, LLM_RESULTS_FN_BR))

# --- Load Financial News Results ---
fn_ml_df = load_results(os.path.join(FINANCIAL_NEWS_RESULT_DIR, ML_RESULTS_FN_FN))
fn_dl_df = load_results(os.path.join(FINANCIAL_NEWS_RESULT_DIR, DL_RESULTS_FN_FN))
fn_llm_df = load_results(os.path.join(FINANCIAL_NEWS_RESULT_DIR, LLM_RESULTS_FN_FN))

# load_results() returns None for missing/unreadable files; keep only real frames.
book_review_dfs = [df for df in [br_ml_df, br_dl_df, br_llm_df] if df is not None]
financial_news_dfs = [df for df in [fn_ml_df, fn_dl_df, fn_llm_df] if df is not None]
all_dfs = book_review_dfs + financial_news_dfs

if not all_dfs:
    logging.error("No result files were loaded successfully. Exiting.")
    # Raise SystemExit directly instead of calling exit(): exit() is the
    # interactive helper installed by the site module, and under a Jupyter
    # kernel it requests a kernel shutdown rather than just stopping this cell.
    # A non-zero status also marks the run as failed when executed as a script.
    raise SystemExit(1)

# --- Combine and Save Per-Dataset Comparisons ---
combined_br_df = _write_dataset_comparison(
    book_review_dfs, OUTPUT_COMPARISON_BR, "Book Review", "Book Reviews"
)
combined_fn_df = _write_dataset_comparison(
    financial_news_dfs, OUTPUT_COMPARISON_FN, "Financial News", "Financial News"
)

# --- Combine and Save Overall Comparison ---
logging.info(f"Combining all {len(all_dfs)} loaded result files...")
# Unlike the per-dataset files, the overall file keeps extra columns
# (e.g. "Best Params") after the expected ones.
combined_all_df = _combine_results(all_dfs, keep_extra_columns=True)

# Group rows by dataset, then rank by the chosen metric within each dataset.
# Without a "Dataset" column the concatenation order is left as is.
if "Dataset" in combined_all_df.columns:
    if SORT_BY_COLUMN in combined_all_df.columns:
        combined_all_df = combined_all_df.sort_values(
            by=["Dataset", SORT_BY_COLUMN], ascending=[True, False]
        )
    else:
        combined_all_df = combined_all_df.sort_values(by=["Dataset"])

try:
    combined_all_df.to_csv(OUTPUT_COMPARISON_ALL, index=False)
    logging.info(f"Overall comparison saved to: {OUTPUT_COMPARISON_ALL}")
except Exception as e:
    logging.error(f"Failed to save overall comparison: {e}")

logging.info("Result aggregation finished.")