# Machine Learning Model Training and Evaluation

This notebook demonstrates how to train and evaluate machine learning models using embeddings extracted from Vision Transformer models. It utilizes the functions from the `src/ml_classifiers` package for model training, evaluation, and results export.

## 1. Environment Setup and Dependencies

Configure paths and import required libraries for ML model training.

In [None]:
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path

# Make project modules importable: the parent of the current working
# directory is placed at the front of sys.path (once only).
project_root = str(Path.cwd().parent)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Import ML classifier functions
from src.ml_classifiers import (
    setup_training_data,
    train_model,
    evaluate_model,
    load_embeddings,
    get_available_combinations
)

# Configure paths
# EMBEDDINGS_DIR is handed to load_embeddings(); RESULTS_DIR is handed to
# train_model()/evaluate_model() as results_dir and is also where the analysis
# cell looks for <model>/<dataset>/test_results.tex.
# NOTE(review): both values are relative and are not joined with project_root
# here, so they presumably resolve against the kernel's working directory --
# confirm this matches where the data actually lives.
EMBEDDINGS_DIR = "data/embeddings"  # Directory containing extracted embeddings
RESULTS_DIR = "results"       # Directory to save training results

# Training configuration
# USE_DATA_BALANCING is forwarded to train_model() as balance_data.
USE_DATA_BALANCING = True    # Whether to apply SMOTE for imbalanced data
# NOTE(review): SAVE_CONFUSION_MATRIX is not referenced by any other cell
# visible in this notebook -- confirm whether it is still needed.
SAVE_CONFUSION_MATRIX = True # Save confusion matrix plots

## 2. Load Embeddings Data

Load and organize embeddings from parquet files.

In [None]:
# Read the embeddings stored under EMBEDDINGS_DIR
print("Loading embeddings from:", EMBEDDINGS_DIR)
dataframes = load_embeddings(EMBEDDINGS_DIR)

# Model/dataset pairs that the loaded data provides
combinations = get_available_combinations(dataframes)

print("\nAvailable combinations:")
for pair in combinations:
    model_name, dataset_name = pair
    print(f"- {model_name} / {dataset_name}")

# Preview the shape of (at most) the first three loaded DataFrames
print("\nExample DataFrame shapes:")
for position, (key, df) in enumerate(dataframes.items()):
    if position >= 3:
        break
    model, dataset, split = key
    print(f"{model} - {dataset} - {split}: {df.shape}")

## 3. Training and Evaluation Loop

Process each model-dataset combination, training and evaluating machine learning models.

In [None]:
# Run the train -> evaluate pipeline once per (model, dataset) pair
for model_name, dataset_name in combinations:
    print(f"\nProcessing {model_name} - {dataset_name}")

    # Fetch the three splits for this pair, in train/validation/test order
    df_train, df_val, df_test = (
        dataframes[(model_name, dataset_name, split_name)]
        for split_name in ('train', 'validation', 'test')
    )

    # Convert the raw split DataFrames into the inputs the trainer expects
    train_data, val_data, test_data = setup_training_data(df_train, df_val, df_test)

    # Keyword arguments shared by the training and the evaluation call
    run_info = {
        'model_name': model_name,
        'dataset_name': dataset_name,
        'results_dir': RESULTS_DIR,
    }

    print("\nTraining models...")
    best_model, results = train_model(
        train_data,
        val_data,
        balance_data=USE_DATA_BALANCING,
        **run_info,
    )

    print("\nEvaluating on test set...")
    eval_results = evaluate_model(best_model, test_data, **run_info)

    print(f"\nResults saved in {RESULTS_DIR}/{model_name}/{dataset_name}/")

## 4. Results Analysis

Now we will visualize and analyze the results obtained for each model-dataset combination.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Function to extract metrics from LaTeX file
def extract_metrics_from_tex(tex_path):
    """Parse the four summary metrics out of a results ``.tex`` file.

    The file content is searched for the labels ``Accuracy:``,
    ``F1-score (weighted):``, ``Precision (weighted):`` and
    ``Recall (weighted):``, each followed by a space and an unsigned decimal
    number. Only the number itself is captured, so trailing punctuation such
    as a sentence-ending period (``Accuracy: 0.95.``) does not break parsing.

    Args:
        tex_path: Path of the file to read.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)`` of floats.

    Raises:
        FileNotFoundError: If ``tex_path`` does not exist.
        AttributeError: If one of the metrics cannot be found. The exception
            type is kept for compatibility with callers that already catch
            ``AttributeError`` for a failed regex match.
    """
    with open(tex_path, 'r') as f:
        content = f.read()

    # One unsigned decimal: "1", "0.95", "1." or ".95" -- never "0.95." or "1.2.3"
    number_pattern = r'(\d+(?:\.\d*)?|\.\d+)'
    metric_patterns = (
        ('Accuracy', r'Accuracy: '),
        ('F1-score (weighted)', r'F1-score \(weighted\): '),
        ('Precision (weighted)', r'Precision \(weighted\): '),
        ('Recall (weighted)', r'Recall \(weighted\): '),
    )

    values = []
    for label, prefix_pattern in metric_patterns:
        match = re.search(prefix_pattern + number_pattern, content)
        if match is None:
            raise AttributeError(f"Metric '{label}' not found in {tex_path}")
        values.append(float(match.group(1)))

    accuracy, f1, precision, recall = values
    return accuracy, f1, precision, recall

# Collect results: one record per model/dataset combination, read from
# <RESULTS_DIR>/<model>/<dataset>/test_results.tex
results_data = []
for model_name, dataset_name in combinations:
    # Load results from .tex file
    results_path = os.path.join(RESULTS_DIR, model_name, dataset_name, 'test_results.tex')
    try:
        accuracy, f1, precision, recall = extract_metrics_from_tex(results_path)
    except (FileNotFoundError, AttributeError, ValueError) as e:
        # Best effort: a missing file, a missing metric (AttributeError) or a
        # malformed number (ValueError) skips this combination instead of
        # aborting the whole cell.
        print(f"Warning: Could not process results for {model_name} - {dataset_name}: {str(e)}")
        continue

    results_data.append({
        'Model': model_name,
        'Dataset': dataset_name,
        'Accuracy': accuracy,
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall
    })

# Create DataFrame with results
df_results = pd.DataFrame(results_data)

# Configure visualization style.
# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
# later releases removed the old name, so pick whichever one is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

metrics = ['Accuracy', 'F1 Score', 'Precision', 'Recall']

if df_results.empty:
    # Without any record the frame has no 'Model'/'Dataset' columns and the
    # plotting calls below would raise.
    print("\nNo results could be loaded; nothing to plot.")
else:
    # Create figure for main metrics: one panel per metric on a 2x2 grid
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Metrics Comparison by Model and Dataset', fontsize=16)

    for ax, metric in zip(axes.flat, metrics):
        sns.barplot(data=df_results, x='Model', y=metric, hue='Dataset', ax=ax)
        ax.set_title(f'{metric} by Model and Dataset')
        ax.tick_params(axis='x', rotation=45)
        # Same fixed scale on every panel so the bars are comparable
        ax.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

    # Show results table
    print("\nResults Table:")
    print(df_results.round(4).to_string(index=False))

In [None]:
# Metric columns summarised in the statistics tables below
metric_columns = ['Accuracy', 'F1 Score', 'Precision', 'Recall']

if df_results.empty:
    # The results cell only warns when a combination cannot be read, so the
    # frame may be empty here; pivot/groupby would then fail on missing columns.
    print("No results available; skipping heatmap and summary statistics.")
else:
    # Generate heatmap of model-dataset performance (rows: models, columns: datasets)
    pivot_acc = df_results.pivot(index='Model', columns='Dataset', values='Accuracy')
    fig, ax = plt.subplots(figsize=(12, 8))
    # NOTE(review): seaborn documents `center` for divergent data, while
    # 'YlOrRd' is a sequential colormap -- confirm the colour scaling is intended.
    sns.heatmap(pivot_acc, annot=True, cmap='YlOrRd', fmt='.3f', center=0.5, ax=ax)
    ax.set_title('Accuracy Heatmap by Model and Dataset')
    plt.tight_layout()
    plt.show()

    # Calculate and display summary statistics (mean and std of each metric)
    print("\nStatistics by Model:")
    print("=" * 50)
    model_stats = df_results.groupby('Model')[metric_columns].agg(['mean', 'std'])
    print(model_stats.round(4).to_string())

    print("\nStatistics by Dataset:")
    print("=" * 50)
    dataset_stats = df_results.groupby('Dataset')[metric_columns].agg(['mean', 'std'])
    print(dataset_stats.round(4).to_string())