## Model Evaluation Pipeline: Performance Analysis & Visualization

This notebook handles comprehensive model evaluation and analysis:

1. **Setup & Configuration** - Load the project config, create output directories, and select the compute device
2. **Load Model & Datasets** - Load the trained model and the tokenized validation and test datasets
3. **Generate Predictions** - Generate predictions on validation and test sets
4. **Overall Performance Metrics** - Calculate accuracy, precision, recall, F1-score
5. **Confusion Matrix Analysis** - Visualize classification patterns and errors (static and interactive plots)
6. **Per-Class Performance Analysis** - Detailed metrics for each enzyme class
7. **Error Analysis** - Examine misclassified examples and prediction confidence
8. **Save Predictions & Summary** - Export per-sample predictions and a comprehensive summary report

After running this notebook, you should have:
- `results/predictions/` - Saved predictions for both validation and test sets, plus the misclassified test samples
- `results/figures/` - Confusion matrices for the validation and test sets
- `results/metrics/` - Detailed classification reports and per-class statistics
- `results/evaluation_summary.txt` - Plain-text summary report of the whole evaluation
- Comprehensive understanding of model strengths and weaknesses

### Import Libraries

In [None]:
import os
import yaml
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from peft import PeftModel
from datasets import load_from_disk
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    precision_recall_fscore_support
)

print("✓ All libraries imported successfully")

## 1. Setup & Configuration

In [None]:
# Read the shared project configuration and define the EC class label names
banner = "=" * 70
print(banner)
print("EVALUATION SETUP")
print(banner)

# The config lives one directory above the notebook
with open("../config.yaml", "r") as config_file:
    cfg = yaml.safe_load(config_file)

print("\nConfiguration loaded:")
print(f"  Project: {cfg['project']['name']}")
print(f"  Model: {cfg['model']['name']}")
print(f"  Number of classes: {cfg['model']['num_labels']}")

# Full EC class names; later cells look a name up by integer label (ec_names[label])
ec_names = [
    "Oxidoreductases", "Transferases", "Hydrolases", "Lyases",
    "Isomerases", "Ligases", "Translocases",
]

# Abbreviated names in the same order, used as tick labels in the plots
ec_names_short = [
    "Oxido",
    "Trans",
    "Hydro",
    "Lyase",
    "Isom",
    "Ligase",
    "Transl",
]

print("\n✓ Setup complete")

In [None]:
# Make sure every results sub-folder exists before any cell writes into it
print("\n=== CREATING OUTPUT DIRECTORIES ===")

output_dirs = ["../results/predictions", "../results/figures", "../results/metrics"]

for results_dir in output_dirs:
    # exist_ok=True makes this cell safe to re-run
    os.makedirs(results_dir, exist_ok=True)
    print(f"  ✓ {results_dir}")

print("\n✓ Output directories ready")

In [None]:
# Select the compute device once; the model-loading cell moves the model onto it
print("\n=== DEVICE CHECK ===")

has_gpu = torch.cuda.is_available()
device = "cuda" if has_gpu else "cpu"
print(f"Using device: {device}")

# Report the first GPU's name and total memory (bytes -> GB) when one is present
if has_gpu:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_gpu_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {total_gpu_bytes / 1e9:.2f} GB")

## 2. Load Model & Datasets

In [None]:
# Load the fine-tuned classifier: base model from the config plus the adapter
# weights stored under model_path, moved to `device` and put in eval mode.
print("\n=== LOADING TRAINED MODEL ===")

model_path = "../models/final_model"

# Fail fast: every later cell needs `model`, so merely printing a message and
# carrying on would only resurface as a NameError when the Trainer is built.
if not os.path.exists(model_path):
    raise FileNotFoundError(
        f"Model not found at {model_path}. "
        "Please run 02_training.ipynb first to train the model."
    )

print(f"Loading model from: {model_path}")

# Load the base model first
base_model = AutoModelForSequenceClassification.from_pretrained(
    cfg['model']['name'],
    num_labels=cfg['model']['num_labels'],
    # NOTE(review): trust_remote_code allows Python code shipped with the
    # configured checkpoint to run at load time - keep only for trusted models.
    trust_remote_code=True
)

# Load LoRA weights on top of the base model
model = PeftModel.from_pretrained(base_model, model_path)
model = model.to(device)
model.eval()  # inference mode for evaluation

print("✓ Model loaded successfully")
print(f"  Model on device: {next(model.parameters()).device}")

In [None]:
# Load the pre-tokenized validation and test splits as torch-formatted datasets
print("\n=== LOADING DATASETS ===")

# On-disk locations of the tokenized splits
val_path = "../data/tokenized/val_dataset"
test_path = "../data/tokenized/test_dataset"

print(f"Loading validation dataset from: {val_path}")
val_ds = load_from_disk(val_path)
val_ds = val_ds.with_format("torch")

print(f"Loading test dataset from: {test_path}")
test_ds = load_from_disk(test_path)
test_ds = test_ds.with_format("torch")

print("\n=== DATASET SUMMARY ===")
n_val_samples = len(val_ds)
n_test_samples = len(test_ds)
print(f"Validation samples: {n_val_samples:,}")
print(f"Test samples: {n_test_samples:,}")

print("\n✓ Datasets loaded successfully")

## 3. Generate Predictions

In [None]:
# Build a Trainer that the following cells use only for trainer.predict(...)
print("\n=== SETTING UP TRAINER FOR EVALUATION ===")

# Only the settings needed for prediction are given; the rest keep their defaults
eval_settings = {
    "output_dir": "../models/eval_temp",
    "per_device_eval_batch_size": 16,
    "dataloader_num_workers": 4,
}
eval_args = TrainingArguments(**eval_settings)

trainer = Trainer(model=model, args=eval_args)

print("✓ Trainer configured for evaluation")

In [None]:
# Run the model over the validation split and derive labels and confidences
print("\n=== GENERATING VALIDATION PREDICTIONS ===")

val_predictions = trainer.predict(val_ds)
val_logits = val_predictions.predictions
val_true_labels = val_predictions.label_ids

# Predicted class = index of the largest logit along the last axis
val_pred_labels = val_logits.argmax(axis=-1)

# Softmax over the last axis gives per-class probabilities;
# the largest probability of each row is reported as the confidence
val_probs = torch.tensor(val_logits).softmax(dim=-1).numpy()
val_confidence = val_probs.max(axis=-1)

n_val_predictions = len(val_pred_labels)
print(f"✓ Generated predictions for {n_val_predictions:,} validation samples")
print(f"  Prediction shape: {val_logits.shape}")

In [None]:
# Run the model over the test split and derive labels and confidences
print("\n=== GENERATING TEST PREDICTIONS ===")

test_predictions = trainer.predict(test_ds)
test_logits = test_predictions.predictions
test_true_labels = test_predictions.label_ids

# Predicted class = index of the largest logit along the last axis
test_pred_labels = test_logits.argmax(axis=-1)

# Softmax over the last axis gives per-class probabilities;
# the largest probability of each row is reported as the confidence
test_probs = torch.tensor(test_logits).softmax(dim=-1).numpy()
test_confidence = test_probs.max(axis=-1)

n_test_predictions = len(test_pred_labels)
print(f"✓ Generated predictions for {n_test_predictions:,} test samples")
print(f"  Prediction shape: {test_logits.shape}")

## 4. Overall Performance Metrics

In [None]:
# Headline metrics for both splits: accuracy plus support-weighted
# precision / recall / F1, printed and written to a CSV
print("\n=== OVERALL PERFORMANCE METRICS ===")

# Validation metrics (the fourth return value, support, is not used here)
val_accuracy = accuracy_score(val_true_labels, val_pred_labels)
val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
    val_true_labels, val_pred_labels, average='weighted'
)

# Test metrics
test_accuracy = accuracy_score(test_true_labels, test_pred_labels)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    test_true_labels, test_pred_labels, average='weighted'
)

# One record per split, in the column order of the saved CSV
split_scores = [
    ('Validation', val_accuracy, val_precision, val_recall, val_f1),
    ('Test', test_accuracy, test_precision, test_recall, test_f1),
]

for split_name, split_acc, split_prec, split_rec, split_f1 in split_scores:
    print(f"\n{split_name} Set:")
    print(f"  Accuracy:  {split_acc:.4f}")
    print(f"  Precision: {split_prec:.4f}")
    print(f"  Recall:    {split_rec:.4f}")
    print(f"  F1-Score:  {split_f1:.4f}")

# Save metrics
metrics_df = pd.DataFrame(
    split_scores,
    columns=['Split', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

metrics_df.to_csv('../results/metrics/overall_metrics.csv', index=False)
print("\n✓ Overall metrics saved to results/metrics/overall_metrics.csv")

In [None]:
# Grouped bar chart comparing the four headline metrics across the two splits
print("\n=== VISUALIZING OVERALL METRICS ===")

fig = go.Figure()

metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
val_values = [val_accuracy, val_precision, val_recall, val_f1]
test_values = [test_accuracy, test_precision, test_recall, test_f1]

# One bar trace per split; each bar is annotated with its value to 3 decimals
for split_name, split_values in (('Validation', val_values), ('Test', test_values)):
    bar_labels = [f'{score:.3f}' for score in split_values]
    fig.add_trace(go.Bar(
        name=split_name,
        x=metrics_to_plot,
        y=split_values,
        text=bar_labels,
        textposition='auto',
    ))

# Fix the y axis to [0, 1] so the two splits are compared on the full score scale
layout_options = dict(
    title='Overall Model Performance: Validation vs Test',
    xaxis_title='Metric',
    yaxis_title='Score',
    yaxis=dict(range=[0, 1]),
    barmode='group',
    width=800,
    height=500,
)
fig.update_layout(**layout_options)

fig.show()
print("✓ Overall metrics visualization complete")

## 5. Confusion Matrix Analysis

In [None]:
# Compute raw and row-normalized confusion matrices for both evaluation splits
print("\n=== COMPUTING CONFUSION MATRICES ===")

# Pin the label set so both matrices are always len(ec_names) x len(ec_names).
# Without `labels`, the classes are inferred from the data, so a class that is
# absent from a split would shrink the matrix and misalign it with the class
# names used as tick labels by the plotting cells.
cm_labels = np.arange(len(ec_names))

val_cm = confusion_matrix(val_true_labels, val_pred_labels, labels=cm_labels)
test_cm = confusion_matrix(test_true_labels, test_pred_labels, labels=cm_labels)


def _row_normalize(cm):
    """Return `cm` with every row divided by its row sum.

    Rows are true classes, so each entry becomes the fraction of that true
    class assigned to the column's predicted class. A row whose sum is zero
    (a class with no true samples) is returned as all zeros rather than the
    NaNs a 0/0 division would produce.
    """
    row_totals = cm.sum(axis=1, keepdims=True)
    return np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )


# Normalize confusion matrices
val_cm_norm = _row_normalize(val_cm)
test_cm_norm = _row_normalize(test_cm)

print("✓ Confusion matrices computed")
print(f"  Validation CM shape: {val_cm.shape}")
print(f"  Test CM shape: {test_cm.shape}")

In [None]:
# Static test-set confusion matrix (main result): counts on the left,
# row-normalized percentages on the right, saved as a PNG
print("\n=== TEST SET CONFUSION MATRIX ===")

plt.figure(figsize=(12, 10))

# (matrix, annotation format, colorbar label, panel title) for each panel
test_cm_panels = [
    (test_cm, 'd', 'Count', 'Confusion Matrix - Absolute Counts'),
    (test_cm_norm, '.2%', 'Percentage', 'Confusion Matrix - Normalized'),
]

for panel_no, (panel_matrix, cell_fmt, bar_label, panel_title) in enumerate(test_cm_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.heatmap(
        panel_matrix,
        annot=True,
        fmt=cell_fmt,
        cmap='Blues',
        xticklabels=ec_names_short,
        yticklabels=ec_names_short,
        cbar_kws={'label': bar_label},
    )
    plt.title(panel_title, fontsize=14, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

plt.tight_layout()
# Save before show() so the file is written from the still-open figure
plt.savefig('../results/figures/confusion_matrix_test.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Test confusion matrix saved to results/figures/confusion_matrix_test.png")

In [None]:
# Static validation-set confusion matrix: counts on the left,
# row-normalized percentages on the right, saved as a PNG
print("\n=== VALIDATION SET CONFUSION MATRIX ===")

plt.figure(figsize=(12, 10))

# (matrix, annotation format, colorbar label, panel title) for each panel
val_cm_panels = [
    (val_cm, 'd', 'Count', 'Confusion Matrix - Absolute Counts'),
    (val_cm_norm, '.2%', 'Percentage', 'Confusion Matrix - Normalized'),
]

for panel_no, (panel_matrix, cell_fmt, bar_label, panel_title) in enumerate(val_cm_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.heatmap(
        panel_matrix,
        annot=True,
        fmt=cell_fmt,
        cmap='Greens',
        xticklabels=ec_names_short,
        yticklabels=ec_names_short,
        cbar_kws={'label': bar_label},
    )
    plt.title(panel_title, fontsize=14, fontweight='bold')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

plt.tight_layout()
# Save before show() so the file is written from the still-open figure
plt.savefig('../results/figures/confusion_matrix_val.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Validation confusion matrix saved to results/figures/confusion_matrix_val.png")

In [None]:
# Interactive confusion matrix for the test set: cell color is the
# row-normalized rate, cell text shows the raw count and the percentage
print("\n=== INTERACTIVE CONFUSION MATRIX (TEST SET) ===")

fig = go.Figure(data=go.Heatmap(
    z=test_cm_norm,
    x=ec_names,
    y=ec_names,
    colorscale='Blues',
    text=test_cm,
    texttemplate='%{text}<br>(%{z:.1%})',
    textfont={"size": 10},
    colorbar=dict(title="Accuracy")
))

fig.update_layout(
    title='Interactive Confusion Matrix - Test Set',
    xaxis_title='Predicted Label',
    yaxis_title='True Label',
    # Plotly draws the first row at the bottom by default; reverse the y axis
    # so the first class is on top and the diagonal runs top-left to
    # bottom-right, matching the static seaborn confusion matrices.
    yaxis_autorange='reversed',
    width=800,
    height=700
)

fig.show()
print("✓ Interactive confusion matrix displayed")

## 6. Per-Class Performance Analysis

In [None]:
# Text classification report (per-class precision / recall / F1 / support)
# for the test set, printed and saved to disk
print("\n=== CLASSIFICATION REPORT (TEST SET) ===")

test_report = classification_report(
    test_true_labels,
    test_pred_labels,
    # Pin the label ids to the positions of ec_names. Without `labels`, the
    # classes are inferred from the data, and a class missing from the test
    # predictions/labels makes the count disagree with target_names (ValueError).
    labels=list(range(len(ec_names))),
    target_names=ec_names,
    digits=4
)

print(test_report)

# Save report
with open('../results/metrics/classification_report_test.txt', 'w') as f:
    f.write(test_report)

print("\n✓ Classification report saved to results/metrics/classification_report_test.txt")

In [None]:
# Per-class precision / recall / F1 / support for the test set, collected
# into one table together with the per-class accuracy, printed and saved
print("\n=== PER-CLASS METRICS (TEST SET) ===")

# One id per entry of ec_names (instead of a hard-coded 7). Passing the ids as
# `labels` guarantees the metric arrays have exactly len(ec_names) entries in
# ec_names order, even when a class never occurs in the test data.
class_ids = np.arange(len(ec_names))

precision, recall, f1, support = precision_recall_fscore_support(
    test_true_labels, test_pred_labels, labels=class_ids
)

# Create DataFrame
perclass_df = pd.DataFrame({
    'Class': class_ids,
    'Class_Name': ec_names,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Support': support
})

# Calculate accuracy per class from confusion matrix: the diagonal of the
# row-normalized matrix is the fraction of each true class predicted correctly
# (numerically the same quantity as per-class recall)
class_accuracy = test_cm_norm.diagonal()
perclass_df['Accuracy'] = class_accuracy

print("\n" + perclass_df.to_string(index=False))

# Save to CSV
perclass_df.to_csv('../results/metrics/perclass_metrics_test.csv', index=False)
print("\n✓ Per-class metrics saved to results/metrics/perclass_metrics_test.csv")

In [None]:
# Grouped bar chart: precision, recall and F1 side by side for every class
print("\n=== VISUALIZING PER-CLASS PERFORMANCE ===")

fig = go.Figure()

# One bar trace per metric column of perclass_df; the trace name equals the column name
for metric_name in ('Precision', 'Recall', 'F1-Score'):
    metric_values = perclass_df[metric_name]
    fig.add_trace(go.Bar(
        name=metric_name,
        x=ec_names_short,
        y=metric_values,
        text=[f'{score:.3f}' for score in metric_values],
        textposition='auto',
    ))

# Fix the y axis to [0, 1] so all classes are compared on the full score scale
layout_options = dict(
    title='Per-Class Performance Metrics (Test Set)',
    xaxis_title='Enzyme Class',
    yaxis_title='Score',
    yaxis=dict(range=[0, 1]),
    barmode='group',
    width=1000,
    height=500,
)
fig.update_layout(**layout_options)

fig.show()
print("✓ Per-class performance visualization complete")

In [None]:
# Rank the classes by F1 and list the three strongest and three weakest
print("\n=== BEST AND WORST PERFORMING CLASSES ===")

# Best class first; the summary-report cell reuses this ordering
sorted_by_f1 = perclass_df.sort_values('F1-Score', ascending=False)

ranking_views = (
    ("\nTop 3 Classes (by F1-Score):", sorted_by_f1.head(3)),
    ("\nBottom 3 Classes (by F1-Score):", sorted_by_f1.tail(3)),
)

for heading, ranked_rows in ranking_views:
    print(heading)
    for row_idx, row in ranked_rows.iterrows():
        print(f"  {row['Class']}. {row['Class_Name']}: F1={row['F1-Score']:.4f}, Acc={row['Accuracy']:.4f}")

print("\n✓ Performance analysis complete")

## 7. Error Analysis

In [None]:
# Collect every test sample whose predicted label differs from the true label
print("\n=== MISCLASSIFICATION ANALYSIS (TEST SET) ===")

# Boolean mask over the test set: True where the prediction is wrong
misclassified_mask = test_pred_labels != test_true_labels
num_misclassified = misclassified_mask.sum()
total_samples = len(test_pred_labels)

error_rate_pct = num_misclassified/total_samples*100
print(f"\nTotal misclassifications: {num_misclassified:,} / {total_samples:,} ({error_rate_pct:.2f}%)")

# Pull out the wrong samples once, then build the detail table from them
wrong_true = test_true_labels[misclassified_mask]
wrong_pred = test_pred_labels[misclassified_mask]

misclassified_df = pd.DataFrame({
    'Index': np.flatnonzero(misclassified_mask),  # position within the test set
    'True_Label': wrong_true,
    'Pred_Label': wrong_pred,
    'True_Class': [ec_names[class_id] for class_id in wrong_true],
    'Pred_Class': [ec_names[class_id] for class_id in wrong_pred],
    'Confidence': test_confidence[misclassified_mask]
})

print(f"\nMisclassified samples DataFrame shape: {misclassified_df.shape}")
print("\nFirst 10 misclassifications:")
print(misclassified_df.head(10).to_string(index=False))

In [None]:
# Count how often each (true class -> predicted class) confusion occurs
print("\n=== MOST COMMON ERROR PATTERNS ===")

# One row per confusion pair, most frequent first
error_pairs = (
    misclassified_df
    .groupby(['True_Class', 'Pred_Class'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

print("\nTop 10 confusion pairs:")
print(f"\n{'True Class':<20} {'Predicted As':<20} {'Count':>8} {'% of Errors':>12}")
print("-" * 65)

top_pairs = error_pairs.head(10)
for pair_idx, pair in top_pairs.iterrows():
    # Share of all misclassifications that this pair accounts for
    pct = (pair['Count'] / num_misclassified) * 100
    print(f"{pair['True_Class']:<20} {pair['Pred_Class']:<20} {pair['Count']:>8} {pct:>11.2f}%")

# Save error analysis
error_pairs.to_csv('../results/metrics/error_patterns.csv', index=False)
misclassified_df.to_csv('../results/predictions/misclassified_samples.csv', index=False)

print("\n✓ Error patterns saved to results/metrics/error_patterns.csv")
print("✓ Misclassified samples saved to results/predictions/misclassified_samples.csv")

In [None]:
# Compare the model's confidence on correct versus incorrect test predictions
print("\n=== PREDICTION CONFIDENCE ANALYSIS ===")

# Split the confidence scores by whether the prediction was right
correct_mask = ~misclassified_mask
correct_confidence = test_confidence[correct_mask]
incorrect_confidence = test_confidence[misclassified_mask]

confidence_groups = (
    ('Correct', correct_confidence),
    ('Incorrect', incorrect_confidence),
)

# Summary statistics for each group
for group_name, group_confidence in confidence_groups:
    print(f"\n{group_name} predictions:")
    print(f"  Mean confidence: {group_confidence.mean():.4f}")
    print(f"  Median confidence: {np.median(group_confidence):.4f}")
    print(f"  Std confidence: {group_confidence.std():.4f}")

# Plot confidence distributions as overlaid, semi-transparent histograms
fig = go.Figure()

for group_name, group_confidence in confidence_groups:
    fig.add_trace(go.Histogram(
        x=group_confidence,
        name=group_name,
        opacity=0.7,
        nbinsx=50
    ))

layout_options = dict(
    title='Prediction Confidence Distribution: Correct vs Incorrect',
    xaxis_title='Confidence Score',
    yaxis_title='Count',
    barmode='overlay',
    width=900,
    height=500,
)
fig.update_layout(**layout_options)

fig.show()
print("\n✓ Confidence analysis complete")

In [None]:
# Look at both ends of the confidence range among the misclassified samples
print("\n=== LOW CONFIDENCE MISCLASSIFICATIONS ===")

# Columns shown in the two listings below
error_view_cols = ['True_Class', 'Pred_Class', 'Confidence']

# The 20 errors the model was least sure about
by_confidence_asc = misclassified_df.sort_values('Confidence')
low_conf_errors = by_confidence_asc.head(20)

print("\nTop 20 lowest confidence errors:")
print(low_conf_errors[error_view_cols].to_string(index=False))

# High confidence errors (model was wrong but very confident)
by_confidence_desc = misclassified_df.sort_values('Confidence', ascending=False)
high_conf_errors = by_confidence_desc.head(20)

print("\n=== HIGH CONFIDENCE MISCLASSIFICATIONS (Most Concerning) ===")
print("\nTop 20 highest confidence errors:")
print(high_conf_errors[error_view_cols].to_string(index=False))

# Count the errors made above each confidence threshold
error_confidence = misclassified_df['Confidence']
print(f"\nNumber of high-confidence errors (>0.9): {(error_confidence > 0.9).sum()}")
print(f"Number of high-confidence errors (>0.8): {(error_confidence > 0.8).sum()}")

## 8. Save Predictions & Summary

In [None]:
# Write one CSV of per-sample predictions for each evaluation split
print("\n=== SAVING PREDICTIONS ===")


def _prediction_frame(true_labels, pred_labels, confidence, probs):
    """Build the per-sample prediction table for one split.

    Columns: integer true/predicted labels, their class names from ec_names,
    the top-class confidence, a correctness flag, and one `Prob_<class name>`
    column per class taken from the matching column of `probs`.
    """
    frame = pd.DataFrame({
        'True_Label': true_labels,
        'Pred_Label': pred_labels,
        'True_Class': [ec_names[class_id] for class_id in true_labels],
        'Pred_Class': [ec_names[class_id] for class_id in pred_labels],
        'Confidence': confidence,
        'Correct': true_labels == pred_labels
    })

    # Add probability for each class
    for class_idx, class_name in enumerate(ec_names):
        frame[f'Prob_{class_name}'] = probs[:, class_idx]

    return frame


# Validation predictions
val_pred_df = _prediction_frame(val_true_labels, val_pred_labels, val_confidence, val_probs)
val_pred_df.to_csv('../results/predictions/validation_predictions.csv', index=False)
print(f"✓ Validation predictions saved ({len(val_pred_df):,} samples)")

# Test predictions
test_pred_df = _prediction_frame(test_true_labels, test_pred_labels, test_confidence, test_probs)
test_pred_df.to_csv('../results/predictions/test_predictions.csv', index=False)
print(f"✓ Test predictions saved ({len(test_pred_df):,} samples)")

In [None]:
# Generate comprehensive summary report: assemble one plain-text document from
# the metrics, rankings and error statistics computed in the cells above,
# write it to results/evaluation_summary.txt and print it
print("\n=== GENERATING SUMMARY REPORT ===")

# Header, overall metrics for both splits, and the full per-class table
summary_report = f"""
{'='*80}
ENZYME CLASSIFICATION MODEL - EVALUATION SUMMARY
{'='*80}

Model: {cfg['model']['name']}
Training: LoRA Fine-tuning
Number of Classes: {cfg['model']['num_labels']}

{'='*80}
OVERALL PERFORMANCE
{'='*80}

Validation Set ({len(val_ds):,} samples):
  Accuracy:  {val_accuracy:.4f}
  Precision: {val_precision:.4f}
  Recall:    {val_recall:.4f}
  F1-Score:  {val_f1:.4f}

Test Set ({len(test_ds):,} samples):
  Accuracy:  {test_accuracy:.4f}
  Precision: {test_precision:.4f}
  Recall:    {test_recall:.4f}
  F1-Score:  {test_f1:.4f}

{'='*80}
PER-CLASS PERFORMANCE (Test Set)
{'='*80}

{perclass_df.to_string(index=False)}

{'='*80}
BEST PERFORMING CLASSES (by F1-Score)
{'='*80}
"""

# sorted_by_f1 is ordered best-first, so head(3) holds the top classes
for idx, row in sorted_by_f1.head(3).iterrows():
    summary_report += f"\n{row['Class']}. {row['Class_Name']}: F1={row['F1-Score']:.4f}, Acc={row['Accuracy']:.4f}"

summary_report += f"""

{'='*80}
WORST PERFORMING CLASSES (by F1-Score)
{'='*80}
"""

# ...and tail(3) holds the weakest classes
for idx, row in sorted_by_f1.tail(3).iterrows():
    summary_report += f"\n{row['Class']}. {row['Class_Name']}: F1={row['F1-Score']:.4f}, Acc={row['Accuracy']:.4f}"

summary_report += f"""

{'='*80}
ERROR ANALYSIS
{'='*80}

Total Misclassifications: {num_misclassified:,} / {total_samples:,} ({num_misclassified/total_samples*100:.2f}%)

Confidence Statistics:
  Correct Predictions:   Mean={correct_confidence.mean():.4f}, Std={correct_confidence.std():.4f}
  Incorrect Predictions: Mean={incorrect_confidence.mean():.4f}, Std={incorrect_confidence.std():.4f}

High Confidence Errors (>0.9): {(misclassified_df['Confidence'] > 0.9).sum()}
High Confidence Errors (>0.8): {(misclassified_df['Confidence'] > 0.8).sum()}

Most Common Error Patterns:
"""

# error_pairs is sorted by count, so these are the five most frequent confusions
for idx, row in error_pairs.head(5).iterrows():
    pct = (row['Count'] / num_misclassified) * 100
    summary_report += f"\n  {row['True_Class']} → {row['Pred_Class']}: {row['Count']} ({pct:.2f}%)"

summary_report += f"""

{'='*80}
OUTPUT FILES
{'='*80}

Predictions:
  - results/predictions/validation_predictions.csv
  - results/predictions/test_predictions.csv
  - results/predictions/misclassified_samples.csv

Metrics:
  - results/metrics/overall_metrics.csv
  - results/metrics/perclass_metrics_test.csv
  - results/metrics/classification_report_test.txt
  - results/metrics/error_patterns.csv

Figures:
  - results/figures/confusion_matrix_test.png
  - results/figures/confusion_matrix_val.png

{'='*80}
END OF REPORT
{'='*80}
"""

# Save summary report. The text contains the non-ASCII arrow "→", so the
# encoding is set explicitly; relying on the platform default (e.g. cp1252 on
# Windows) can raise UnicodeEncodeError when the error patterns are written.
with open('../results/evaluation_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print(summary_report)
print("\n✓ Summary report saved to results/evaluation_summary.txt")