# View Historical MLflow Results

Query and visualize results from previous training runs stored in MLflow.

In [3]:
# View all historical MLflow experiments and runs
import mlflow
import pandas as pd

# Point MLflow at the local file-based tracking store (relative to the
# notebook's working directory).
mlflow.set_tracking_uri("file:./mlruns")

# List all experiments
print("="*100)
print("AVAILABLE EXPERIMENTS")
print("="*100)
experiments = mlflow.search_experiments()
for exp in experiments:
    print(f"Experiment: {exp.name} (ID: {exp.experiment_id})")

# Query specific experiment (change name if needed)
experiment_name = "nsf_models_flagged_words"  # or "nsf_models" for old runs
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is not None:
    print(f"\n{'='*100}")
    print(f"RUNS FOR EXPERIMENT: {experiment_name}")
    print(f"{'='*100}\n")

    # Get all runs for this experiment (returned as a pandas DataFrame)
    runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

    if not runs.empty:
        # Columns to show, in display order; only those actually logged are kept.
        display_cols = ['run_id', 'start_time', 'tags.mlflow.runName',
                        'metrics.valid_accuracy', 'metrics.valid_f1_score', 'metrics.valid_auc_roc',
                        'metrics.test_accuracy', 'metrics.test_f1_score', 'metrics.test_auc_roc']

        available_cols = [col for col in display_cols if col in runs.columns]
        # Identifying columns (run_id, start_time) are always present, so the
        # "no metrics" warning has to be keyed on the metric columns alone.
        metric_cols = [col for col in available_cols if col.startswith('metrics.')]

        if available_cols:
            results_df = runs[available_cols]
            # Newest runs first; skip the sort if the column is unexpectedly absent.
            if 'start_time' in results_df.columns:
                results_df = results_df.sort_values('start_time', ascending=False)

            # Strip the MLflow prefixes for readability
            results_df = results_df.rename(
                columns=lambda col: col.replace('metrics.', '').replace('tags.mlflow.', '')
            )

            print(results_df.to_string(index=False))
            print(f"\n{'='*100}")
            print(f"Total runs found: {len(runs)}")

        if metric_cols:
            # Report the best run per validation metric (only if values exist).
            best_metric_reports = [
                ("\n🏆 Best F1 Score", 'metrics.valid_f1_score'),
                ("🏆 Best Accuracy", 'metrics.valid_accuracy'),
            ]
            for label, metric_col in best_metric_reports:
                if metric_col not in runs.columns or not runs[metric_col].notna().any():
                    continue
                best_run = runs.loc[runs[metric_col].idxmax()]
                # Series.get only falls back when the column is missing, so
                # handle a run without a name tag (NaN) explicitly.
                run_name = best_run.get('tags.mlflow.runName', 'N/A')
                if pd.isna(run_name):
                    run_name = 'N/A'
                print(f"{label}: {best_run[metric_col]:.4f} ({run_name})")
        else:
            print("⚠️  No metric columns found in runs")
            print(f"\nAvailable columns: {list(runs.columns)[:10]}...")
    else:
        print("⚠️  No runs found for this experiment")
else:
    print(f"⚠️  Experiment '{experiment_name}' not found")
    print("\nAvailable experiments:")
    for exp in experiments:
        print(f"  - {exp.name}")
    print("\nChange the experiment_name variable to one of the above and re-run the cell")

AVAILABLE EXPERIMENTS
Experiment: nsf_models_flagged_words (ID: 352756237987121497)

RUNS FOR EXPERIMENT: nsf_models_flagged_words

                          run_id                       start_time        runName  valid_accuracy  valid_f1_score  valid_auc_roc
d6157cc3845e444f8289b105a5691b91 2025-12-15 09:45:30.661000+00:00  lr_tuned_best          0.8952        0.057554       0.702426
dc7437361b8d4a5eb1cc6f0d2d97b07a 2025-12-15 09:37:41.265000+00:00 gbt_tuned_best          0.8776        0.072727       0.654807
ddec7ab8dc594b28a58b792959ee8bc9 2025-12-15 09:35:15.691000+00:00  rf_tuned_best          0.9344        0.068182       0.673002

Total runs found: 3

🏆 Best F1 Score: 0.0727 (gbt_tuned_best)
🏆 Best Accuracy: 0.9344 (rf_tuned_best)
