In [None]:
# 5. Model Evaluation

This notebook evaluates and compares different fine-tuned NER models for Amharic e-commerce data extraction.


In [None]:
import os
import sys
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
sys.path.append("..")
from src.models.model_evaluator import AmharicNEREvaluator


In [None]:
## 5.1 Load Test Data

First, we need to load the test data that will be used to evaluate our models.


In [None]:
# Locations of the labeled test set, the fine-tuned models, and the evaluation
# outputs, all expressed relative to the notebook's working directory.
data_dir = Path("../data")
test_data_path = data_dir / "labeled" / "test_data.json"
models_dir = data_dir / "models"
output_dir = data_dir / "evaluations"

# Create output directory if it does not exist
output_dir.mkdir(parents=True, exist_ok=True)

# Fail fast when the test set is missing: printing a warning and carrying on
# would leave `test_data` undefined, so the cells that use it would die later
# with an unrelated-looking NameError instead of this actionable message.
if not test_data_path.exists():
    raise FileNotFoundError(
        f"Test data not found at {test_data_path}. "
        "You need to create test data first by running the data labeling notebook."
    )

# Load test data
with open(test_data_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)
print(f"Loaded {len(test_data)} test examples")


In [None]:
## 5.2 Initialize Evaluator

Now we will initialize the model evaluator that will compare our different models.


In [None]:
# Construct the evaluator, pointing it at the models and evaluation-output directories
evaluator = AmharicNEREvaluator(models_dir=models_dir, output_dir=output_dir)

# Print a header followed by one "- <name>" line per entry in evaluator.model_paths
print(
    "Available models:",
    *(f"- {candidate.name}" for candidate in evaluator.model_paths),
    sep="\n",
)


In [None]:
## 5.3 Evaluate Individual Models

Let us evaluate each model individually to see how it performs.


In [None]:
# Run every model the evaluator exposes against the test set and keep the
# returned metrics keyed by the model's directory name.
model_metrics = {}

for model_path in evaluator.model_paths:
    print(f"Evaluating model: {model_path.name}")
    metrics = evaluator.evaluate_model(model_path, test_data)
    model_metrics[model_path.name] = metrics

    # Nothing to summarise when the result is empty or has no "overall" section
    if not metrics or "overall" not in metrics:
        continue

    # Report the headline scores, then a separator line between models
    overall = metrics["overall"]
    for label, key in (("Precision", "precision"), ("Recall", "recall"), ("F1 Score", "f1")):
        print(f"{label}: {overall[key]:.4f}")
    print("-" * 40)


In [None]:
## 5.4 Compare Models

Now let us compare all models side by side.


In [None]:
# Keep only the models whose evaluation produced an "overall" section
overall_by_model = {
    name: result["overall"]
    for name, result in model_metrics.items()
    if result and "overall" in result
}

# Parallel lists of the overall scores, in the iteration order of model_metrics
models = list(overall_by_model)
precisions = [scores["precision"] for scores in overall_by_model.values()]
recalls = [scores["recall"] for scores in overall_by_model.values()]
f1_scores = [scores["f1"] for scores in overall_by_model.values()]

# Tabulate the scores with the highest F1 score first
comparison_df = pd.DataFrame(
    {
        "Model": models,
        "Precision": precisions,
        "Recall": recalls,
        "F1 Score": f1_scores,
    }
).sort_values("F1 Score", ascending=False)

# Display comparison
comparison_df


In [None]:
## 5.4 Compare Models

Now let us compare all models side by side.


In [None]:
# 5. Model Evaluation

This notebook evaluates and compares different fine-tuned NER models for Amharic e-commerce data extraction.


In [None]:
import os
import sys
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
sys.path.append("..")
from src.models.model_evaluator import AmharicNEREvaluator


In [None]:
## 5.1 Load Test Data

First, we need to load the test data that will be used to evaluate our models.


In [None]:
# Locations of the labeled test set, the fine-tuned models, and the evaluation
# outputs, all expressed relative to the notebook's working directory.
data_dir = Path("../data")
test_data_path = data_dir / "labeled" / "test_data.json"
models_dir = data_dir / "models"
output_dir = data_dir / "evaluations"

# Create output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Fail fast when the test set is missing: printing a warning and carrying on
# would leave `test_data` undefined, so any cell that uses it would die later
# with an unrelated-looking NameError instead of this actionable message.
if not test_data_path.exists():
    raise FileNotFoundError(
        f"Test data not found at {test_data_path}. "
        "You need to create test data first by running the data labeling notebook."
    )

# Load test data
with open(test_data_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)
print(f"Loaded {len(test_data)} test examples")


In [None]:
## 5.2 Initialize Evaluator

Now we'll initialize the model evaluator that will compare our different models.


In [None]:
# Construct the evaluator, pointing it at the models and evaluation-output directories
evaluator = AmharicNEREvaluator(models_dir=models_dir, output_dir=output_dir)

# Print a header followed by one "- <name>" line per entry in evaluator.model_paths
print(
    "Available models:",
    *(f"- {candidate.name}" for candidate in evaluator.model_paths),
    sep="\n",
)
