In [5]:
# Task 5: Model Interpretability with SHAP and LIME for Amharic NER

import shap
import numpy as np
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from lime.lime_text import LimeTextExplainer

# Load the fine-tuned model (you can change this to your best model)
# NOTE: "Davlan/afro-xlmr-base" is a base checkpoint. Loading it for token
# classification creates a randomly initialised classifier head (transformers
# warns about this), so point `model_name` at your fine-tuned NER checkpoint
# before drawing conclusions from the explanations below.
model_name = "Davlan/afro-xlmr-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# `aggregation_strategy="simple"` is the supported replacement for the
# deprecated `grouped_entities=True`; each result still carries
# `entity_group` and `score`.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Sample text for analysis
#example_text = "አዳም 1000 ብር ተሉር ሃሉ ቀከር ገንር"
example_text = "አዲስ የቤት መሳሪያ ማዕከላዊ ገበሬ በአዲስ አበባ ቦሌ ላይ እየተሸጠ ነው። የእንቁላል ማቀዝቀዣ በ 1200 ብር ነው። ነፃ ድሊቨሪ አለ።"


# ---------------- LIME ---------------- #
print("\nLIME-based Interpretation:")

# Define a prediction function compatible with LIME
class NERPredictWrapper:
    """Adapt a grouped-entity NER pipeline to LIME's classifier interface.

    LIME expects a function mapping a list of texts to an array of class
    probabilities. This wrapper approximates that by counting, per text, how
    many recognised entity groups fall into each label and normalising the
    counts into a distribution.
    """

    # Column order of the returned probability matrix.
    LABELS = ("PRODUCT", "LOCATION", "PRICE", "O")

    def __init__(self, pipeline_func):
        """Store the callable that maps one text to a list of entity dicts.

        Each dict returned by ``pipeline_func`` must contain an
        ``'entity_group'`` key.
        """
        self.pipeline = pipeline_func

    def predict_proba(self, texts):
        """Return an ``(len(texts), 4)`` array of pseudo-probabilities.

        Columns follow ``LABELS``. Entity groups that are not in ``LABELS``
        are ignored. When a text yields no recognised entity, the whole mass
        is assigned to ``"O"`` so that every row is a valid distribution
        (previously such rows were all zeros).
        """
        labels = list(self.LABELS)
        rows = []
        for text in texts:
            counts = dict.fromkeys(labels, 0)
            for ent in self.pipeline(text):
                entity = ent['entity_group'].upper()
                if entity in counts:
                    counts[entity] += 1
            total = sum(counts.values())
            if total == 0:
                # Nothing recognised: treat the text as entirely "outside".
                counts["O"] = 1
                total = 1
            rows.append([counts[label] / total for label in labels])
        # reshape keeps a 2-D result even for an empty batch.
        return np.array(rows, dtype=float).reshape(-1, len(labels))

# Class names in the same order as the columns produced by predict_proba.
lime_class_names = ["PRODUCT", "LOCATION", "PRICE", "O"]

# A fixed random_state makes LIME's perturbation sampling reproducible.
lime_explainer = LimeTextExplainer(class_names=lime_class_names, random_state=42)
wrapped_model = NERPredictWrapper(ner_pipeline)

# explain_instance defaults to labels=(1,), which would explain only the
# second class (LOCATION); request an explanation for every class instead.
exp = lime_explainer.explain_instance(
    example_text,
    wrapped_model.predict_proba,
    labels=tuple(range(len(lime_class_names))),
    num_features=10,
)

# Save LIME output to HTML file instead of using IPython.display
lime_html_path = "lime_interpretation.html"
exp.save_to_file(lime_html_path)
print(f"\n✅ LIME interpretation saved to: {lime_html_path} (Open this file in your browser)")

# ---------------- SHAP (Approximate Classification Wrapper) ---------------- #
print("\nSHAP-based Interpretation (approximate):")

def shap_predict(texts, ner=None):
    """Score texts as pseudo-class probabilities for SHAP.

    For each text, the confidence scores of the recognised entity groups are
    summed per label and normalised into a distribution over
    ``["PRODUCT", "LOCATION", "PRICE", "O"]`` (in that column order).

    Args:
        texts: Iterable of input strings.
        ner: Optional callable mapping one text to a list of dicts with
            ``"entity_group"`` and ``"score"`` keys. Defaults to the
            module-level ``ner_pipeline``.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), 4)``. Texts with no recognised
        entity get all mass on ``"O"`` so that every row sums to 1
        (previously such rows were all zeros).
    """
    if ner is None:
        ner = ner_pipeline
    labels = ["PRODUCT", "LOCATION", "PRICE", "O"]
    rows = []
    for text in texts:
        scores = dict.fromkeys(labels, 0.0)
        for pred in ner(text):
            ent = pred["entity_group"].upper()
            if ent in scores:
                scores[ent] += float(pred["score"])
        total = sum(scores.values())
        if total <= 0:
            # Nothing recognised: treat the text as entirely "outside".
            scores["O"] = 1.0
            total = 1.0
        rows.append([scores[label] / total for label in labels])
    # reshape keeps a 2-D result even for an empty batch.
    return np.array(rows, dtype=float).reshape(-1, len(labels))

# The tokenizer acts as the text masker. output_names labels each model
# output with its entity class (same order as the columns returned by
# shap_predict) so the text plot is readable without an index lookup.
shap_explainer = shap.Explainer(
    shap_predict,
    tokenizer,
    output_names=["PRODUCT", "LOCATION", "PRICE", "O"],
)
shap_values = shap_explainer([example_text])
shap.plots.text(shap_values[0])

# ---------------- Analysis of Failure Cases ---------------- #
# You can log ambiguous or misclassified examples from the evaluation stage (Task 4) here
# and re-run LIME or SHAP interpretation on them.

# Example: text with ambiguous entity types or overlapping entities
# difficult_case = "ኣዳስ ግደረባ መትው 200 ብር ሃሉ"
# lime_explainer.explain_instance(difficult_case, wrapped_model.predict_proba).show_in_notebook()

# Optional: Export interpretability results as report
# exp.save_to_file("lime_interpretation.html")

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu



LIME-based Interpretation:

✅ LIME interpretation saved to: lime_interpretation.html (Open this file in your browser)

SHAP-based Interpretation (approximate):


PartitionExplainer explainer: 2it [00:45, 45.46s/it]               


In [None]:
# ------------------- Load Preprocessed Vendor Data -------------------
# Assuming CSVs are stored per vendor (e.g., 'vendorname_posts.csv')
import pandas as pd
import os
import glob

def load_all_vendor_data(path_pattern="./vendors/*.csv"):
    """Load every vendor CSV matching ``path_pattern`` into a dict of DataFrames.

    The vendor name is the file name without its extension and without a
    trailing ``_posts`` marker (``shop_posts.csv`` -> ``shop``,
    ``shop.csv`` -> ``shop``). If two files map to the same vendor name, the
    one that sorts later replaces the earlier one.

    Args:
        path_pattern: Glob pattern selecting the CSV files to read. Each file
            must contain a ``timestamp`` column, which is parsed as dates.

    Returns:
        Dict mapping vendor name to its DataFrame, in sorted file-path order.
    """
    suffix = "_posts"
    vendor_data = {}
    # Sort so the result order does not depend on the filesystem.
    for file_path in sorted(glob.glob(path_pattern)):
        stem, _ext = os.path.splitext(os.path.basename(file_path))
        # Strip the marker only at the end of the name, never in the middle.
        if stem.endswith(suffix) and len(stem) > len(suffix):
            vendor_name = stem[: -len(suffix)]
        else:
            vendor_name = stem
        vendor_data[vendor_name] = pd.read_csv(file_path, parse_dates=["timestamp"])
    return vendor_data

# ------------------- Metric Calculations -------------------

def calculate_vendor_metrics(df, views_weight=0.5, posts_weight=0.5):
    """Compute activity, engagement and pricing metrics for one vendor.

    Args:
        df: Vendor posts with ``timestamp`` (datetime), ``views`` and
            ``price`` columns; ``message`` and ``product`` are optional.
            The frame is not modified.
        views_weight: Weight of average views per post in the lending score.
        posts_weight: Weight of posts per week in the lending score.

    Returns:
        Dict with the keys ``Posts/Week``, ``Avg Views/Post``, ``Top Post``,
        ``Top Views``, ``Top Product``, ``Top Price``, ``Avg Price (ETB)`` and
        ``Lending Score``. An empty frame yields zero/blank values; a frame
        whose views are all missing yields blank top-post fields.
    """
    if df.empty:
        return {
            "Posts/Week": 0,
            "Avg Views/Post": 0,
            "Top Post": None,
            "Top Views": 0,
            "Top Product": "",
            "Top Price": 0,
            "Avg Price (ETB)": 0,
            "Lending Score": 0,
        }

    # Derive the weekly periods locally instead of adding a column to the
    # caller's DataFrame.
    weeks_active = df["timestamp"].dt.to_period("W").nunique()
    posts_per_week = len(df) / (weeks_active or 1)

    views = df["views"]
    avg_views = views.mean()
    if pd.isna(avg_views):
        # Every view count is missing; treat engagement as zero.
        avg_views = 0

    top_message, top_views, top_product, top_price = None, 0, "", 0
    if views.notna().any():
        # Positional lookup stays correct even if the index has duplicates.
        top_position = views.reset_index(drop=True).idxmax()
        top_post = df.iloc[top_position]
        message = top_post.get("message")
        if isinstance(message, str):
            # Only mark the text as truncated when it actually was.
            top_message = message[:100] + "..." if len(message) > 100 else message
        else:
            top_message = ""
        top_views = int(top_post["views"])
        top_product = top_post.get("product", "N/A")
        top_price = top_post.get("price", "N/A")

    prices = df["price"].dropna()
    avg_price = prices.mean() if not prices.empty else 0

    # Lending Score formula (tunable through the weight parameters)
    score = (avg_views * views_weight) + (posts_per_week * posts_weight)

    return {
        "Posts/Week": round(posts_per_week, 2),
        "Avg Views/Post": int(avg_views),
        "Top Post": top_message,
        "Top Views": top_views,
        "Top Product": top_product,
        "Top Price": top_price,
        "Avg Price (ETB)": round(avg_price, 2),
        "Lending Score": round(score, 2)
    }

# ------------------- Main Aggregation & Reporting -------------------

vendor_data = load_all_vendor_data("./vendors/*.csv")
report_rows = []

for vendor, df in vendor_data.items():
    metrics = calculate_vendor_metrics(df)
    metrics["Vendor"] = vendor
    report_rows.append(metrics)

# Final Scorecard DataFrame
# Passing `columns=` selects and orders the report columns and, unlike
# indexing afterwards, still works when no vendor files were found
# (an empty frame with the expected header instead of a KeyError).
scorecard_columns = [
    "Vendor", "Avg Views/Post", "Posts/Week", "Avg Price (ETB)", "Lending Score"
]
scorecard_df = pd.DataFrame(report_rows, columns=scorecard_columns)

# Save or display
scorecard_df.to_csv("vendor_scorecard.csv", index=False)
print("\n✅ Vendor Scorecard generated and saved to 'vendor_scorecard.csv'")
print(scorecard_df.sort_values("Lending Score", ascending=False))