In [29]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tabulate import tabulate
from IPython.display import Markdown, display

def calculate_metrics_for_models(model_results, task="", tablefmt="grid"):
    """
    Compute overall classification metrics for several models and print them as one table.

    Each results file is loaded with ``pd.read_json`` and must expose ``y_true`` and
    ``y_pred``. Accuracy and weighted precision / recall / F1 are reported as percentages
    rounded to two decimals.

    Parameters:
    - model_results (list of tuples): Each tuple is (model_name, file_path, color, section_break).
      ``color`` is a palette name (e.g. "red"), a raw ANSI escape sequence, or None for
      no coloring; any other value is ignored. When ``section_break`` is truthy an empty
      row is appended after the model's row.
    - task (str): Title printed above the table.
    - tablefmt (str): Style passed to tabulate (e.g., 'github', 'grid').

    Returns:
    - None. The table is printed. Entries whose file lacks the required columns, or whose
      loading/scoring raises, are reported with an error line and skipped.
    """
    COLORS = {
        "green": "\033[92m",
        "red": "\033[91m",
        "yellow": "\033[93m",
        "blue": "\033[94m",
        "magenta": "\033[95m",
        "cyan": "\033[96m",
        "white": "\033[97m",
        None: ""
    }
    RESET = "\033[0m"

    table_rows = []

    for model_name, file_path, color, section_break in model_results:
        try:
            df = pd.read_json(file_path)

            if "y_true" not in df or "y_pred" not in df:
                print(f"Error: Missing 'y_true' or 'y_pred' in {file_path}")
                continue

            y_true = df["y_true"]
            y_pred = df["y_pred"]

            # zero_division=0 is passed to every weighted metric so that a class that is
            # never predicted (or never present) scores 0 without emitting a warning,
            # matching calculate_metrics_with_category_breakdown.
            accuracy = round(accuracy_score(y_true, y_pred) * 100, 2)
            precision = round(precision_score(y_true, y_pred, average="weighted", zero_division=0) * 100, 2)
            recall = round(recall_score(y_true, y_pred, average="weighted", zero_division=0) * 100, 2)
            f1 = round(f1_score(y_true, y_pred, average="weighted", zero_division=0) * 100, 2)

            row = [model_name, accuracy, precision, recall, f1]

            # Resolve the color: palette names map to their escape code, raw ANSI escape
            # sequences are used verbatim, and anything else is ignored so that an
            # unknown name is never printed as literal text in front of each cell.
            if color in COLORS:
                color_code = COLORS[color]
            elif isinstance(color, str) and color.startswith("\033["):
                color_code = color
            else:
                color_code = ""

            if color_code:
                row = [f"{color_code}{val}{RESET}" for val in row]

            table_rows.append(row)

            # Insert a blank row if section_break is True
            if section_break:
                table_rows.append([""] * len(row))

        except Exception as e:
            # Best-effort reporting: one bad file must not abort the whole comparison.
            print(f"Error processing {model_name} ({file_path}): {e}")

    headers = ["Model", "Accuracy (%)", "Precision (%)", "Recall (%)", "F1 Score (%)"]
    print(f"\n📊 **{task} Performance Metrics**\n")
    print(tabulate(table_rows, headers=headers, tablefmt=tablefmt))

def calculate_metrics_per_category(model_results, task="", tablefmt="grid"):
    """
    Print one per-class metrics table (precision, recall, F1, support) for every model.

    Parameters:
    - model_results (list of tuples): Each tuple is (model_name, file_path, color, section_break).
      The file is read with ``pd.read_json`` and must contain ``y_true`` and ``y_pred``.
      ``color`` selects an entry of the local ANSI palette (unknown values mean no color);
      a truthy ``section_break`` prints a line of '=' after that model's table.
    - task (str): Title printed above each table.
    - tablefmt (str): Style passed to tabulate (e.g., 'github', 'grid').

    Nothing is returned. Problems with a single entry are printed and the loop moves on.
    """
    reset_code = "\033[0m"
    palette = {
        None: "",
        "red": "\033[91m",
        "green": "\033[92m",
        "yellow": "\033[93m",
        "blue": "\033[94m",
        "magenta": "\033[95m",
        "cyan": "\033[96m",
        "white": "\033[97m",
    }
    # Aggregate entries of the report dict that are not individual classes.
    aggregate_keys = ("accuracy", "macro avg", "weighted avg")
    column_titles = ["Class", "Precision (%)", "Recall (%)", "F1 Score (%)", "Support"]

    for model_name, file_path, color, section_break in model_results:
        try:
            frame = pd.read_json(file_path)

            if not ("y_true" in frame and "y_pred" in frame):
                print(f"Error: Missing 'y_true' or 'y_pred' in {file_path}")
                continue

            report = classification_report(
                frame["y_true"],
                frame["y_pred"],
                digits=2,
                output_dict=True,
                zero_division=0,
            )

            def tint(cells):
                # Wrap every cell in the model's ANSI color, if it has one.
                ansi = palette.get(color, "")
                if not ansi:
                    return cells
                return [f"{ansi}{cell}{reset_code}" for cell in cells]

            # One row per class, in the order the report lists them; scores as percentages.
            table_rows = [
                tint([
                    label,
                    round(report[label]["precision"] * 100, 2),
                    round(report[label]["recall"] * 100, 2),
                    round(report[label]["f1-score"] * 100, 2),
                    int(report[label]["support"]),
                ])
                for label in report
                if label not in aggregate_keys
            ]

            print(f"\n📊 **{task} — {model_name} Per-Category Metrics**\n")
            print(tabulate(table_rows, headers=column_titles, tablefmt=tablefmt))

            if section_break:
                print(f"\n{'=' * 80}\n")

        except Exception as err:
            print(f"Error processing {model_name} ({file_path}): {err}")

def calculate_metrics_with_category_breakdown(model_results, task="", tablefmt_summary="grid", tablefmt_category="grid"):
    """
    Print an overall summary table followed by one per-category table for each model.

    For every entry the results file is read with ``pd.read_json``; its ``y_true`` and
    ``y_pred`` columns are converted to strings before scoring. Accuracy and weighted
    precision / recall / F1 go into the summary table. Each per-category table lists the
    per-class scores (as percentages) and ends with a row repeating the overall metrics.
    Headings are rendered with IPython Markdown display.

    Parameters:
    - model_results (list of tuples): Each tuple is (model_name, file_path, color, section_break).
      ``color`` selects an entry of the local ANSI palette (unknown values mean no color);
      a truthy ``section_break`` prints a line of '=' after that model's breakdown table.
    - task (str): Title used in the headings. If it contains "Humanitarian" or
      "Informative", the matching label descriptions are shown; otherwise the
      description column is left empty.
    - tablefmt_summary (str): tabulate style for the summary table.
    - tablefmt_category (str): tabulate style for the per-category tables.

    Returns:
    - None. Entries that lack the required columns or raise while loading/scoring are
      reported with an error line and left out of both outputs.
    """

    COLORS = {
        "green": "\033[92m",
        "red": "\033[91m",
        "yellow": "\033[93m",
        "blue": "\033[94m",
        "magenta": "\033[95m",
        "cyan": "\033[96m",
        "white": "\033[97m",
        None: ""
    }
    RESET = "\033[0m"

    # Label descriptions for Humanitarian task
    description_humanitarian = {
        "0": "Affected individuals",
        "1": "Rescue/volunteering/donation",
        "2": "Infrastructure & utility damage",
        "3": "Other relevant info",
        "4": "Not humanitarian"
    }

    # Label descriptions for Informative task
    description_informative = {
        "0": "Not informative",
        "1": "Informative"
    }

    # Choose correct description dict based on task name
    if "Humanitarian" in task:
        description_dict = description_humanitarian
    elif "Informative" in task:
        description_dict = description_informative
    else:
        description_dict = {}

    def _paint(cells, color_code):
        # Wrap every cell in the ANSI color code; leave the row untouched when there is none.
        if not color_code:
            return cells
        return [f"{color_code}{cell}{RESET}" for cell in cells]

    def _label_order(label):
        # Labels are strings here; order numeric ones by value so "2" precedes "10",
        # and place any non-numeric labels after them in alphabetical order.
        if label.isdecimal():
            return (0, int(label), "")
        return (1, 0, label)

    # Entries of the classification report that are aggregates rather than classes.
    aggregate_keys = ("accuracy", "macro avg", "weighted avg")

    summary_rows = []
    per_model_breakdowns = []

    for model_name, file_path, color, section_break in model_results:
        try:
            df = pd.read_json(file_path)

            if "y_true" not in df or "y_pred" not in df:
                print(f"Error: Missing 'y_true' or 'y_pred' in {file_path}")
                continue

            # Compare labels as strings so they line up with the description dict keys.
            y_true = df["y_true"].astype(str)
            y_pred = df["y_pred"].astype(str)

            # Overall metrics
            accuracy = round(accuracy_score(y_true, y_pred) * 100, 2)
            precision = round(precision_score(y_true, y_pred, average="weighted", zero_division=0) * 100, 2)
            recall = round(recall_score(y_true, y_pred, average="weighted", zero_division=0) * 100, 2)
            f1 = round(f1_score(y_true, y_pred, average="weighted", zero_division=0) * 100, 2)

            color_code = COLORS.get(color, "")
            summary_rows.append(_paint([model_name, accuracy, precision, recall, f1], color_code))

            # Per-category breakdown
            report = classification_report(y_true, y_pred, digits=2, output_dict=True, zero_division=0)
            classes = [key for key in report if key not in aggregate_keys]

            category_rows = []
            for cls in sorted(classes, key=_label_order):  # stable, numeric-aware order
                cls_metrics = report[cls]
                cat_row = [
                    cls,
                    description_dict.get(cls, ""),
                    round(cls_metrics["precision"] * 100, 2),
                    round(cls_metrics["recall"] * 100, 2),
                    round(cls_metrics["f1-score"] * 100, 2),
                    int(cls_metrics["support"])
                ]
                category_rows.append(_paint(cat_row, color_code))

            # Add overall metrics row at the end
            overall_row = [
                "",
                f"Overall accuracy: {accuracy}%",
                precision,
                recall,
                f1,
                "—"
            ]
            category_rows.append(_paint(overall_row, color_code))

            per_model_breakdowns.append((model_name, category_rows, section_break))

        except Exception as e:
            # Best-effort reporting: one bad file must not abort the whole comparison.
            print(f"Error processing {model_name} ({file_path}): {e}")

    # Print overall summary table first
    headers_summary = ["Model", "Accuracy (%)", "Precision (%)", "Recall (%)", "F1 Score (%)"]
    display(Markdown(f"📊 **{task} — Overall Performance Summary**"))
    print(tabulate(summary_rows, headers=headers_summary, tablefmt=tablefmt_summary))

    # Then print per-category breakdowns
    headers_category = ["Label", "Description", "Precision (%)", "Recall (%)", "F1 Score (%)", "Support"]
    for model_name, category_rows, section_break in per_model_breakdowns:
        display(Markdown(f"🗂️ **{task} — {model_name} Per-Category Metrics**"))
        print(tabulate(category_rows, headers=headers_category, tablefmt=tablefmt_category))

        if section_break:
            print("\n" + "=" * 80 + "\n")

# GPT FEW SHOTS - HUMANITARIAN

In [30]:
# Result files to compare for the humanitarian task (Test-17 run).
# Each tuple is (display label, results JSON path, row color, section_break).
# In this list red marks zero-shot, green one-shot, blue five-shot and yellow the
# "inconsistent" five-shot run; section_break=True closes each group of related runs.
# NOTE(review): labels are space-padded by hand and file names mix "OneShot"/"Oneshot";
# presumably the paths mirror the files on disk, so keep them verbatim.
model_results = [
    # GPT-4o, text only
    ("Zeroshot Text Only  - 4o", "test_results/humanitarian/Test-17/gpt-4o-Zeroshot-Text-Only.json", "red", False),
    ("OneShot Text Only   - 4o", "test_results/humanitarian/Test-17/gpt-4o-OneShot-Text-Only.json", "green", False),
    ("FiveShot Text Only  - 4o", "test_results/humanitarian/Test-17/gpt-4o-5shot-Text-Only.json", "blue", False),
    ("Fiveshots Text Only - 4o Inconsistent", "test_results/humanitarian/Test-17/gpt-4o-5shot-Inconsistent-Text-Only.json", "yellow", True),
    
    # GPT-4o, image only
    ("Zeroshot Image Only - 4o", "test_results/humanitarian/Test-17/gpt-4o-Zeroshot-Image-Only.json", "red", False),
    ("OneShot Image Only  - 4o", "test_results/humanitarian/Test-17/gpt-4o-Oneshot-Image-Only.json", "green", False),
    ("FiveShot Image Only - 4o", "test_results/humanitarian/Test-17/gpt-4o-5shot-Image-Only.json", "blue", True),    
    
    
    # GPT-4o, text + image
    ("Zeroshot Text Image - 4o", "test_results/humanitarian/Test-17/gpt-4o-Zeroshot-Text-Image.json", "red", False),
    ("OneShot Text Image  - 4o", "test_results/humanitarian/Test-17/gpt-4o-Oneshot-Text-Image.json", "green", False),
    ("FiveShot Text Image - 4o", "test_results/humanitarian/Test-17/gpt-4o-5shot-Text-Image.json", "blue", True),  
    
  
    # GPT-4o mini, text only
    ("Zeroshot Text Only  - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-Zeroshot-Text-Only.json", "red", False),
    ("OneShot Text Only   - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-Oneshot-Text-Only.json", "green", False),
    ("FiveShot Text Only  - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-5shot-Text-Only.json", "blue", True),
    
    # GPT-4o mini, image only
    ("Zeroshot Image Only - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-Zeroshot-Image-Only.json", "red", False),
    ("OneShot Image Only  - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-Oneshot-Image-Only.json", "green", False),
    ("FiveShot Image Only - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-5shot-Image-Only.json", "blue", True),    
    
    
    # GPT-4o mini, text + image (last group, so no trailing section break)
    ("Zeroshot Text Image - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-Zeroshot-Text-Image.json", "red", False),
    ("OneShot Text Image  - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-Oneshot-Text-Image.json", "green", False),
    ("FiveShot Text Image - 4o mini", "test_results/humanitarian/Test-17/gpt-4o-mini-5shot-Text-Image.json", "blue", False)    


]

# Alternative: summary table only, without the per-category breakdown.
#calculate_metrics_for_models(model_results, task="Tweet Humanitarian with GPT Fewshot")

# "Humanitarian" in the task title selects the humanitarian label descriptions.
calculate_metrics_with_category_breakdown(model_results, task="GPT Zeroshot & Fewshot - Humanitarian")


📊 **GPT Zeroshot & Fewshot - Humanitarian — Overall Performance Summary**

+---------------------------------------+----------------+-----------------+--------------+----------------+
| Model                                 |   Accuracy (%) |   Precision (%) |   Recall (%) |   F1 Score (%) |
| [91mZeroshot Text Only  - 4o[0m              |          [91m74.66[0m |           [91m75.03[0m |        [91m74.66[0m |          [91m74.27[0m |
+---------------------------------------+----------------+-----------------+--------------+----------------+
| [92mOneShot Text Only   - 4o[0m              |          [92m71.1[0m  |           [92m76.59[0m |        [92m71.1[0m  |          [92m72[0m    |
+---------------------------------------+----------------+-----------------+--------------+----------------+
| [94mFiveShot Text Only  - 4o[0m              |          [94m71.1[0m  |           [94m76.78[0m |        [94m71.1[0m  |          [94m72.17[0m |
+---------------------------------------+----------------+-----------------+--------------+-----------

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — Zeroshot Text Only  - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m15.79[0m |        [91m33.33[0m |          [91m21.43[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m80.8[0m  |        [91m80.16[0m |          [91m80.48[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |           [91m77.05[0m |        [91m58.02[0m |          [91m66.2[0m  | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — OneShot Text Only   - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m12.2[0m  |        [92m55.56[0m |          [92m20[0m    | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m65[0m    |        [92m92.86[0m |          [92m76.47[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m65.26[0m |        [92m76.54[0m |          [92m70.45[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — FiveShot Text Only  - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mAffected individuals[0m            |           [94m15.69[0m |        [94m88.89[0m |          [94m26.67[0m | [94m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mRescue/volunteering/donation[0m    |           [94m64.13[0m |        [94m93.65[0m |          [94m76.13[0m | [94m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m2[0m       | [94mInfrastructure & utility damage[0m |           [94m59.79[0m |        [94m71.6[0m  |          [94m65.17[0m | [94m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — Fiveshots Text Only - 4o Inconsistent Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [93m0[0m       | [93mAffected individuals[0m            |           [93m12.96[0m |        [93m77.78[0m |          [93m22.22[0m | [93m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [93m1[0m       | [93mRescue/volunteering/donation[0m    |           [93m61.5[0m  |        [93m91.27[0m |          [93m73.48[0m | [93m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [93m2[0m       | [93mInfrastructure & utility damage[0m |           [93m61.46[0m |        [93m72.84[0m |          [93m66.67[0m | [93m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — Zeroshot Image Only - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m30.43[0m |        [91m77.78[0m |          [91m43.75[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m68.05[0m |        [91m91.27[0m |          [91m77.97[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |           [91m75.79[0m |        [91m88.89[0m |          [91m81.82[0m | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — OneShot Image Only  - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m22.58[0m |        [92m77.78[0m |          [92m35[0m    | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m69.75[0m |        [92m89.68[0m |          [92m78.47[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m82.56[0m |        [92m87.65[0m |          [92m85.03[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — FiveShot Image Only - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mAffected individuals[0m            |           [94m26.67[0m |        [94m88.89[0m |          [94m41.03[0m | [94m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mRescue/volunteering/donation[0m    |           [94m73.25[0m |        [94m91.27[0m |          [94m81.27[0m | [94m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m2[0m       | [94mInfrastructure & utility damage[0m |           [94m78.49[0m |        [94m90.12[0m |          [94m83.91[0m | [94m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — Zeroshot Text Image - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m23.08[0m |        [91m66.67[0m |          [91m34.29[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m56.62[0m |        [91m98.41[0m |          [91m71.88[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |           [91m73.68[0m |        [91m86.42[0m |          [91m79.55[0m | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — OneShot Text Image  - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m17.78[0m |        [92m88.89[0m |          [92m29.63[0m | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m61.73[0m |        [92m96.03[0m |          [92m75.16[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m69.81[0m |        [92m91.36[0m |          [92m79.14[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — FiveShot Text Image - 4o Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mAffected individuals[0m            |           [94m18[0m    |       [94m100[0m    |          [94m30.51[0m | [94m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mRescue/volunteering/donation[0m    |           [94m63.49[0m |        [94m95.24[0m |          [94m76.19[0m | [94m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m2[0m       | [94mInfrastructure & utility damage[0m |           [94m76[0m    |        [94m93.83[0m |          [94m83.98[0m | [94m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — Zeroshot Text Only  - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m16.22[0m |        [91m66.67[0m |          [91m26.09[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m72.61[0m |        [91m90.48[0m |          [91m80.57[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |           [91m79.63[0m |        [91m53.09[0m |          [91m63.7[0m  | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — OneShot Text Only   - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m16.67[0m |        [92m77.78[0m |          [92m27.45[0m | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m64.74[0m |        [92m88.89[0m |          [92m74.92[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m75.81[0m |        [92m58.02[0m |          [92m65.73[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — FiveShot Text Only  - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mAffected individuals[0m            |           [94m12.07[0m |        [94m77.78[0m |          [94m20.9[0m  | [94m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mRescue/volunteering/donation[0m    |           [94m64.04[0m |        [94m90.48[0m |          [94m75[0m    | [94m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m2[0m       | [94mInfrastructure & utility damage[0m |           [94m72.31[0m |        [94m58.02[0m |          [94m64.38[0m | [94m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — Zeroshot Image Only - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m25[0m    |        [91m77.78[0m |          [91m37.84[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m72.26[0m |        [91m88.89[0m |          [91m79.72[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |           [91m82.93[0m |        [91m83.95[0m |          [91m83.44[0m | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — OneShot Image Only  - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m25[0m    |        [92m77.78[0m |          [92m37.84[0m | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m71.07[0m |        [92m89.68[0m |          [92m79.3[0m  | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m81.93[0m |        [92m83.95[0m |          [92m82.93[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — FiveShot Image Only - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mAffected individuals[0m            |           [94m21.88[0m |        [94m77.78[0m |          [94m34.15[0m | [94m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mRescue/volunteering/donation[0m    |           [94m72.19[0m |        [94m86.51[0m |          [94m78.7[0m  | [94m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m2[0m       | [94mInfrastructure & utility damage[0m |           [94m91.89[0m |        [94m83.95[0m |          [94m87.74[0m | [94m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — Zeroshot Text Image - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m15.09[0m |        [91m88.89[0m |          [91m25.81[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m61.11[0m |        [91m96.03[0m |          [91m74.69[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |           [91m78.57[0m |        [91m81.48[0m |          [91m80[0m    | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — OneShot Text Image  - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m14.29[0m |        [92m88.89[0m |          [92m24.62[0m | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m65.03[0m |        [92m94.44[0m |          [92m77.02[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m79.07[0m |        [92m83.95[0m |          [92m81.44[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **GPT Zeroshot & Fewshot - Humanitarian — FiveShot Text Image - 4o mini Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mAffected individuals[0m            |           [94m22.22[0m |        [94m88.89[0m |          [94m35.56[0m | [94m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mRescue/volunteering/donation[0m    |           [94m64.71[0m |        [94m96.03[0m |          [94m77.32[0m | [94m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m2[0m       | [94mInfrastructure & utility damage[0m |           [94m68.87[0m |        [94m90.12[0m |          [94m78.07[0m | [94m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

# LLAMA ZERO-SHOT & FEW-SHOT - HUMANITARIAN

In [31]:
# Llama-3.2-11B runs on the humanitarian task (Test-23), grouped by input modality.
# Each entry is a 4-tuple; presumably it follows the same
# (model_name, file_path, color, section_break) layout documented for
# calculate_metrics_for_models -- confirm against the breakdown helper.
text_only_runs = [
    (
        "Zeroshot Text Only  - Llama",
        "test_results/humanitarian/Test-23/Llama-3.2-11B-Huma-Zeroshot-Text-Only.json",
        "red",
        False,
    ),
    (
        "Oneshot Text Only  - Llama",
        "test_results/humanitarian/Test-23/Llama-3.2-11B-Huma-Oneshot-Text-Only.json",
        "green",
        False,
    ),
    (
        "Fiveshot Text Only  - Llama",
        "test_results/humanitarian/Test-23/Llama-3.2-11B-Huma-Fiveshot-Text-Only.json",
        "blue",
        True,  # last text-only entry
    ),
]

image_only_runs = [
    (
        "Zeroshot Image Only  - Llama",
        "test_results/humanitarian/Test-23/Llama-3.2-11B-Huma-Zeroshot-Image-Only.json",
        "red",
        False,
    ),
    (
        "Oneshot Image Only   - Llama",
        "test_results/humanitarian/Test-23/Llama-3.2-11B-Huma-Oneshot-Image-Only.json",
        "green",
        True,  # last image-only entry
    ),
]

text_image_runs = [
    (
        "Zeroshot Text Image  - Llama",
        "test_results/humanitarian/Test-23/Llama-3.2-11B-Huma-Zeroshot-Text-Image.json",
        "red",
        False,
    ),
    (
        "Oneshot Text Image   - Llama",
        "test_results/humanitarian/Test-23/Llama-3.2-11B-Huma-Oneshot-Text-Image.json",
        "green",
        False,
    ),
]

# Keep the original ordering: text-only, then image-only, then text+image.
model_results = text_only_runs + image_only_runs + text_image_runs

calculate_metrics_with_category_breakdown(
    model_results,
    task="LLama Zeroshot & Fewshot - Humanitarian",
)


📊 **LLama Zeroshot & Fewshot - Humanitarian — Overall Performance Summary**

+------------------------------+----------------+-----------------+--------------+----------------+
| Model                        |   Accuracy (%) |   Precision (%) |   Recall (%) |   F1 Score (%) |
| [91mZeroshot Text Only  - Llama[0m  |          [91m74.45[0m |           [91m78.43[0m |        [91m74.45[0m |          [91m74.09[0m |
+------------------------------+----------------+-----------------+--------------+----------------+
| [92mOneshot Text Only  - Llama[0m   |          [92m74.45[0m |           [92m77.21[0m |        [92m74.45[0m |          [92m75.09[0m |
+------------------------------+----------------+-----------------+--------------+----------------+
| [94mFiveshot Text Only  - Llama[0m  |          [94m71.52[0m |           [94m75.32[0m |        [94m71.52[0m |          [94m72.65[0m |
+------------------------------+----------------+-----------------+--------------+----------------+
| [91mZeroshot Image Only  - Llama[0m |          [91m76.54[0m

🗂️ **LLama Zeroshot & Fewshot - Humanitarian — Zeroshot Text Only  - Llama Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m35.29[0m |        [91m66.67[0m |          [91m46.15[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m76.39[0m |        [91m87.3[0m  |          [91m81.48[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |          [91m100[0m    |        [91m27.16[0m |          [91m42.72[0m | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **LLama Zeroshot & Fewshot - Humanitarian — Oneshot Text Only  - Llama Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m22.22[0m |        [92m66.67[0m |          [92m33.33[0m | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m68.52[0m |        [92m88.1[0m  |          [92m77.08[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m64.1[0m  |        [92m61.73[0m |          [92m62.89[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **LLama Zeroshot & Fewshot - Humanitarian — Fiveshot Text Only  - Llama Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mAffected individuals[0m            |           [94m14.29[0m |        [94m66.67[0m |          [94m23.53[0m | [94m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mRescue/volunteering/donation[0m    |           [94m68.87[0m |        [94m82.54[0m |          [94m75.09[0m | [94m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [94m2[0m       | [94mInfrastructure & utility damage[0m |           [94m53.95[0m |        [94m50.62[0m |          [94m52.23[0m | [94m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **LLama Zeroshot & Fewshot - Humanitarian — Zeroshot Image Only  - Llama Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m36.36[0m |        [91m44.44[0m |          [91m40[0m    | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m78.08[0m |        [91m45.24[0m |          [91m57.29[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |           [91m72[0m    |        [91m66.67[0m |          [91m69.23[0m | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **LLama Zeroshot & Fewshot - Humanitarian — Oneshot Image Only   - Llama Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |           [92m60[0m    |        [92m33.33[0m |          [92m42.86[0m | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m70.37[0m |        [92m45.24[0m |          [92m55.07[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m67.14[0m |        [92m58.02[0m |          [92m62.25[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **LLama Zeroshot & Fewshot - Humanitarian — Zeroshot Text Image  - Llama Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mAffected individuals[0m            |           [91m25[0m    |        [91m33.33[0m |          [91m28.57[0m | [91m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mRescue/volunteering/donation[0m    |           [91m70[0m    |        [91m72.22[0m |          [91m71.09[0m | [91m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [91m2[0m       | [91mInfrastructure & utility damage[0m |          [91m100[0m    |        [91m32.1[0m  |          [91m48.6[0m  | [91m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

🗂️ **LLama Zeroshot & Fewshot - Humanitarian — Oneshot Text Image   - Llama Per-Category Metrics**

+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description                     |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mAffected individuals[0m            |          [92m100[0m    |        [92m11.11[0m |          [92m20[0m    | [92m9[0m         |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mRescue/volunteering/donation[0m    |           [92m72.81[0m |        [92m65.87[0m |          [92m69.17[0m | [92m126[0m       |
+---------+---------------------------------+-----------------+--------------+----------------+-----------+
| [92m2[0m       | [92mInfrastructure & utility damage[0m |           [92m64[0m    |        [92m59.26[0m |          [92m61.54[0m | [92m81[0m        |
+---------+---------------------------------+-----------------+--------------+----

# LLAMA ZERO-SHOT & FEW-SHOT - INFORMATIVE

In [32]:
# Llama-3.2-11B runs on the informative task (Test-23), grouped by input modality.
# Each entry is a 4-tuple; presumably it follows the same
# (model_name, file_path, color, section_break) layout documented for
# calculate_metrics_for_models -- confirm against the breakdown helper.
text_only_runs = [
    (
        "Zeroshot Text Only  - Llama",
        "test_results/informative/Test-23/Llama-3.2-11B-Info-Zeroshot-Text-Only.json",
        "red",
        False,
    ),
    (
        "Oneshot Text Only  - Llama",
        "test_results/informative/Test-23/Llama-3.2-11B-Info-Oneshot-Text-Only.json",
        "green",
        False,
    ),
    (
        "Fiveshot Text Only  - Llama",
        "test_results/informative/Test-23/Llama-3.2-11B-Info-Fiveshot-Text-Only.json",
        "blue",
        True,  # last text-only entry
    ),
]

image_only_runs = [
    (
        "Zeroshot Image Only  - Llama",
        "test_results/informative/Test-23/Llama-3.2-11B-Info-Zeroshot-Image-Only.json",
        "red",
        False,
    ),
    (
        "Oneshot Image Only   - Llama",
        "test_results/informative/Test-23/Llama-3.2-11B-Info-Oneshot-Image-Only.json",
        "green",
        True,  # last image-only entry
    ),
]

text_image_runs = [
    (
        "Zeroshot Text Image  - Llama",
        "test_results/informative/Test-23/Llama-3.2-11B-Info-Zeroshot-Text-Image.json",
        "red",
        False,
    ),
    (
        "Oneshot Text Image   - Llama",
        "test_results/informative/Test-23/Llama-3.2-11B-Info-Oneshot-Text-Image.json",
        "green",
        False,
    ),
]

# Keep the original ordering: text-only, then image-only, then text+image.
model_results = text_only_runs + image_only_runs + text_image_runs

calculate_metrics_with_category_breakdown(
    model_results,
    task="LLama Zeroshot & Fewshot - Informative",
)

📊 **LLama Zeroshot & Fewshot - Informative — Overall Performance Summary**

+------------------------------+----------------+-----------------+--------------+----------------+
| Model                        |   Accuracy (%) |   Precision (%) |   Recall (%) |   F1 Score (%) |
| [91mZeroshot Text Only  - Llama[0m  |          [91m83.57[0m |           [91m83.28[0m |        [91m83.57[0m |          [91m83.22[0m |
+------------------------------+----------------+-----------------+--------------+----------------+
| [92mOneshot Text Only  - Llama[0m   |          [92m82.79[0m |           [92m84.46[0m |        [92m82.79[0m |          [92m81.22[0m |
+------------------------------+----------------+-----------------+--------------+----------------+
| [94mFiveshot Text Only  - Llama[0m  |          [94m82.53[0m |           [94m83.82[0m |        [94m82.53[0m |          [94m81.04[0m |
+------------------------------+----------------+-----------------+--------------+----------------+
| [91mZeroshot Image Only  - Llama[0m |          [91m86.05[0m

🗂️ **LLama Zeroshot & Fewshot - Informative — Zeroshot Text Only  - Llama Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m79.17[0m |        [91m67.86[0m |          [91m73.08[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m85.3[0m  |        [91m91.26[0m |          [91m88.18[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 83.57%[0m |           [91m83.28[0m |        [91m83.57[0m |          [91m83.22[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **LLama Zeroshot & Fewshot - Informative — Oneshot Text Only  - Llama Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m92.25[0m |        [92m51.98[0m |          [92m66.5[0m  | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m80.64[0m |        [92m97.86[0m |          [92m88.42[0m | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 82.79%[0m |           [92m84.46[0m |        [92m82.79[0m |          [92m81.22[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **LLama Zeroshot & Fewshot - Informative — Fiveshot Text Only  - Llama Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [94m0[0m       | [94mNot informative[0m          |           [94m90.14[0m |        [94m52.58[0m |          [94m66.42[0m | [94m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [94m1[0m       | [94mInformative[0m              |           [94m80.73[0m |        [94m97.18[0m |          [94m88.19[0m | [94m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [94m[0m        | [94mOverall accuracy: 82.53%[0m |           [94m83.82[0m |        [94m82.53[0m |          [94m81.04[0m | [94m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+




🗂️ **LLama Zeroshot & Fewshot - Informative — Zeroshot Image Only  - Llama Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m90.28[0m |        [91m64.48[0m |          [91m75.23[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m84.75[0m |        [91m96.6[0m  |          [91m90.29[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 86.05%[0m |           [91m86.57[0m |        [91m86.05[0m |          [91m85.34[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **LLama Zeroshot & Fewshot - Informative — Oneshot Image Only   - Llama Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m95.61[0m |        [92m21.63[0m |          [92m35.28[0m | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m72.18[0m |        [92m99.51[0m |          [92m83.67[0m | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 73.92%[0m |           [92m79.88[0m |        [92m73.92[0m |          [92m67.77[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+




🗂️ **LLama Zeroshot & Fewshot - Informative — Zeroshot Text Image  - Llama Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m97.58[0m |        [91m40.08[0m |          [91m56.82[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m77.24[0m |        [91m99.51[0m |          [91m86.97[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 79.99%[0m |           [91m83.93[0m |        [91m79.99[0m |          [91m77.07[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **LLama Zeroshot & Fewshot - Informative — Oneshot Text Image   - Llama Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m49.14[0m |        [92m79.37[0m |          [92m60.7[0m  | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m85.56[0m |        [92m59.81[0m |          [92m70.4[0m  | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 66.23%[0m |           [92m73.59[0m |        [92m66.23[0m |          [92m67.21[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


# GPT ZERO-SHOT & FEW-SHOT - INFORMATIVE

In [33]:
# GPT runs on the informative task (Test-23), grouped by model variant and,
# within each variant, ordered text-only -> image-only -> text+image.
# Each entry is a 4-tuple; presumably it follows the same
# (model_name, file_path, color, section_break) layout documented for
# calculate_metrics_for_models -- confirm against the breakdown helper.
gpt_4o_runs = [
    # Text only
    (
        "Zeroshot Text Only  - 4o",
        "test_results/informative/Test-23/gpt-4o-Info-Zeroshot-Text-Only.json",
        "red",
        False,
    ),
    (
        "OneShot Text Only   - 4o",
        "test_results/informative/Test-23/gpt-4o-Info-OneShot-Text-Only.json",
        "green",
        True,
    ),
    # Image only
    (
        "Zeroshot Image Only - 4o",
        "test_results/informative/Test-23/gpt-4o-Info-Zeroshot-Image-Only.json",
        "red",
        False,
    ),
    (
        "OneShot Image Only  - 4o",
        "test_results/informative/Test-23/gpt-4o-Info-Oneshot-Image-Only.json",
        "green",
        True,
    ),
    # Text + image
    (
        "Zeroshot Text Image - 4o",
        "test_results/informative/Test-23/gpt-4o-Info-Zeroshot-Text-Image.json",
        "red",
        False,
    ),
    (
        "OneShot Text Image  - 4o",
        "test_results/informative/Test-23/gpt-4o-Info-Oneshot-Text-Image.json",
        "green",
        True,
    ),
]

gpt_4o_mini_runs = [
    # Text only
    (
        "Zeroshot Text Only  - 4o mini",
        "test_results/informative/Test-23/gpt-4o-mini-Info-Zeroshot-Text-Only.json",
        "red",
        False,
    ),
    (
        "OneShot Text Only   - 4o mini",
        "test_results/informative/Test-23/gpt-4o-mini-Info-Oneshot-Text-Only.json",
        "green",
        True,
    ),
    # Image only
    (
        "Zeroshot Image Only - 4o mini",
        "test_results/informative/Test-23/gpt-4o-mini-Info-Zeroshot-Image-Only.json",
        "red",
        False,
    ),
    (
        "OneShot Image Only  - 4o mini",
        "test_results/informative/Test-23/gpt-4o-mini-Info-Oneshot-Image-Only.json",
        "green",
        True,
    ),
    # Text + image
    (
        "Zeroshot Text Image - 4o mini",
        "test_results/informative/Test-23/gpt-4o-mini-Info-Zeroshot-Text-Image.json",
        "red",
        False,
    ),
    (
        "OneShot Text Image  - 4o mini",
        "test_results/informative/Test-23/gpt-4o-mini-Info-Oneshot-Text-Image.json",
        "green",
        False,
    ),
]

# Keep the original ordering: all 4o runs first, then all 4o mini runs.
model_results = gpt_4o_runs + gpt_4o_mini_runs

calculate_metrics_with_category_breakdown(
    model_results,
    task="GPT Zeroshot & Fewshot - Informative",
)

📊 **GPT Zeroshot & Fewshot - Informative — Overall Performance Summary**

+-------------------------------+----------------+-----------------+--------------+----------------+
| Model                         |   Accuracy (%) |   Precision (%) |   Recall (%) |   F1 Score (%) |
| [91mZeroshot Text Only  - 4o[0m      |          [91m79.47[0m |           [91m81.38[0m |        [91m79.47[0m |          [91m79.93[0m |
+-------------------------------+----------------+-----------------+--------------+----------------+
| [92mOneShot Text Only   - 4o[0m      |          [92m84.16[0m |           [92m83.94[0m |        [92m84.16[0m |          [92m83.99[0m |
+-------------------------------+----------------+-----------------+--------------+----------------+
| [91mZeroshot Image Only - 4o[0m      |          [91m85.14[0m |           [91m85.58[0m |        [91m85.14[0m |          [91m85.29[0m |
+-------------------------------+----------------+-----------------+--------------+----------------+
| [92mOneShot Image Only  - 4o[0m      |          [92m

🗂️ **GPT Zeroshot & Fewshot - Informative — Zeroshot Text Only  - 4o Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m65.12[0m |        [91m80.75[0m |          [91m72.1[0m  | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m89.33[0m |        [91m78.83[0m |          [91m83.75[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 79.47%[0m |           [91m81.38[0m |        [91m79.47[0m |          [91m79.93[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **GPT Zeroshot & Fewshot - Informative — OneShot Text Only   - 4o Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m78.06[0m |        [92m72.02[0m |          [92m74.92[0m | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m86.81[0m |        [92m90.1[0m  |          [92m88.42[0m | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 84.16%[0m |           [92m83.94[0m |        [92m84.16[0m |          [92m83.99[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+




🗂️ **GPT Zeroshot & Fewshot - Informative — Zeroshot Image Only - 4o Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m75.18[0m |        [91m81.75[0m |          [91m78.33[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m90.67[0m |        [91m86.8[0m  |          [91m88.69[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 85.14%[0m |           [91m85.58[0m |        [91m85.14[0m |          [91m85.29[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **GPT Zeroshot & Fewshot - Informative — OneShot Image Only  - 4o Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m79.21[0m |        [92m79.37[0m |          [92m79.29[0m | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m89.89[0m |        [92m89.81[0m |          [92m89.85[0m | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 86.38%[0m |           [92m86.38[0m |        [92m86.38[0m |          [92m86.38[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+




🗂️ **GPT Zeroshot & Fewshot - Informative — Zeroshot Text Image - 4o Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m89.24[0m |        [91m72.42[0m |          [91m79.96[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m87.64[0m |        [91m95.73[0m |          [91m91.51[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 88.07%[0m |           [91m88.17[0m |        [91m88.07[0m |          [91m87.71[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **GPT Zeroshot & Fewshot - Informative — OneShot Text Image  - 4o Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m92.82[0m |        [92m64.09[0m |          [92m75.82[0m | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m84.74[0m |        [92m97.57[0m |          [92m90.7[0m  | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 86.57%[0m |           [92m87.39[0m |        [92m86.57[0m |          [92m85.81[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+




🗂️ **GPT Zeroshot & Fewshot - Informative — Zeroshot Text Only  - 4o mini Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m91.4[0m  |        [91m63.29[0m |          [91m74.79[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m84.39[0m |        [91m97.09[0m |          [91m90.29[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 85.98%[0m |           [91m86.69[0m |        [91m85.98[0m |          [91m85.2[0m  | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **GPT Zeroshot & Fewshot - Informative — OneShot Text Only   - 4o mini Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m98.31[0m |        [92m46.23[0m |          [92m62.89[0m | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m79.11[0m |        [92m99.61[0m |          [92m88.18[0m | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 82.07%[0m |           [92m85.42[0m |        [92m82.07[0m |          [92m79.87[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+




🗂️ **GPT Zeroshot & Fewshot - Informative — Zeroshot Image Only - 4o mini Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m81.6[0m  |        [91m79.17[0m |          [91m80.36[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m89.95[0m |        [91m91.26[0m |          [91m90.6[0m  | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 87.29%[0m |           [91m87.21[0m |        [91m87.29[0m |          [91m87.24[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **GPT Zeroshot & Fewshot - Informative — OneShot Image Only  - 4o mini Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m90.86[0m |        [92m73.02[0m |          [92m80.97[0m | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m87.95[0m |        [92m96.41[0m |          [92m91.99[0m | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 88.72%[0m |           [92m88.91[0m |        [92m88.72[0m |          [92m88.37[0m | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+




🗂️ **GPT Zeroshot & Fewshot - Informative — Zeroshot Text Image - 4o mini Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [91m0[0m       | [91mNot informative[0m          |           [91m96.1[0m  |        [91m58.73[0m |          [91m72.91[0m | [91m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m1[0m       | [91mInformative[0m              |           [91m83.03[0m |        [91m98.83[0m |          [91m90.25[0m | [91m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [91m[0m        | [91mOverall accuracy: 85.66%[0m |           [91m87.33[0m |        [91m85.66[0m |          [91m84.55[0m | [91m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+


🗂️ **GPT Zeroshot & Fewshot - Informative — OneShot Text Image  - 4o mini Per-Category Metrics**

+---------+--------------------------+-----------------+--------------+----------------+-----------+
| Label   | Description              |   Precision (%) |   Recall (%) |   F1 Score (%) | Support   |
| [92m0[0m       | [92mNot informative[0m          |           [92m93.91[0m |        [92m67.26[0m |          [92m78.38[0m | [92m504[0m       |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m1[0m       | [92mInformative[0m              |           [92m85.93[0m |        [92m97.86[0m |          [92m91.51[0m | [92m1030[0m      |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
| [92m[0m        | [92mOverall accuracy: 87.81%[0m |           [92m88.55[0m |        [92m87.81[0m |          [92m87.2[0m  | [92m—[0m         |
+---------+--------------------------+-----------------+--------------+----------------+-----------+
