In [140]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import altair as alt
import json
from preprocessing_llm_output import *
from evaluate import *
from eval_metrics import *
import numpy as np
from huggingface_hub import login
from datasets import Dataset
login()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Load the BLEU and ROUGE metric objects once; calculate_bleu_column_wise reads
# them as notebook-level globals.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

In [4]:
def organize_dfs(llm_output_path, gt_path, zsCombined=False):
    """Load ground truth and LLM output as two identically shaped DataFrames.

    Parameters
    ----------
    llm_output_path : path of the JSON file holding the raw LLM output.
    gt_path : path of the ground-truth JSON file (read with
        ``orient="records"``); it is also handed to the standardizer.
    zsCombined : when True the output is standardized with
        ``zsCombined_standardized_finalized_llm_output``; otherwise with
        ``standardized_finalized_llm_output``.

    Returns
    -------
    (gt_df, llm_df) : both indexed by integer ``pmcid``, sorted by index and
        restricted to the evaluation categories present in the ground truth,
        in the fixed evaluation order.

    Raises
    ------
    AssertionError if NaN values remain, if ``pmcid`` is missing or not unique,
    or if the two frames end up with different columns or indices.
    """
    category_order = (
        "Vitals_Hema", "Neuro", "EENT", "CVS", "RESP", "GI",
        "GU", "MSK", "DERM", "Lab_Image", "LYMPH", "History",
        "ENDO", "Pregnancy",
    )

    def _blank_to_empty_list(cell):
        # None and float NaN both mean "nothing annotated" -> empty list.
        if cell is None:
            return []
        if isinstance(cell, float) and pd.isna(cell):
            return []
        return cell

    # --- Annotator ground truth ---
    reference = pd.read_json(gt_path, orient="records").map(_blank_to_empty_list)
    reference["pmcid"] = reference["pmcid"].astype(int)
    reference = reference.sort_index(kind="stable")
    assert reference.isna().sum().sum() == 0, "There are NaN values in the DataFrame"

    # --- Raw LLM output ---
    with open(llm_output_path, "r", encoding="utf-8") as handle:
        raw_output = json.load(handle)

    # Pick the standardizer that matches the prompt format.
    standardize = (
        zsCombined_standardized_finalized_llm_output
        if zsCombined
        else standardized_finalized_llm_output
    )
    predicted = make_df(standardize(raw_output, gt_path)).map(process_item)
    predicted["pmcid"] = predicted["pmcid"].astype(int)
    assert predicted.isna().sum().sum() == 0, f"There are NaN values in the DataFrame:\n{predicted[predicted.isna().any(axis=1)]}"

    # --- Index both frames by pmcid ---
    assert "pmcid" in reference.columns and reference["pmcid"].is_unique, "Missing or non-unique 'pmcid' in gt_df"
    assert "pmcid" in predicted.columns and predicted["pmcid"].is_unique, "Missing or non-unique 'pmcid' in llm_df"
    reference = reference.set_index("pmcid").sort_index(kind="stable")
    predicted = predicted.set_index("pmcid").sort_index(kind="stable")

    # --- Same categories, same order, in both frames ---
    kept = [name for name in category_order if name in reference.columns]
    reference = reference[kept]
    predicted = predicted[kept]

    assert reference.columns.equals(predicted.columns), "Columns of gt_df and llm_df do not match"
    assert reference.index.equals(predicted.index), "Indices of gt_df and llm_df do not match"

    return reference, predicted


In [132]:
# # Llama3_Extracted IEM DS
# gt_df, llm_df = organize_dfs("../prompting/results/llama_fsSubheadingCategory.json", "../../output/gt_df_reconciled.json")
# llm_df = llm_df.reset_index()
# llm_df["pmcid"] = llm_df["pmcid"].astype(str)

# iem_df = pd.read_csv("../../human_verified_140_pmcids.csv", encoding="ISO-8859-1")
# iem_df["PMCID"] =  iem_df["PMCID"].astype(str)
# iem_dict = iem_df.set_index(iem_df.columns[0]).iloc[:, 0].to_dict()

# mito_pmcid = pd.read_csv("../../Mitochondrial_Disorder_PMCIDs.csv")["PMCID"].tolist()
# mito_pmcid = [str(id) for id in mito_pmcid] 
# iem_pmcid = iem_df["PMCID"].tolist()

# iem_df = iem_df.merge(llm_df, left_on="PMCID", right_on="pmcid").drop(columns=["PMCID"])
# iem_df["is_mitochondrial?"] = iem_df["pmcid"].isin(mito_pmcid)

# iem_ds = Dataset.from_pandas(iem_df)
# iem_ds.push_to_hub("cxyzhang/iem_llama3_extraction", private=True)

In [None]:
def calculate_bleu_column_wise(df_pred, df_ref):
    """Score every cell of ``df_pred`` against the same cell of ``df_ref``.

    Empty-list cells in either frame are replaced by ``["unk"]`` first. List
    cells are joined with spaces, anything else is converted with ``str``.
    BLEU and ROUGE are computed with the notebook-level ``bleu_metric`` and
    ``rouge_metric`` objects.

    Parameters
    ----------
    df_pred : DataFrame of predictions.
    df_ref : DataFrame of references; must have the same shape as ``df_pred``
        and is looked up with ``df_pred``'s index labels and column names.

    Returns
    -------
    (bleu_scores, rouge_scores) : two dicts mapping each column name to a list
        of per-row score dicts, in ``df_pred.index`` order.

    Raises
    ------
    AssertionError if the two frames differ in shape.
    """
    def _placeholder_if_empty(cell):
        return ["unk"] if isinstance(cell, list) and not cell else cell

    def _as_text(cell):
        return " ".join(cell) if isinstance(cell, list) else str(cell)

    df_pred = df_pred.map(_placeholder_if_empty)
    df_ref = df_ref.map(_placeholder_if_empty)

    assert df_pred.shape == df_ref.shape, "Aligned DataFrames must have the same shape."

    bleu_scores, rouge_scores = {}, {}

    for col in df_pred.columns:
        bleu_rows, rouge_rows = [], []

        for idx in df_pred.index:
            pred = _as_text(df_pred.at[idx, col])
            ref = _as_text(df_ref.at[idx, col])

            n_pred = len(pred.split())
            n_ref = len(ref.split())

            if n_pred != 0 and n_ref != 0:
                # Regular case: let the metric do the work, then record the
                # whitespace-token lengths computed here.
                score = bleu_metric.compute(predictions=[pred], references=[[ref]])
                score.update({"translation_length": n_pred, "reference_length": n_ref})
            elif n_pred == 0 and n_ref == 0:
                # Both sides blank: counted as a perfect match.
                score = {"bleu": 1.0, "precisions": [1.0, 1.0, 1.0, 1.0], "translation_length": 0, "reference_length": 0}
            else:
                # Exactly one side blank: counted as a complete miss.
                score = {"bleu": 0.0, "precisions": [0.0, 0.0, 0.0, 0.0], "translation_length": n_pred, "reference_length": n_ref}

            bleu_rows.append(score)
            rouge_rows.append(rouge_metric.compute(predictions=[pred], references=[[ref]]))

        bleu_scores[col] = bleu_rows
        rouge_scores[col] = rouge_rows

    return bleu_scores, rouge_scores

    
def check_for_hallucination_category_wise(df1, df2, name1, name2):
    """Per-column share of rows that are empty in ``df1`` but filled in ``df2``.

    For every column (after an outer alignment of the two frames' columns) the
    score is ``100 * (# rows where df1 is [] and df2 is not []) / (# rows where
    df1 is [])``; it is ``0`` when ``df1`` has no empty cell in that column.

    Parameters
    ----------
    df1 : reference DataFrame whose empty-list cells define the denominator.
    df2 : DataFrame compared against ``df1``.
    name1, name2 : accepted for the callers' benefit; not used in the
        computation.

    Returns
    -------
    dict mapping column name -> percentage.
    """
    left, right = df1.align(df2, join="outer", axis=1)

    def _is_empty_list(cell):
        return cell == []

    scores = {}
    for column in left.columns:
        left_empty = left[column].map(_is_empty_list)
        right_empty = right[column].map(_is_empty_list)

        empty_in_left = left_empty.sum()
        # Rows where the reference has nothing but the other frame has content.
        filled_only_in_right = (left_empty & ~right_empty).sum()

        if empty_in_left > 0:
            scores[column] = filled_only_in_right / empty_in_left * 100
        else:
            scores[column] = 0

    return scores



In [5]:
def get_token_level_metrics(llm_output_path, gt_path, zsCombined=False):
    """Compute per-category BLEU, ROUGE and hallucination scores for one run.

    Parameters
    ----------
    llm_output_path : path of the JSON file with the LLM output.
    gt_path : path of the ground-truth JSON file.
    zsCombined : forwarded to ``organize_dfs`` to select the standardizer for
        combined-format outputs.

    Returns
    -------
    DataFrame with one row per category plus a final ``'Average'`` row, and
    the columns ``Category``, ``BLEU-1 Avg``, ``BLEU-4 Avg``, ``ROUGE-L Avg``,
    ``ROUGE-LSum Avg`` and ``Hallucination (%)``. The ``'Average'`` row is the
    unweighted mean over categories.

    Raises
    ------
    AssertionError if the ground-truth and LLM frames are not aligned.
    """
    # Forward the caller's flag (not a constant) so that combined-format
    # outputs are parsed with the combined standardizer, as in string_metrics.
    gt_df, llm_df = organize_dfs(llm_output_path, gt_path, zsCombined=zsCombined)

    # Ensure columns and indices match
    assert gt_df.columns.equals(llm_df.columns), "Columns of gt_df and llm_df do not match"
    assert gt_df.index.equals(llm_df.index), "Indices of gt_df and llm_df do not match"

    hallucination_category_wise = check_for_hallucination_category_wise(gt_df, llm_df, name1="gt_df", name2="llm_df")

    bleu, rouge = calculate_bleu_column_wise(df_pred=llm_df, df_ref=gt_df)

    # Per-category means over all rows (precisions[0] is the 1-gram precision,
    # precisions[3] the 4-gram precision).
    bleu_1_avg = {k: sum(entry['precisions'][0] for entry in entries) / len(entries) for k, entries in bleu.items()}
    bleu_4_avg = {k: sum(entry['precisions'][3] for entry in entries) / len(entries) for k, entries in bleu.items()}
    rougeL_avg = {k: sum(entry['rougeL'] for entry in entries) / len(entries) for k, entries in rouge.items()}
    rougeLSum_avg = {k: sum(entry['rougeLsum'] for entry in entries) / len(entries) for k, entries in rouge.items()}

    categories = list(bleu_1_avg.keys())
    results_df = pd.DataFrame({
        'Category': categories,
        'BLEU-1 Avg': [round(bleu_1_avg[cat], 4) for cat in categories],
        'BLEU-4 Avg': [round(bleu_4_avg[cat], 4) for cat in categories],
        'ROUGE-L Avg': [round(rougeL_avg[cat], 4) for cat in categories],
        'ROUGE-LSum Avg': [round(rougeLSum_avg[cat], 4) for cat in categories],
        'Hallucination (%)': [round(hallucination_category_wise.get(cat, 0), 2) for cat in categories]
    })

    # Append the unweighted mean over categories as the last row; downstream
    # code picks it up with iloc[-1].
    results_df = pd.concat([
        results_df,
        pd.DataFrame({
            'Category': ['Average'],
            'BLEU-1 Avg': [round(np.mean(list(bleu_1_avg.values())), 4)],
            'BLEU-4 Avg': [round(np.mean(list(bleu_4_avg.values())), 4)],
            'ROUGE-L Avg': [round(np.mean(list(rougeL_avg.values())), 4)],
            'ROUGE-LSum Avg': [round(np.mean(list(rougeLSum_avg.values())), 4)],
            'Hallucination (%)': [round(np.mean(list(hallucination_category_wise.values())), 2)]
        })
    ])

    return results_df


In [6]:
def string_metrics(llm_output_path, gt_path, zsCombined=False):
    """Summarize the string-level comparison metrics for one run.

    The aligned ground-truth and LLM frames are passed to
    ``calculate_comparison_metrics_ray``; the last row of each of the three
    tables it returns becomes one column of the result.

    Parameters
    ----------
    llm_output_path : path of the JSON file with the LLM output.
    gt_path : path of the ground-truth JSON file.
    zsCombined : forwarded to ``organize_dfs``.

    Returns
    -------
    DataFrame with the columns ``Category``, ``TSR``, ``Levenshtein`` and
    ``ExactMatch``; ``Category`` holds the labels of the metric tables'
    columns.

    Raises
    ------
    AssertionError if the ground-truth and LLM frames are not aligned.
    """
    gt_df, llm_df = organize_dfs(llm_output_path, gt_path, zsCombined=zsCombined)

    # Ensure columns and indices match
    assert gt_df.columns.equals(llm_df.columns), "Columns of gt_df and llm_df do not match"
    assert gt_df.index.equals(llm_df.index), "Indices of gt_df and llm_df do not match"

    tsr_table, levenshtein_table, exact_table = calculate_comparison_metrics_ray(gt_df, llm_df)

    # Last row of each table -> one named single-column frame per metric.
    summary = pd.concat(
        [
            tsr_table.iloc[-1].to_frame(name='TSR'),
            levenshtein_table.iloc[-1].to_frame(name='Levenshtein'),
            exact_table.iloc[-1].to_frame(name='ExactMatch'),
        ],
        axis=1,
    )

    # Turn the row labels into an ordinary 'Category' column.
    return summary.reset_index().rename(columns={'index': 'Category'})

## Llama3:8B

In [32]:
# # fs Token-level metrics
# llama_fsSubheadingCategory = get_token_level_metrics("../prompting/results/llama_fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# llama_fsWholeCategory = get_token_level_metrics("../prompting/results/llama_fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# llama_fsWholeCombined = get_token_level_metrics("../prompting/results/llama_fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# # zs
# llama_zsSubheadingCategory = get_token_level_metrics("../prompting/results/llama_zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# llama_zsWholeCategory = get_token_level_metrics("../prompting/results/llama_zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# llama_zsWholeCombined = get_token_level_metrics("../prompting/results/llama_zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)


In [None]:
# fs string-level metrics
llama_fsSubheadingCategory_distance = string_metrics("../prompting/results/llama_fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
llama_fsWholeCategory_distance = string_metrics("../prompting/results/llama_fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
 
llama_fsWholeCombined_distance = string_metrics("../prompting/results/llama_fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
print("pass fswc")
# zs
llama_zsSubheadingCategory_distance = string_metrics("../prompting/results/llama_zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
print("pass zsSubc")
llama_zsWholeCategory_distance = string_metrics("../prompting/results/llama_zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
print("pass zswC")
llama_zsWholeCombined_distance = string_metrics("../prompting/results/llama_zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
print("zsWC")

## QWEN 2:7B

In [None]:
# fs Token-level metrics
qwen2_fsSubheadingCategory = get_token_level_metrics("../prompting/qwen_prompting/results/qwen_fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_fsWholeCategory = get_token_level_metrics("../prompting/qwen_prompting/results/qwen_fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_fsWholeCombined = get_token_level_metrics("../prompting/qwen_prompting/results/qwen_fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# zs
qwen2_zsSubheadingCategory = get_token_level_metrics("../prompting/qwen_prompting/results/qwen_zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_zsWholeCategory = get_token_level_metrics("../prompting/qwen_prompting/results/qwen_zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_zsWholeCombined = get_token_level_metrics("../prompting/qwen_prompting/results/qwen_zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
 

In [2]:
# # fs string_level
# qwen2_fsSubheadingCategory_distance = string_metrics("../prompting/qwen_prompting/results/qwen_fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# qwen2_fsWholeCategory_distance = string_metrics("../prompting/qwen_prompting/results/qwen_fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# # zs
# qwen2_zsSubheadingCategory_distance = string_metrics("../prompting/qwen_prompting/results/qwen_zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# qwen2_zsWholeCategory_distance = string_metrics("../prompting/qwen_prompting/results/qwen_zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# #wholecombined
# qwen2_zsWholeCombined_distance = string_metrics("../prompting/qwen_prompting/results/qwen_zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# qwen2_fsWholeCombined_distance = string_metrics("../prompting/qwen_prompting/results/qwen_fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
 

## QWEN 2.5:7B

In [None]:
# fs Token-level metrics
qwen2_5_7b_fsSubheadingCategory = get_token_level_metrics("../prompting/qwen2.5_7b/fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_fsWholeCategory = get_token_level_metrics("../prompting/qwen2.5_7b/fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_fsWholeCombined = get_token_level_metrics("../prompting/qwen2.5_7b/fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# zs
qwen2_5_7b_zsSubheadingCategory = get_token_level_metrics("../prompting/qwen2.5_7b/zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_zsWholeCategory = get_token_level_metrics("../prompting/qwen2.5_7b/zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_zsWholeCombined = get_token_level_metrics("../prompting/qwen2.5_7b/zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
 

In [None]:
# fs String-level metrics
qwen2_5_7b_fsSubheadingCategory_distance = string_metrics("../prompting/qwen2.5_7b/fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_fsWholeCategory_distance = string_metrics("../prompting/qwen2.5_7b/fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_fsWholeCombined_distance = string_metrics("../prompting/qwen2.5_7b/fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# zs
qwen2_5_7b_zsSubheadingCategory_distance = string_metrics("../prompting/qwen2.5_7b/zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_zsWholeCategory_distance = string_metrics("../prompting/qwen2.5_7b/zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_7b_zsWholeCombined_distance = string_metrics("../prompting/qwen2.5_7b/zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
 

## QWEN 2.5:32B

In [None]:
# fs Token-level metrics
qwen2_5_32b_fsSubheadingCategory = get_token_level_metrics("../prompting/qwen2.5_32b/fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_32b_fsWholeCategory = get_token_level_metrics("../prompting/qwen2.5_32b/fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_32b_fsWholeCombined = get_token_level_metrics("../prompting/qwen2.5_32b/fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# zs
qwen2_5_32b_zsSubheadingCategory = get_token_level_metrics("../prompting/qwen2.5_32b/zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_32b_zsWholeCategory = get_token_level_metrics("../prompting/qwen2.5_32b/zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_32b_zsWholeCombined = get_token_level_metrics("../prompting/qwen2.5_32b/zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
qwen2_5_32b_cot_zsWholeCateogry =get_token_level_metrics("../prompting/qwen2.5_32b/zsWHoleCategory_cot.json", "../../output/gt_df_reconciled.json", zsCombined=False)
qwen2_5_32b_cot_zsSubheadingCategory = get_token_level_metrics("../prompting/qwen2.5_32b/zsSubheadingCategory_cot.json", "../../output/gt_df_reconciled.json", zsCombined=True)

In [4]:
# # fs string-level metrics
# qwen2_5_32b_fsSubheadingCategory_distance = string_metrics("../prompting/qwen2.5_32b/fsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# qwen2_5_32b_fsWholeCategory_distance = string_metrics("../prompting/qwen2.5_32b/fsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# # zs
# qwen2_5_32b_zsSubheadingCategory_distance = string_metrics("../prompting/qwen2.5_32b/zsSubheadingCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# qwen2_5_32b_zsWholeCategory_distance = string_metrics("../prompting/qwen2.5_32b/zsWholeCategory.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# qwen2_5_32b_zsWholeCombined_distance = string_metrics("../prompting/qwen2.5_32b/zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# qwen2_5_32b_cot_zsWholeCateogry_distance = string_metrics("../prompting/qwen2.5_32b/zsWHoleCategory_cot.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# qwen2_5_32b_cot_zsSubheadingCategory_distance = string_metrics("../prompting/qwen2.5_32b/zsSubheadingCategory_cot.json", "../../output/gt_df_reconciled.json", zsCombined=True)

# qwen2_5_32b_fsWholeCombined_distance = string_metrics("../prompting/qwen2.5_32b/fsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)
# qwen2_5_32b_zsWholeCombined_distance = string_metrics("../prompting/qwen2.5_32b/zsWholeCombined.json", "../../output/gt_df_reconciled.json", zsCombined=True)


## GPT-4o

In [None]:
# fs Token-level metrics
gpt4o_fsSubheadingCategory = get_token_level_metrics("../prompting/open_ai_prompting/fs_subheading_filtered_per_category/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
gpt4o_fsWholeCategory = get_token_level_metrics("../prompting/open_ai_prompting/fs_whole_per_category//extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# zs
gpt4o_zsSubheadingCategory = get_token_level_metrics("../prompting/open_ai_prompting/zs_subheading_filtered_per_category/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
gpt4o_zsWholeCategory = get_token_level_metrics("../prompting/open_ai_prompting/zs_whole_per_category/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
#whole combined
gpt4o_fsWholeCombined = get_token_level_metrics("../prompting/open_ai_prompting/fs_whole_combined/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=True)
gpt4o_zsWholeCombined = get_token_level_metrics("../prompting/open_ai_prompting/zs_whole_combined/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=True)
 

In [None]:
# fs string-level metrics
# String-level metrics (string_metrics summary frames) for the GPT-4o outputs.
# Few-shot (fs) prompts
gpt4o_fsSubheadingCategory_distance= string_metrics("../prompting/open_ai_prompting/fs_subheading_filtered_per_category/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# NOTE(review): the path below contains a doubled "/" — presumably a typo; confirm.
gpt4o_fsWholeCategory_distance = string_metrics("../prompting/open_ai_prompting/fs_whole_per_category//extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# Zero-shot (zs) prompts
gpt4o_zsSubheadingCategory_distance = string_metrics("../prompting/open_ai_prompting/zs_subheading_filtered_per_category/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
gpt4o_zsWholeCategory_distance = string_metrics("../prompting/open_ai_prompting/zs_whole_per_category/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=False)
# Whole-combined prompts (zero-shot, then few-shot)
gpt4o_zsWholeCombined_distance = string_metrics("../prompting/open_ai_prompting/zs_whole_combined/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=True)
gpt4o_fsWholeCombined_distance =string_metrics("../prompting/open_ai_prompting/fs_whole_combined/extracted_output/testing_case_set_out.json", "../../output/gt_df_reconciled.json", zsCombined=True)


In [None]:
def convert_json(df, filename):
    """Write a DataFrame to ``<filename>.json`` as a list of row records.

    Parameters
    ----------
    df : pandas DataFrame to serialize; each row becomes one JSON object
        keyed by column name.
    filename : output path without the ``.json`` extension (it is appended).

    The file is written as UTF-8 with a 4-space indent. Nothing is returned.
    """
    # Build the records before touching the file system.
    records = df.to_dict(orient='records')
    target = f'{filename}.json'
    with open(target, 'w', encoding='utf-8') as sink:
        json.dump(records, sink, indent=4)

In [None]:
# (token-level result frame, run identifier) pairs. Identifiers follow the
# pattern "<model>_<size>_<prompting>_<application>".
# NOTE(review): the llama_* token-level frames are only assigned in a cell that
# is commented out in the visible part of this notebook, so on a fresh kernel
# this cell raises NameError unless that cell is re-enabled.
l = [
    # LLaMA model (Few-shot first, then Zero-shot)
    (llama_fsSubheadingCategory, "llama3_8b_FS_FCSP"),
    (llama_fsWholeCategory, "llama3_8b_FS_UCP"),
    (llama_fsWholeCombined, "llama3_8b_FS_UGP"),
    (llama_zsSubheadingCategory, "llama3_8b_ZS_FCSP"),
    (llama_zsWholeCategory, "llama3_8b_ZS_UCP"),
    (llama_zsWholeCombined, "llama3_8b_ZS_UGP"),

    # Qwen2 7B model (Few-shot first, then Zero-shot)
    (qwen2_fsSubheadingCategory, "qwen2_7b_FS_FCSP"),
    (qwen2_fsWholeCategory, "qwen2_7b_FS_UCP"),
    (qwen2_fsWholeCombined, "qwen2_7b_FS_UGP"),
    (qwen2_zsSubheadingCategory, "qwen2_7b_ZS_FCSP"),
    (qwen2_zsWholeCategory, "qwen2_7b_ZS_UCP"),
    (qwen2_zsWholeCombined, "qwen2_7b_ZS_UGP"),

    # Qwen2.5 7B model (Few-shot first, then Zero-shot)
    (qwen2_5_7b_fsSubheadingCategory, "qwen2.5_7b_FS_FCSP"),
    (qwen2_5_7b_fsWholeCategory, "qwen2.5_7b_FS_UCP"),
    (qwen2_5_7b_fsWholeCombined, "qwen2.5_7b_FS_UGP"),
    (qwen2_5_7b_zsSubheadingCategory, "qwen2.5_7b_ZS_FCSP"),
    (qwen2_5_7b_zsWholeCategory, "qwen2.5_7b_ZS_UCP"),
    (qwen2_5_7b_zsWholeCombined, "qwen2.5_7b_ZS_UGP"),


    # Qwen2.5 32B model (Few-shot first, then Zero-shot, then Zero-shot CoT)
    (qwen2_5_32b_fsSubheadingCategory, "qwen2.5_32b_FS_FCSP"),
    (qwen2_5_32b_fsWholeCategory, "qwen2.5_32b_FS_UCP"),
    (qwen2_5_32b_fsWholeCombined, "qwen2.5_32b_FS_UGP"),
    (qwen2_5_32b_zsSubheadingCategory, "qwen2.5_32b_ZS_FCSP"),
    (qwen2_5_32b_zsWholeCategory, "qwen2.5_32b_ZS_UCP"),
    (qwen2_5_32b_zsWholeCombined, "qwen2.5_32b_ZS_UGP"),
    (qwen2_5_32b_cot_zsSubheadingCategory, "qwen2.5_32b_ZSCOT_FCSP"),
    (qwen2_5_32b_cot_zsWholeCateogry, "qwen2.5_32b_ZSCOT_UCP"),

    # GPT-4o model (Few-shot first, then Zero-shot); size is recorded as "unknown"
    (gpt4o_fsSubheadingCategory, "gpt4o_unknown_FS_FCSP"),
    (gpt4o_fsWholeCategory, "gpt4o_unknown_FS_UCP"),
    (gpt4o_fsWholeCombined, "gpt4o_unknown_FS_UGP"),
    (gpt4o_zsSubheadingCategory, "gpt4o_unknown_ZS_FCSP"),
    (gpt4o_zsWholeCategory, "gpt4o_unknown_ZS_UCP"),
    (gpt4o_zsWholeCombined, "gpt4o_unknown_ZS_UGP")
]


In [None]:
# convert_json(gpt4o_zsWholeCombined, "gpt4o_zsWholeCombined")

In [None]:
def get_avg_row(df, row_name):
    """Return a copy of the last row of ``df`` with its index label replaced.

    Parameters
    ----------
    df : DataFrame whose final row is wanted (the input is not modified).
    row_name : new index label for that row.

    Returns
    -------
    One-row DataFrame with the same columns as ``df``.
    """
    tail = df.iloc[[-1]].copy()
    return tail.rename(index={tail.index[0]: row_name})

In [None]:
# Collect the last ("Average") row of every token-level result frame, relabelled
# with its run identifier, and stack them with a single concat call. Building
# the list first avoids re-copying the accumulated frame on every iteration,
# avoids concatenating onto an empty DataFrame, and keeps loop variables out of
# the notebook namespace.
avg_rows = [get_avg_row(result_df, run_name) for result_df, run_name in l]
# pd.concat raises on an empty list, so fall back to an empty frame.
combined_df = pd.concat(avg_rows) if avg_rows else pd.DataFrame()


In [None]:
# Drop the 'Category' column from the collected summary rows.
# NOTE: not idempotent — re-running this cell without rebuilding combined_df
# raises KeyError because the column is already gone.
combined_df = combined_df.drop(columns=["Category"])

In [None]:
# Derive a "model:size" label from the first two underscore-separated parts of
# each row identifier, e.g. "llama3_8b_FS_FCSP" -> "llama3:8b".
combined_df['Model'] = combined_df.index.str.split('_').str[0:2].str.join(':')

In [None]:
# Order the runs by model label (descending) and, within a model, by ROUGE-L
# (ascending); the helper 'Model' column is only needed for sorting.
sorted_df = (
    combined_df
    .sort_values(by=['Model', 'ROUGE-L Avg'], ascending=[False, True])
    .drop(columns=["Model"])
)

In [None]:
def align_columns(dataframe):
    return dataframe.style.set_properties(**{
        'text-align': 'left'  # Apply left alignment for all columns first
    }).set_table_styles([
        {'selector': 'th', 'props': [('text-align', 'center')]},  # Center-align headers
        {'selector': 'td:nth-child(1)', 'props': [('text-align', 'left')]},  # Left-align the first column
        {'selector': 'td', 'props': [('text-align', 'center')]}  # Center-align other cells
    ])

# Apply the styling function to the combined summary frame.
# NOTE(review): this styles combined_df, not sorted_df — confirm which one is
# meant to be displayed.
styled_df = align_columns(combined_df)


In [14]:
def create_combined_dataframe(data_list):
    """Stack several result frames into one, tagging rows with run metadata.

    Parameters
    ----------
    data_list : iterable of ``(df, identifier)`` pairs. ``identifier`` must
        have at least four underscore-separated parts:
        ``"<model>_<parameters>_<prompting>_<application>"``
        (e.g. ``"llama3_8b_FS_FCSP"``).

    Returns
    -------
    A new DataFrame (fresh 0..n-1 index) holding the rows of every input frame
    plus the columns ``model``, ``parameters``, ``prompting``, ``application``,
    ``model_with_size`` (``"<model>:<parameters>"``) and ``identifier``.

    Raises
    ------
    IndexError if an identifier has fewer than four parts; ValueError (from
    ``pd.concat``) if ``data_list`` is empty.

    The input frames are left untouched: the metadata columns are added to
    copies, so building the combined frame does not alter the callers' frames.
    """
    tagged_frames = []

    for df, identifier in data_list:
        parts = identifier.split("_")
        model = parts[0]       # 1st part: model name
        parameter = parts[1]   # 2nd part: model size
        prompting = parts[2]   # 3rd part: prompting strategy
        category = parts[3]    # 4th part: application / prompt layout code

        # assign() returns a new frame, leaving the caller's DataFrame as is.
        tagged_frames.append(
            df.assign(
                model=model,
                parameters=parameter,
                prompting=prompting,
                application=category,
                model_with_size=":".join(parts[:2]),
                identifier=identifier,
            )
        )

    # Concatenate all tagged frames into a single DataFrame
    return pd.concat(tagged_frames, ignore_index=True)

In [36]:
# (string-level summary frame, run identifier) pairs. Identifiers follow the
# pattern "<model>_<size>_<prompting>_<application>", which
# create_combined_dataframe splits on "_".
# NOTE(review): the qwen2_*_distance and qwen2_5_32b_*_distance frames are only
# assigned in cells that are commented out in the visible part of this
# notebook, so on a fresh kernel this cell raises NameError unless those cells
# are re-enabled.
d = [
    # LLaMA model (Few-shot first, then Zero-shot)
    (llama_fsSubheadingCategory_distance, "llama3_8b_FS_FCSP"),
    (llama_fsWholeCategory_distance, "llama3_8b_FS_UCP"),
    (llama_fsWholeCombined_distance, "llama3_8b_FS_UGP"),
    (llama_zsSubheadingCategory_distance, "llama3_8b_ZS_FCSP"),
    (llama_zsWholeCategory_distance, "llama3_8b_ZS_UCP"),
    (llama_zsWholeCombined_distance, "llama3_8b_ZS_UGP"),

    # Qwen2 7B model (Few-shot first, then Zero-shot)
    (qwen2_fsSubheadingCategory_distance, "qwen2_7b_FS_FCSP"),
    (qwen2_fsWholeCategory_distance, "qwen2_7b_FS_UCP"),
    (qwen2_fsWholeCombined_distance, "qwen2_7b_FS_UGP"),
    (qwen2_zsSubheadingCategory_distance, "qwen2_7b_ZS_FCSP"),
    (qwen2_zsWholeCategory_distance, "qwen2_7b_ZS_UCP"),
    (qwen2_zsWholeCombined_distance, "qwen2_7b_ZS_UGP"),

    # Qwen2.5 7B model (Few-shot first, then Zero-shot)
    (qwen2_5_7b_fsSubheadingCategory_distance, "qwen2.5_7b_FS_FCSP"),
    (qwen2_5_7b_fsWholeCategory_distance, "qwen2.5_7b_FS_UCP"),
    (qwen2_5_7b_fsWholeCombined_distance, "qwen2.5_7b_FS_UGP"),
    (qwen2_5_7b_zsSubheadingCategory_distance, "qwen2.5_7b_ZS_FCSP"),
    (qwen2_5_7b_zsWholeCategory_distance, "qwen2.5_7b_ZS_UCP"),
    (qwen2_5_7b_zsWholeCombined_distance, "qwen2.5_7b_ZS_UGP"),


    # Qwen2.5 32B model (Few-shot first, then Zero-shot, then Zero-shot CoT)
    (qwen2_5_32b_fsSubheadingCategory_distance, "qwen2.5_32b_FS_FCSP"),
    (qwen2_5_32b_fsWholeCategory_distance, "qwen2.5_32b_FS_UCP"),
    (qwen2_5_32b_fsWholeCombined_distance, "qwen2.5_32b_FS_UGP"),
    (qwen2_5_32b_zsSubheadingCategory_distance, "qwen2.5_32b_ZS_FCSP"),
    (qwen2_5_32b_zsWholeCategory_distance, "qwen2.5_32b_ZS_UCP"),
    (qwen2_5_32b_zsWholeCombined_distance, "qwen2.5_32b_ZS_UGP"),
    (qwen2_5_32b_cot_zsSubheadingCategory_distance, "qwen2.5_32b_ZSCOT_FCSP"),
    (qwen2_5_32b_cot_zsWholeCateogry_distance, "qwen2.5_32b_ZSCOT_UCP"),

    # GPT-4o model (Few-shot first, then Zero-shot); size is recorded as "unknown"
    (gpt4o_fsSubheadingCategory_distance, "gpt4o_unknown_FS_FCSP"),
    (gpt4o_fsWholeCategory_distance, "gpt4o_unknown_FS_UCP"),
    (gpt4o_fsWholeCombined_distance, "gpt4o_unknown_FS_UGP"),
    (gpt4o_zsSubheadingCategory_distance, "gpt4o_unknown_ZS_FCSP"),
    (gpt4o_zsWholeCategory_distance, "gpt4o_unknown_ZS_UCP"),
    (gpt4o_zsWholeCombined_distance, "gpt4o_unknown_ZS_UGP")
]


In [65]:
# Long-form table of all string-level summaries, one block of rows per run,
# tagged with model / prompting / application metadata.
df = create_combined_dataframe(d)

In [84]:
# case_avg_df = df[df['Category'] == 'case_average']

# # Create a horizontal dot plot for TSR
# dot_plot_tsr = alt.Chart(case_avg_df).mark_circle(size=100).encode(
#     x=alt.X('TSR:Q', title='Average TSR (%)'),
#     y=alt.Y('identifier:N', sort='-x', title=None),
#     color=alt.Color('identifier:N', scale=alt.Scale(scheme='blues'), legend=None),
# ).properties(
#     width=800,
#     height=500,
#     title=''
# )

# dot_plot_tsr.show()

# dot_plot_tsr.save("../../EDA/TSR_by_Identifier.pdf")

In [83]:
# case_avg_df = df[df['Category'] == 'case_average']

# # Create a horizontal dot plot for TSR
# dot_plot_levenshtein = alt.Chart(case_avg_df).mark_circle(size=100).encode(
#     x=alt.X('Levenshtein:Q', title='Avergage Levenshtein'),
#     y=alt.Y('identifier:N', sort='-x', title=None),
#     color=alt.Color('identifier:N', scale=alt.Scale(scheme='blues'), legend=None),
#     tooltip=['identifier:N', 'TSR:Q']
# ).properties(
#     width=800,
#     height=500,
#     title= ""
# )

# dot_plot_levenshtein.show()
# dot_plot_levenshtein.save("../../EDA/levenshtein_by_Identifier.pdf")

In [None]:
# Keep only the rows whose Category is 'case_average' (one per run identifier).
case_avg_df = df[df['Category'] == 'case_average']

# Horizontal dot plot of the average Exact Match per run identifier
dot_plot_em = alt.Chart(case_avg_df).mark_circle(size=100).encode(
    x=alt.X('ExactMatch:Q', title='Average Exact Match'),
    y=alt.Y('identifier:N', sort='-x', title=None),
    color=alt.Color('identifier:N', scale=alt.Scale(scheme='blues'), legend=None),
    # NOTE(review): the tooltip shows TSR although this chart plots ExactMatch — confirm intended.
    tooltip=['identifier:N', 'TSR:Q']
).properties(
    width=800,
    height=500,
    title=''
)

dot_plot_em.show()
dot_plot_em.save("../../EDA/EM_by_Identifier.pdf")

## Combined TSR / Levenshtein / Exact Match plot

In [None]:
# Stack the three dot plots vertically and save them as one figure.
# NOTE(review): dot_plot_tsr and dot_plot_levenshtein are only created in cells
# that are commented out in the visible part of this notebook, so on a fresh
# kernel this cell raises NameError unless those cells are re-enabled.
combined_plot = alt.vconcat(
    dot_plot_tsr,
    dot_plot_levenshtein, 
    dot_plot_em
).resolve_scale(
    x='independent',  # Each chart keeps its own x-axis scale
    y='independent'  # Each chart keeps its own y-axis scale
).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
)
combined_plot.show()
combined_plot.save("../../EDA/tsr_levenshtein_em.pdf")

In [93]:
# Run identifiers shown in the per-category comparison chart.
# NOTE(review): the GPT-4o entry uses FS_UCP while every other entry uses
# FS_FCSP — confirm this is intentional.
selected = ["qwen2_7b_FS_FCSP", "qwen2.5_7b_FS_FCSP", "qwen2.5_32b_FS_FCSP", "llama3_8b_FS_FCSP", "gpt4o_unknown_FS_UCP"]

In [165]:
import altair as alt
import pandas as pd

# Per-category rows only (the `case_average` summary is excluded), restricted
# to the identifiers listed in `selected`.
is_per_category = df['Category'] != 'case_average'
is_selected = df["identifier"].isin(selected)
filtered_df = df[is_per_category & is_selected]

# Long format: one row per (identifier, Category, Metric).
melted_df = filtered_df.melt(
    id_vars=['identifier', 'Category'],
    value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
    var_name='Metric',
    value_name='Value'
)

# Only the TSR rows are plotted in this cell.
tsr_df = melted_df[melted_df['Metric'] == 'TSR']

# Keep only the categories that have at least one non-null TSR value.
has_any_value = lambda group: group['Value'].notnull().any()
valid_categories = tsr_df.groupby('Category').filter(has_any_value)['Category'].unique()
tsr_df = tsr_df[tsr_df['Category'].isin(valid_categories)]

# Encodings are built separately so the chart definition stays readable.
# Identifiers are ordered by descending mean TSR; the x-axis itself is hidden
# because the colour legend already names each identifier.
identifier_x = alt.X(
    'identifier:N',
    title=None,
    sort=alt.EncodingSortField(field="Value", op="mean", order="descending"),
    axis=None
)
tsr_y = alt.Y(
    'Value:Q',
    title='TSR (%)',
    axis=alt.Axis(labelFontSize=50, titleFontSize=50)
)
identifier_legend = alt.Legend(
    labelFontSize=50,        # large labels for the print-size figure
    titleFontSize=52,
    symbolSize=400,          # enlarge the colour swatches
    orient="top",
    direction="horizontal",  # lay the entries out in a row
    columns=5,               # five entries per legend row
    rowPadding=10,
    labelLimit=5000          # avoid truncating long identifier names
)
identifier_color = alt.Color('identifier:N', title=None, legend=identifier_legend)

# One bar per identifier.
bar_chart_tsr = (
    alt.Chart(tsr_df)
    .mark_bar()
    .encode(
        x=identifier_x,
        y=tsr_y,
        color=identifier_color,
        tooltip=['identifier:N', 'Category:N', 'Value:Q']
    )
    .properties(width=500, height=500)
)

# One panel per category, seven panels per row.
category_facet = alt.Facet(
    'Category:N',
    title=None,
    header=alt.Header(labelFontSize=50, titleFontSize=54)
)
faceted_chart = (
    bar_chart_tsr
    .facet(facet=category_facet, columns=7)
    .configure_title(fontSize=50)
    .configure_axis(labelFontSize=50, titleFontSize=50)
    .configure_facet(spacing=20)
)

faceted_chart.show()

# Export the figure as a PDF.
faceted_chart.save("../../EDA/per_category_TSR_across_models.pdf")


In [117]:
# Per-category rows only (the `case_average` summary is excluded), restricted
# to the identifiers listed in `selected`.
is_per_category = df['Category'] != 'case_average'
is_selected = df["identifier"].isin(selected)
filtered_df = df[is_per_category & is_selected]

# Long format: one row per (identifier, Category, Metric).
melted_df = filtered_df.melt(
    id_vars=['identifier', 'Category'],
    value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
    var_name='Metric',
    value_name='Value'
)

# Only the TSR rows are plotted in this cell.
tsr_df = melted_df[melted_df['Metric'] == 'TSR']

# Keep only the identifiers that have at least one non-null TSR value.
has_any_value = lambda group: group['Value'].notnull().any()
valid_identifiers = tsr_df.groupby('identifier').filter(has_any_value)['identifier'].unique()
tsr_df = tsr_df[tsr_df['identifier'].isin(valid_identifiers)]

# Categories run along the x-axis, ordered by descending mean TSR, with
# vertical tick labels.
category_x = alt.X(
    'Category:N',
    title=None,
    sort=alt.EncodingSortField(field="Value", op="mean", order="descending"),
    axis=alt.Axis(labelAngle=-90, labelFontSize=18, titleFontSize=20)
)
tsr_y = alt.Y(
    'Value:Q',
    title='TSR (%)',
    axis=alt.Axis(labelFontSize=20, titleFontSize=24)
)
category_color = alt.Color('Category:N', title='Category', legend=None)

bar_chart_tsr = (
    alt.Chart(tsr_df)
    .mark_bar()
    .encode(
        x=category_x,
        y=tsr_y,
        color=category_color,
        tooltip=['identifier:N', 'Category:N', 'Value:Q']
    )
    .properties(width=400, height=100)
)

# One panel per identifier, three panels per row.
identifier_facet = alt.Facet(
    'identifier:N',
    title=None,
    header=alt.Header(labelAngle=0, labelFontSize=20, titleFontSize=20)
)
faceted_chart = (
    bar_chart_tsr
    .facet(facet=identifier_facet, columns=3)
    .configure_title(fontSize=14)
    .configure_axis(labelFontSize=10, titleFontSize=12)
    .configure_facet(spacing=8)
)

faceted_chart.show()

# Export the figure as a PDF.
faceted_chart.save("../../EDA/per_category_TSR_selected_by_identifier.pdf")


In [97]:
# Per-category rows only: drop the `case_average` summary rows.
is_per_category = df['Category'] != 'case_average'
filtered_df = df[is_per_category]

# Long format: one row per (identifier, Category, Metric).
melted_df = filtered_df.melt(
    id_vars=['identifier', 'Category'],
    value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
    var_name='Metric',
    value_name='Value'
)

# Only the Levenshtein rows are plotted in this cell. The `tsr_df` and
# `bar_chart_tsr` names are carried over from the TSR cells.
tsr_df = melted_df[melted_df['Metric'] == 'Levenshtein']

# Categories keep their data order (sort=None); tick labels are tilted 45 degrees.
category_x = alt.X(
    'Category:N',
    title='Category',
    sort=None,
    axis=alt.Axis(labelAngle=-45)
)
levenshtein_y = alt.Y('Value:Q', title='Levenshtein Distance')
category_color = alt.Color('Category:N', title='Category')

bar_chart_tsr = (
    alt.Chart(tsr_df)
    .mark_bar()
    .encode(
        x=category_x,
        y=levenshtein_y,
        color=category_color,
        tooltip=['identifier:N', 'Category:N', 'Value:Q']
    )
    .properties(width=150, height=200)
)

# One panel per identifier, four panels per row.
identifier_facet = alt.Facet(
    'identifier:N',
    title='model, prompting and application',
    header=alt.Header(labelAngle=0, labelFontSize=12, titleFontSize=14)
)
faceted_chart = (
    bar_chart_tsr
    .facet(facet=identifier_facet, columns=4)
    .properties(title='Per-Category Levenshtein Distance Across Model, Prompting and Application')
    .configure_title(fontSize=14)
    .configure_axis(labelFontSize=10, titleFontSize=12)
    .configure_facet(spacing=15)
)

faceted_chart.show()
faceted_chart.save("../../EDA/per_category_levenshtein_by_identifier.pdf")

In [6]:
# # Filter out `case_average`
# filtered_df = df[df['Category'] != 'case_average']

# # Melt the DataFrame for plotting multiple metrics
# melted_df = filtered_df.melt(
#     id_vars=['identifier', 'Category'],
#     value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
#     var_name='Metric',
#     value_name='Value'
# )

# # Filter for TSR only
# tsr_df = melted_df[melted_df['Metric'] == 'ExactMatch']

# # Create the bar chart for TSR with faceting and rotated x-axis labels
# bar_chart_tsr = alt.Chart(tsr_df).mark_bar().encode(
#     x=alt.X('Category:N', title='Category', sort=None, axis=alt.Axis(labelAngle=-45)),  # Rotate labels at -45 degrees
#     y=alt.Y('Value:Q', title='TSR Value'),
#     color=alt.Color('Category:N', title='Category'),
#     tooltip=['identifier:N', 'Category:N', 'Value:Q']
# ).properties(
#     width=150,
#     height=200
# )

# # Facet the chart by 'identifier' with row labels
# faceted_chart = bar_chart_tsr.facet(
#     facet=alt.Facet(
#         'identifier:N',
#         title='model, prompting and application',
#         header=alt.Header(labelAngle=0, labelFontSize=12, titleFontSize=14)  # Customize row headers
#     ),
#     columns=4  # Arrange 4 plots per row
# ).properties(
#     title='Per-Category Exact Match Value by Model, Prompting and Application Combination'
# ).configure_title(
#     fontSize=14
# ).configure_axis(
#     labelFontSize=10,
#     titleFontSize=12
# ).configure_facet(
#     spacing=15  # Adjust spacing between plots
# )

# faceted_chart.show()
# faceted_chart.save("../../EDA/per-cateegory_EM_identifier.pdf")

In [None]:
# Keep only the `case_average` summary rows.
# NOTE(review): assumes `df` carries a 'parameters' column — confirm upstream.
filtered_df = df[df['Category'] == 'case_average']

# Long format: one row per (parameters, Category, Metric).
melted_df = filtered_df.melt(
    id_vars=['parameters', 'Category'],
    value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
    var_name='Metric',
    value_name='Value'
)

# Only the TSR rows are plotted in this cell.
tsr_df = melted_df[melted_df['Metric'] == 'TSR']

# Mean TSR per parameter setting. 'Category' must stay in the grouping keys:
# the chart below encodes `Category:N`, and grouping on 'parameters' alone
# dropped that column, leaving the x and colour channels without a field.
average_tsr = tsr_df.groupby(['parameters', 'Category'], as_index=False)['Value'].mean()


# Bar chart of the averaged TSR, with x tick labels tilted 45 degrees.
bar_chart_tsr = alt.Chart(average_tsr).mark_bar().encode(
    x=alt.X('Category:N', title='Category', sort=None, axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('Value:Q', title='TSR Value'),
    color=alt.Color('Category:N', title='Category'),
).properties(
    width=150,
    height=200
)

# One panel per 'parameters' value, four panels per row.
faceted_chart = bar_chart_tsr.facet(
    facet=alt.Facet(
        'parameters:N',
        title='Parameters',
        header=alt.Header(labelAngle=0, labelFontSize=12, titleFontSize=14)
    ),
    columns=4
).properties(
    title='TSR Value by Model Parameters'
).configure_title(
    fontSize=14
).configure_axis(
    labelFontSize=10,
    titleFontSize=12
).configure_facet(
    spacing=15  # gap between panels
)

faceted_chart.show()
faceted_chart.save("../../EDA/TSR_by_parameters.pdf")

In [None]:
# Keep only the `case_average` summary rows.
is_case_average = df['Category'] == 'case_average'
filtered_df = df[is_case_average]

# Long format: one row per (model_with_size, Category, Metric).
melted_df = filtered_df.melt(
    id_vars=['model_with_size', 'Category'],
    value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
    var_name='Metric',
    value_name='Value'
)

# Only the TSR rows are used; average them per model.
tsr_df = melted_df[melted_df['Metric'] == 'TSR']
average_tsr = tsr_df.groupby('model_with_size', as_index=False)['Value'].mean()

# Panel "A": one bar per model, in data order (sort=None), coloured by model
# with the legend hidden.
model_x = alt.X('model_with_size:N', title='Model', sort=None)
average_y = alt.Y('Value:Q', title='Average TSR(%) Value')
model_color = alt.Color('model_with_size:N', title='Model', legend=None)

tsr_by_model = (
    alt.Chart(average_tsr)
    .mark_bar(size=40)
    .encode(x=model_x, y=average_y, color=model_color)
    .properties(width=300, height=400, title='A')
    .configure_title(fontSize=20)
    .configure_axis(labelFontSize=20, titleFontSize=20)
    .configure_legend(titleFontSize=20, labelFontSize=20)
)

tsr_by_model.show()

 


In [8]:
# filtered_df = df[(df['Category'] == 'case_average') & (df['model_with_size']=="qwen2.5:32b")]
# filtered_df 

In [9]:

# filtered_df = df[(df['Category'] == 'case_average') & (df['model_with_size']=="qwen2.5:32b") & (df["application"]!="UGP")]


# melted_df = filtered_df.melt(
#     id_vars=['prompting', 'Category'],  # Change from 'parameters' to 'model'
#     value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
#     var_name='Metric',
#     value_name='Value'
# )

 

# tsr_df = melted_df[melted_df['Metric'] == 'TSR']
# average_tsr = tsr_df.groupby('prompting', as_index=False)['Value'].mean()
 
# tsr_by_prompts = alt.Chart(average_tsr).mark_bar(size=40).encode(
#     x=alt.X('prompting:N', title='Prompting', sort=None),  # Prompting strategies on x-axis
#     y=alt.Y('Value:Q', title=None),  # Average TSR values on y-axis
#     color=alt.Color('prompting:N', title='Prompting Strategy', legend=None),
#     tooltip=['prompting:N', 'Value:Q']
# ).properties(
#     title='C',
#     width=300,
#     height=400
# ).configure_title(
#     fontSize=20
# ).configure_axis(
#     labelFontSize=20,
#     titleFontSize=20)
 
# tsr_by_prompts.show()


In [None]:
# Keep only the `case_average` summary rows.
is_case_average = df['Category'] == 'case_average'
filtered_df = df[is_case_average]

# Long format: one row per (application, Category, Metric).
melted_df = filtered_df.melt(
    id_vars=['application', 'Category'],
    value_vars=['TSR', 'Levenshtein', 'ExactMatch'],
    var_name='Metric',
    value_name='Value'
)

# Only the TSR rows are used; average them per (application, Category).
tsr_df = melted_df[melted_df['Metric'] == 'TSR']
average_tsr = tsr_df.groupby(['application', 'Category'], as_index=False)['Value'].mean()

# Panel "B": one bar per application, in data order (sort=None). The y-axis
# title is omitted and the bars are coloured by application with no legend.
application_x = alt.X('application:N', title='Data Integration', sort=None)
average_y = alt.Y('Value:Q', title=None)
application_color = alt.Color('application:N', title="application", legend=None)

tsr_by_application = (
    alt.Chart(average_tsr)
    .mark_bar(size=40)
    .encode(x=application_x, y=average_y, color=application_color)
    .properties(title='B', width=300, height=400)
    .configure_title(fontSize=20)
    .configure_axis(labelFontSize=20, titleFontSize=20)
)

tsr_by_application.show()
tsr_by_application.save("../../EDA/tsr_application.pdf")


#### EDA for the ground-truth dataset

In [None]:
# Parse the reconciled ground-truth file into plain Python objects.
# Despite its name, `gt_df` holds the raw parsed JSON at this point, not a DataFrame.
with open("../../output/gt_df_reconciled.json", mode="r", encoding="utf-8") as gt_file:
    gt_df = json.loads(gt_file.read())

In [None]:
# Load the reconciled ground-truth file as a DataFrame (records orientation).
# This rebinds `gt_df`, replacing whatever it held before.
gt_df = pd.read_json(
    "../../output/gt_df_reconciled.json",
    orient="records",
)

In [None]:
# Column labels of the ground-truth frame, as a plain list.
col_list = [column for column in gt_df.columns]

In [None]:
def _item_lengths(cell):
    """Return the length of each item in a non-empty list cell, otherwise []."""
    if isinstance(cell, list) and len(cell) > 0:
        return [len(item) for item in cell]
    return []


# Average item length per column, skipping the "pmcid" and "text" columns.
# Empty cells explode to NaN and are dropped before the mean is taken.
avg_string_lengths = {}
for col in col_list:
    if col in ["pmcid", "text"]:
        continue
    string_lengths = gt_df[col].apply(_item_lengths).explode()
    avg_length = string_lengths.dropna().mean()
    avg_string_lengths[col] = round(avg_length, 1)

In [None]:
# Display the per-column average string lengths (dict of column -> rounded mean).
avg_string_lengths

In [None]:
# One row per column name, with a single 'Average String Length' column.
df_length = pd.DataFrame.from_dict(
    avg_string_lengths,
    orient="index",
    columns=['Average String Length'],
)

In [None]:
def _is_annotated(cell):
    """Return True for a non-empty list cell, or for a truthy non-list cell."""
    if isinstance(cell, list):
        return len(cell) > 0
    return bool(cell)


# Percentage of rows with a non-empty annotation, per column
# (the "pmcid" and "text" columns are skipped).
per_annoate_dict = {}
for col in col_list:
    if col in ["pmcid", "text"]:
        continue
    annotated_rows = gt_df[col].apply(_is_annotated).sum()
    non_empty_percentage = annotated_rows / len(gt_df) * 100
    per_annoate_dict[col] = round(non_empty_percentage, 1)


In [None]:
# Bare expression: it is not rendered because it is not the cell's last statement.
per_annoate_dict
# One row per column name, with a single 'Annotation (%)' column.
df_per_annote = pd.DataFrame.from_dict(
    per_annoate_dict,
    orient="index",
    columns=['Annotation (%)'],
)

In [None]:
# Preview the two summaries side by side. pandas has no `vconcat` (that is an
# Altair function), so the original call raised AttributeError; `pd.concat`
# with axis=1 aligns the frames on their shared column-name index.
pd.concat([df_per_annote, df_length], axis=1)

In [None]:
# Combine the annotation percentage and average string length into one summary
# table, aligned on the column-name index. It is stored under its own name:
# rebinding `df` here would clobber the metrics DataFrame that the following
# cells still index with `df['Category']`.
gt_summary_df = pd.concat([df_per_annote, df_length], axis=1)

# Show the summary as a nested dict ({column -> {row label -> value}}).
gt_summary_df.to_dict()

In [None]:
# Keep only the `case_average` summary rows of the metrics frame.
filtered_df = df[df['Category'] == 'case_average']

# Columns needed for the results table. Take an explicit copy: later cells
# assign to and rename columns of `new_df`, which on a slice of `filtered_df`
# triggers pandas' SettingWithCopyWarning.
new_df = filtered_df[['model_with_size', 'prompting', 'application', 'TSR', 'Levenshtein', 'ExactMatch']].copy()

In [None]:
# Round every column except the three descriptive ones to three decimals.
label_columns = ["model_with_size", "prompting", "application"]
metric_columns = [name for name in new_df.columns if name not in label_columns]
for col in metric_columns:
    new_df[col] = new_df[col].apply(lambda value: round(value, 3))

In [None]:
# Column headers used in the exported table.
display_names = {
    'model_with_size': 'Model : Parameters',
    'prompting': 'Prompting',
    'application': 'Method',
    'TSR': 'TSR(%)',
}
# Renames in place, so `new_df` itself is modified.
new_df.rename(columns=display_names, inplace=True)


In [None]:
# Order rows by model name (descending), then by TSR (ascending) within each model.
sorted_df = new_df.sort_values(
    by=['Model : Parameters', 'TSR(%)'],
    ascending=[False, True],
)

In [None]:
# Use the model column as the row index, then render the table as LaTeX source.
latex_ready_df = sorted_df.set_index("Model : Parameters")
latex_ready_df.to_latex()