In [None]:
import glob
import json

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# NOTE(review): bare `ls` is not Python; presumably it relies on IPython automagic to list the working directory — confirm, or drop this cell
ls

In [None]:
def combine_output(json_list):
    """
    Merge the contents of several JSON output files into one list.

    Every path in ``json_list`` is opened and parsed with ``json.load``; the
    parsed items are appended to a single list in the order the paths are
    given, and that list is returned.
    """
    merged = []
    for path in json_list:
        with open(path, "r") as handle:
            merged.extend(json.load(handle))
    return merged

def extract_answer_pandas(
    model_output,
    answer_options,
    prefix="The correct answer is:",
    letter_options=["A", "B", "C", "D"]):
    """
    Adaptation of the muchomusic answer-extraction function for our pandas dataframe.

    Only the text after the last occurrence of ``prefix`` is inspected.

    Args:
        model_output: raw text generated by the model.
        answer_options: answer strings, in the order they were shown.
        prefix: marker that precedes the answer in the model output.
        letter_options: option letters; never mutated here, so the shared
            list default is safe.

    Returns:
        The index of the chosen option, or -1 when none can be identified
        (including when ``answer_options`` is empty).
    """
    output = model_output.split(prefix)[-1].strip()

    # Character-level match: which option letters occur anywhere in the text.
    # Only trusted when exactly one distinct letter is found.
    response = list(set(letter_options).intersection(output))
    if len(response) == 1:
        return letter_options.index(response[0])

    # Fallback: first answer option whose text appears in the output
    # (case-insensitive).
    normalized_output = output.lower().strip()
    for j, answer in enumerate(answer_options):
        if answer.lower().strip() in normalized_output:
            return j

    # Nothing matched. Returning here (rather than assigning inside the loop)
    # also covers an empty `answer_options`, which used to raise
    # UnboundLocalError.
    return -1

def compare_answers(response, answer_orders):
    """
    Grade an extracted response.

    The correct option sits at the position of the value ``0`` inside
    ``answer_orders``. Returns 1 when ``response`` equals that position,
    -1 when ``response`` is -1 (unanswered), and 0 otherwise (incorrect).
    """
    correct_position = answer_orders.index(0)
    if response == correct_position:
        return 1
    return -1 if response == -1 else 0

def accuracy(df):
    """Share of non-null ``final_answer`` entries that equal 1 (correct)."""
    graded = df["final_answer"]
    n_correct = graded[graded == 1].count()
    return n_correct / graded.count()


def json_to_df(model, experiment, model_output, musiccaps_only, mmshap=True):
    """
    Turn a list of raw model-output records into an annotated DataFrame.

    Adds the constant ``model`` and ``experiment`` columns, the option index
    parsed from ``model_output`` (``extracted_response``) and its grade
    (``final_answer``: 1 correct, 0 incorrect, -1 unanswered). When ``mmshap``
    is true, ``mmshap_audio`` is derived as ``1 - mmshap_text``. When
    ``musiccaps_only`` is true, only rows whose ``dataset`` is "musiccaps"
    are returned.
    """
    def _extract(row):
        return extract_answer_pandas(row.model_output, row.answers)

    def _grade(row):
        return compare_answers(row.extracted_response, row.answer_orders)

    frame = pd.DataFrame(model_output)
    frame["model"] = model
    frame["experiment"] = experiment
    frame["extracted_response"] = frame[["model_output", "answers"]].apply(_extract, axis=1)
    frame["final_answer"] = frame[["extracted_response", "answer_orders"]].apply(_grade, axis=1)
    if mmshap:
        frame["mmshap_audio"] = 1 - frame["mmshap_text"]

    if not musiccaps_only:
        return frame
    return frame[frame["dataset"] == "musiccaps"]

### load QwenAudio results

In [None]:
# original few shot default
# glob.glob returns paths in arbitrary order; sort them so the combined rows
# come out in the same order on every run
qwen_fs_list = sorted(glob.glob("experiments/output_data/parallel_mmshap_qwenaudio_original_few_shot_20_*"))
qwen_fs = combine_output(qwen_fs_list)

len(qwen_fs)

In [None]:
# description
# glob.glob returns paths in arbitrary order; sort them so the combined rows
# come out in the same order on every run
qwen_desc_list = sorted(glob.glob("experiments/output_data/parallel_mmshap_qwenaudio_description_5_*"))
qwen_desc = combine_output(qwen_desc_list)
len(qwen_desc)

In [None]:
# question only
# glob.glob returns paths in arbitrary order; sort them so the combined rows
# come out in the same order on every run
qwen_qo_list = sorted(glob.glob("experiments/output_data/parallel_mmshap_qwenaudio_question_only_5_*"))
qwen_qo = combine_output(qwen_qo_list)
len(qwen_qo)

### load MU-LLaMA results

In [None]:
# original few shot default
# earlier variant that also pulled the files from the musiccaps/ sub-folder:
#mullama_fs_list = glob.glob("experiments/output_data/mullama_original_few_shot_1_*") + glob.glob("experiments/output_data/musiccaps/mullama_original_few_shot_1_*")
# glob.glob returns paths in arbitrary order; sort them so the combined rows
# come out in the same order on every run
mullama_fs_list = sorted(glob.glob("experiments/output_data/mullama_original_few_shot_1_*"))
print(len(mullama_fs_list))
mullama_fs = combine_output(mullama_fs_list)

len(mullama_fs)

In [None]:
# description (not complete yet)
# earlier file pattern:
# mullama_desc_list = glob.glob("experiments/output_data/mullama_description_4_*")
# glob.glob returns paths in arbitrary order; sort them so the combined rows
# come out in the same order on every run
mullama_desc_list = sorted(glob.glob("experiments/output_data/mullama_description_5_*"))
# presumably each file holds 5 samples, making this the expected total — TODO confirm
print(len(mullama_desc_list)*5)
mullama_desc = combine_output(mullama_desc_list)
len(mullama_desc)

In [None]:
# question only (may be incomplete like the description run — TODO confirm)
# glob.glob returns paths in arbitrary order; sort them so the combined rows
# come out in the same order on every run
mullama_qo_list = sorted(glob.glob("experiments/output_data/mullama_question_only_5_*"))
# presumably each file holds 5 samples, making this the expected total — TODO confirm
print(len(mullama_qo_list)*5)
mullama_qo = combine_output(mullama_qo_list)
len(mullama_qo)

### load LLaMA results

In [None]:
# text-only LLaMA baseline (single output file)
# Keep the path list under its own name, like the other loading cells, so that
# `llama` only ever refers to the loaded records.
llama_list = glob.glob("experiments/output_data/llama_original.json")
llama = combine_output(llama_list)
len(llama)

# inspect T-SHAP for different models

In [None]:
# When True, json_to_df keeps only the rows whose "dataset" column equals "musiccaps"
musiccaps_only = True

In [None]:
# Bind the DataFrame to its own name. Rebinding `llama` replaced the raw
# records with an already-filtered frame, so re-running this cell after
# changing `musiccaps_only` silently reused the filtered data.
llama_df = json_to_df(model="LLaMA2", experiment="few_shot", model_output=llama, musiccaps_only=musiccaps_only, mmshap=False)
print(f"Accuracy for LLaMA few shot: {accuracy(llama_df)} ({len(llama_df)} samples)")

In [None]:
# One DataFrame per QwenAudio experiment; rows containing any missing value are dropped
qwen_fs_df, qwen_desc_df, qwen_qo_df = [
    json_to_df(model="QwenAudio", experiment=name, model_output=raw, musiccaps_only=musiccaps_only).dropna()
    for name, raw in (
        ("few_shot", qwen_fs),
        ("description", qwen_desc),
        ("question_only", qwen_qo),
    )
]

# Persist each table next to the notebook
qwen_fs_df.to_csv("qwenaudio_fewshot_results.csv", index=False)
qwen_desc_df.to_csv("qwenaudio_desc_results.csv", index=False)
qwen_qo_df.to_csv("qwenaudio_qo_results.csv", index=False)

print(f"Accuracy for QwenAudio few shot: {accuracy(qwen_fs_df)} ({len(qwen_fs_df)} samples)")
print(f"Accuracy for QwenAudio question only: {accuracy(qwen_qo_df)} ({len(qwen_qo_df)} samples)")
print(f"Accuracy for QwenAudio description: {accuracy(qwen_desc_df)} ({len(qwen_desc_df)} samples)")

In [None]:
# One DataFrame per MU-LLaMA experiment; rows containing any missing value are dropped
mu_fs_df, mu_desc_df, mu_qo_df = [
    json_to_df(model="MU-LLaMA", experiment=name, model_output=raw, musiccaps_only=musiccaps_only).dropna()
    for name, raw in (
        ("few_shot", mullama_fs),
        ("description", mullama_desc),
        ("question_only", mullama_qo),
    )
]

# Persist each table next to the notebook
mu_fs_df.to_csv("mullama_fewshot_results.csv", index=False)
mu_desc_df.to_csv("mullama_desc_results.csv", index=False)
mu_qo_df.to_csv("mullama_qo_results.csv", index=False)

print(f"Accuracy for MU-LLaMA few shot: {accuracy(mu_fs_df)} ({len(mu_fs_df)} samples)")
print(f"Accuracy for MU-LLaMA question only: {accuracy(mu_qo_df)} ({len(mu_qo_df)} samples)")
print(f"Accuracy for MU-LLaMA description: {accuracy(mu_desc_df)} ({len(mu_desc_df)} samples)")

In [None]:
# Stack the six per-experiment tables (QwenAudio first, then MU-LLaMA) under a fresh 0..n-1 index
per_experiment_frames = [qwen_fs_df, qwen_desc_df, qwen_qo_df, mu_fs_df, mu_desc_df, mu_qo_df]
results = pd.concat(per_experiment_frames, ignore_index=True)

In [None]:
# get genres and categories information
all_genres = qwen_fs_df["genre"].unique()

# Flatten the per-question category lists and keep each category once
knowledge_categories = list({
    category
    for question_categories in qwen_fs_df["knowledge"].tolist()
    for category in question_categories
})

reasoning_categories = list({
    category
    for question_categories in qwen_fs_df["reasoning"].tolist()
    for category in question_categories
})

In [None]:
# A-SHAP summary statistics for every (experiment, model) pair
ashap_summary_stats = ["mean", "median", "min", "max", "count"]
results.groupby(["experiment", "model"]).agg({"mmshap_audio": ashap_summary_stats})

In [None]:
# A-SHAP mean/std and number of rows, split by graded outcome (final_answer)
ashap_by_outcome_spec = {
    "mmshap_audio": ["mean", "std"],
    "final_answer": "count",
}
results.groupby(["experiment", "model", "final_answer"]).agg(ashap_by_outcome_spec)

In [None]:
# Same summary restricted to the rows whose dataset is "musiccaps"
musiccaps_rows = results[results["dataset"] == "musiccaps"]
musiccaps_rows.groupby(["experiment", "model", "dataset"]).agg({"mmshap_audio": ["mean", "std", "count"]})

In [None]:
# A-SHAP location statistics plus the number of graded answers per (experiment, model)
ashap_and_count_spec = {
    "mmshap_audio": ["mean", "median", "min", "max"],
    "final_answer": "count",
}
results.groupby(["experiment", "model"]).agg(ashap_and_count_spec)

In [None]:
# A-SHAP statistics per (experiment, model, graded outcome)
outcome_groups = results.groupby(["experiment", "model", "final_answer"])
grouped_data = outcome_groups.agg({"mmshap_audio": ["mean", "median", "max", "min"]})

In [None]:
# display the per-outcome A-SHAP table built in the previous cell
grouped_data

In [None]:
# Box plot of A-SHAP for every (experiment, model) group, all drawn on one axes
ashap_groups = results.groupby(["experiment", "model"])[["mmshap_audio"]]
ashap_groups.boxplot(subplots=False)
plt.ylabel("A-SHAP")
plt.xlabel("Experiment")

# group labels are long tuples, so rotate them to keep them readable
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# bar plot

In [None]:
selected_experiments = results[results["experiment"].isin(["few_shot", "question_only"])]

# Kept for inspection only: chaining both explodes yields one row per
# (knowledge, reasoning) pair of every question, i.e. a cross product.
results_exploded = selected_experiments.explode("knowledge").explode("reasoning")

# Explode each category column on its own. Aggregating on the cross product
# counted a question once per tag of the *other* column, so a question with
# two reasoning tags weighed twice in every knowledge-category mean (and vice
# versa). Empty category lists explode to NaN and are dropped.
results_knowledge = selected_experiments.explode("knowledge").dropna(subset=["knowledge"])
results_reasoning = selected_experiments.explode("reasoning").dropna(subset=["reasoning"])

# Mean A-SHAP per category, model and experiment
knowledge_grouped = (
    results_knowledge
    .groupby(["knowledge", "model", "experiment"])
    .agg({"mmshap_audio": "mean"})
    .reset_index()
    .rename(columns={"knowledge": "question_type"})
)

reasoning_grouped = (
    results_reasoning
    .groupby(["reasoning", "model", "experiment"])
    .agg({"mmshap_audio": "mean"})
    .reset_index()
    .rename(columns={"reasoning": "question_type"})
)

finegrained_results = pd.concat([knowledge_grouped, reasoning_grouped])

finegrained_results

In [None]:
# Dataset-level mean A-SHAP per (model, experiment): the reference each category is compared against
agg_results = results.groupby(["model", "experiment"])[["mmshap_audio"]].mean().reset_index()

# Short display label for every question category
questions_label_mapping = {
    "temporal relations between elements": "Temporal \n Relations",
    "mood and expression": "Mood",
    "lyrics": "Lyrics",
    "historical and cultural context": "Cultural \n Context",
    "genre and style": "Genre",
    "functional context": "Functional \n Context",
    "melody": "Melody",
    "harmony": "Harmony",
    "metre and rhythm": "Metre & \nRhythm",
    "structure": "Structure",
    "performance": "Performance",
    "instrumentation": "Instrumentation",
    "sound texture": "Sound \n Texture",
    "dynamics and expression": "Dynamics & \nExpression",
}

# temporary: missing values become 0 so that everything can be plotted
finegrained_results = finegrained_results.fillna(0)
agg_results = agg_results.fillna(0)

# Attach the dataset-level reference to every per-category row; the two
# mmshap_audio columns come out with the default _x / _y suffixes.
finegrained_with_ref = finegrained_results.merge(
    agg_results,
    how='left',
    left_on=['model', 'experiment'],
    right_on=['model', 'experiment'],
)

In [None]:
# Give the merged A-SHAP columns meaningful names (_x: per category, _y: dataset-level reference)
finegrained_with_ref = finegrained_with_ref.rename(
    columns={
        "mmshap_audio_x": "question_type_ashap",
        "mmshap_audio_y": "dataset_ashap",
    }
)

# Short category label used on the plot axis
finegrained_with_ref["question_cat"] = finegrained_with_ref.apply(
    lambda row: questions_label_mapping[row["question_type"]],
    axis=1,
)
# How far the category sits above/below the dataset-level mean
finegrained_with_ref["diff"] = finegrained_with_ref["question_type_ashap"].sub(
    finegrained_with_ref["dataset_ashap"]
)

# Legend name of the experiment: few_shot -> "M-Choice", anything else -> "Q-Only"
finegrained_with_ref["experiment_name"] = finegrained_with_ref["experiment"].apply(
    lambda name: {"few_shot": "M-Choice"}.get(name, "Q-Only")
)

tmp = finegrained_with_ref[["model", "experiment_name", "question_cat", "diff"]]

# One table per model for the two stacked bar plots
qwen_tmp = tmp.loc[tmp["model"] == "QwenAudio"]
mu_tmp = tmp.loc[tmp["model"] == "MU-LLaMA"]

In [None]:
# Two stacked bar plots (QwenAudio on top, MU-LLaMA below) of the per-category
# A-SHAP difference to the dataset mean; the x axis is shared.
fig, ax = plt.subplots(2,1, sharex=True, sharey=False)
for a in ax:
    # keep only the bottom spine
    a.spines['top'].set_visible(False)
    a.spines['right'].set_visible(False)
    a.spines['left'].set_visible(False)

    # zero reference line, with the horizontal grid drawn behind the bars
    a.axhline(y=0, color="black", linewidth=0.5, alpha=0.5)
    a.grid(axis="y")
    a.set(axisbelow=True)

sns.barplot(qwen_tmp, x="question_cat", y="diff", hue="experiment_name", ax=ax[0])
sns.barplot(mu_tmp, x="question_cat", y="diff", hue="experiment_name", ax=ax[1])
# NOTE(review): plt.xticks acts on the current axes and is handed category
# strings as tick positions — presumably this relies on the categorical x axis
# converting them; confirm on the pinned matplotlib/seaborn versions.
plt.xticks(ticks=qwen_tmp["question_cat"].values, rotation=73, fontsize=10)

# single legend, placed above the top panel
ax[0].legend(loc="upper center", ncols=2, bbox_to_anchor=(0.5,1.25))
ax[1].get_legend().remove()

# the two panels use y ranges that differ by a factor of 10
ax[0].set_ylim(-0.1, 0.1)
ax[1].set_ylim(-0.01, 0.01)

ax[1].set_xlabel("")
# "\\Delta" keeps the runtime text identical to the old "\Delta" while avoiding
# the invalid "\D" escape sequence, which Python warns about.
ax[0].set_ylabel("QwenAudio\n$\\Delta$ A-SHAP$_{avg}$")
ax[1].set_ylabel("MU-LLaMA\n$\\Delta$ A-SHAP$_{avg}$")
plt.savefig("paper_figures/ashap_avg_comparison.png", bbox_inches="tight")
plt.show()

# spider plot

In [None]:
selected_experiments = results[results["experiment"].isin(["few_shot", "question_only"])]

# Kept for inspection only: chaining both explodes yields one row per
# (knowledge, reasoning) pair of every question, i.e. a cross product.
results_exploded = selected_experiments.explode("knowledge").explode("reasoning")

# Explode each category column on its own. Aggregating on the cross product
# counted a question once per tag of the *other* column, so a question with
# two reasoning tags weighed twice in every knowledge-category mean (and vice
# versa). Empty category lists explode to NaN and are dropped.
results_knowledge = selected_experiments.explode("knowledge").dropna(subset=["knowledge"])
results_reasoning = selected_experiments.explode("reasoning").dropna(subset=["reasoning"])

# Mean A-SHAP per category, model and experiment
knowledge_grouped = (
    results_knowledge
    .groupby(["knowledge", "model", "experiment"])
    .agg({"mmshap_audio": "mean"})
    .reset_index()
    .rename(columns={"knowledge": "question_type"})
)

reasoning_grouped = (
    results_reasoning
    .groupby(["reasoning", "model", "experiment"])
    .agg({"mmshap_audio": "mean"})
    .reset_index()
    .rename(columns={"reasoning": "question_type"})
)

In [None]:
# Stack the knowledge and reasoning tables into one frame keyed by "question_type"
# (ignore_index is not passed, so both halves keep their own 0-based row labels)
finegrained_results = pd.concat([knowledge_grouped, reasoning_grouped])

In [None]:
# Short display label for every question category ("\n" breaks long labels
# over two lines). Same content as the mapping defined in the bar plot section.
questions_label_mapping ={
    "temporal relations between elements": "Temporal \n Relations",
    "mood and expression": "Mood",
    "lyrics": "Lyrics",
    "historical and cultural context": "Cultural \n Context",
    "genre and style": "Genre",
    "functional context": "Functional \n Context",
    "melody": "Melody",
    "harmony": "Harmony",
    "metre and rhythm": "Metre & \nRhythm",
    "structure": "Structure",
    "performance": "Performance",
    "instrumentation": "Instrumentation",
    "sound texture": "Sound \n Texture",
    "dynamics and expression": "Dynamics & \nExpression",
}

In [None]:
# Human-readable experiment names
exp_map = {"question_only": "Question Only", "few_shot": "Few-Shot"}
# Line style per model and experiment; each inner dict is unpacked as keyword
# arguments into ax.plot when the spider plot is drawn.
style_map = {
    "QwenAudio": {
        "question_only": {"color": "deepskyblue", "linestyle":'--', "label": "Qwen-Audio - Q-Only"},
        "few_shot": {"color": "darkblue", "linestyle": ':', "label": "Qwen-Audio - M-Choice"}
    },
    "MU-LLaMA": {
        "question_only": {"color": "orangered", "linestyle":'-', "label": "MU-LLaMA - Q-Only"},
        "few_shot": {"color": "darkred", "linestyle":'-.', "label": "MU-LLaMA - M-Choice"},
    }
}

In [None]:
# Sanity check: number of distinct question types available for QwenAudio / question_only
qwen_question_only = (
    (finegrained_results["experiment"] == "question_only")
    & (finegrained_results["model"] == "QwenAudio")
)
len(finegrained_results.loc[qwen_question_only, "question_type"].unique())

In [None]:
# for this, we are comparing few shot vs question only
fig, ax = plt.subplots(1, 1, figsize=(10, 5), subplot_kw=dict(polar=True))

plotted_models = ["QwenAudio"]  # , "MU-LLaMA"]
plotted_rows = finegrained_results[finegrained_results["model"].isin(plotted_models)]
experiments = finegrained_results["experiment"].unique()

# One explicit category order shared by every curve and by the tick labels.
# The labels used to be read after the loop through the leaked `model` /
# `experiment` variables, and the values were taken positionally per
# experiment, so labels and values could silently disagree.
categories_raw = plotted_rows["question_type"].unique()
categories = [questions_label_mapping[c] for c in categories_raw]
N = len(categories)

# What will be the angle of each axis in the plot (first angle repeated to close the polygon)
angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
angles += angles[:1]

# Plot each experiment
for model in plotted_models:
    for experiment in experiments:
        curve_rows = plotted_rows[
            (plotted_rows["experiment"] == experiment) &
            (plotted_rows["model"] == model)
        ]
        # Look values up by category rather than by position; a category
        # missing for this curve becomes NaN instead of shifting the others.
        values = (
            curve_rows.groupby("question_type")["mmshap_audio"].mean()
            .reindex(categories_raw)
            .to_numpy()
        )
        # close the polygon
        values = np.append(values, values[0])
        ax.plot(angles, values, **style_map[model][experiment])
        ax.fill(angles, values, alpha=0.05)

ax.tick_params(pad=22)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=10)
ax.set_yticks(np.linspace(0,1,6), np.round(np.linspace(0,1,6),2), color="black", size=10, alpha=0.7)
ax.set_ylim(0, 0.2)
#ax.legend(loc="upper center", bbox_to_anchor=(0.5, 1.3), ncol=2)
ax.legend(loc="lower center", bbox_to_anchor=(0.5, -0.3), ncol=2)


# plt.savefig("paper_figures/spider_plot.png", bbox_inches="tight")
plt.show()