In [None]:
from libs.dataset_loader import MulTweEmoDataset
import sklearn.metrics as skm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import json

In [None]:
def threshold_predictions(data, threshold):
    """Binarize prediction scores against a decision threshold.

    Args:
        data: Array-like of scores (converted with ``np.asarray``).
        threshold: A scalar, or an array-like that broadcasts against ``data``
            (e.g. one threshold per column).

    Returns:
        A float64 array with the shape of ``data`` holding 1.0 where the score
        is strictly greater than the threshold and 0.0 elsewhere.
    """
    scores = np.asarray(data)
    # A single broadcast comparison replaces the row-by-row Python loop.
    return (scores > np.asarray(threshold)).astype(float)

def get_metrics(labels, data, target_names):
    """Build a metric table from scikit-learn's classification report.

    Args:
        labels: Ground-truth label indicators.
        data: Binarized predictions to score against ``labels``.
        target_names: Display names of the classes, passed to the report.

    Returns:
        A DataFrame with one row per report entry and one column per metric;
        the "support" column is dropped and both row and column labels are
        capitalized with ``str.capitalize``.
    """
    report = skm.classification_report(
        labels,
        data,
        target_names=target_names,
        zero_division=0,
        output_dict=True,
    )
    table = pd.DataFrame(report).T.drop(columns="support")
    table.index = [str.capitalize(entry) for entry in table.index]
    table.columns = [str.capitalize(metric) for metric in table.columns]
    return table

def plot_metrics(labels, data, target_names):
    """Draw the table returned by ``get_metrics`` as a grouped bar chart.

    The y axis is ticked every 0.1 from 0 to 1 and the x tick labels are
    rotated by 45 degrees. Nothing is returned.
    """
    tick_values = [step / 10 for step in range(0, 11)]
    metric_table = pd.DataFrame(get_metrics(labels, data, target_names))
    ax = metric_table.plot(kind="bar", figsize=(10,4), yticks=tick_values)
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    # Keep the horizontal grid lines behind the bars.
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    
def metrics_to_latex(labels, data, target_names):
    """Return the ``get_metrics`` table as LaTeX source with 4-decimal floats."""
    metric_table = get_metrics(labels, data, target_names)
    latex_source = metric_table.to_latex(float_format="%.4f")
    return latex_source

In [None]:
# Model variant analysed by the rest of the notebook; it selects the prediction
# files, the Optuna trials and the output folders used below.
model_type = "base"
datasets = ["train", "val", "test"]
predictions = {}

# Class indices kept for the analysis; the "high_support" variant uses a
# reduced label set and asks the loader to drop low-support classes.
classes = list(range(9))
drop_low_support=False
if model_type == "high_support":
    classes = [0,1,2,4,5,6]
    drop_low_support=True

load_dir = "./multimodal_results"

val, _ = MulTweEmoDataset.load(csv_path="./dataset/val_MulTweEmo.csv", drop_something_else=True, drop_low_support=drop_low_support, test_split=None)
test, _ = MulTweEmoDataset.load(csv_path="./dataset/test_MulTweEmo.csv", drop_something_else=True, drop_low_support=drop_low_support, test_split=None)
train, _ =  MulTweEmoDataset.load(csv_path="./dataset/train_MulTweEmo.csv", drop_something_else=True, drop_low_support=drop_low_support, test_split=None)

# Load the saved prediction array of every split. The loop variable is named
# `split` (not `set`) so the builtin `set` is not shadowed for the rest of the
# kernel session.
for split in datasets:
    with open(f"{load_dir}/{model_type}/{split}_predictions.np", "rb") as f:
        predictions[split] = np.load(f)

In [None]:
# Open the three-objective Optuna study of the selected model type from a local
# SQLite file (created if missing, because load_if_exists=True).
storage_name = "sqlite:///final_study_2.db"
# Hard-coded position, per model type, of the trial whose parameters are shown.
# NOTE(review): presumably the trial used to train the final model — confirm.
final_model_trial = {"bert": 268,
                     "base":260,
                     "base_captions":287,
                     "base_augment":214,
                     "high_support":187,
                     "text_only":148}
study = optuna.create_study(study_name=model_type+"_final_study", storage=storage_name, load_if_exists=True, directions=["minimize", "maximize", "maximize"])
trials = study.get_trials()
# The value is used as a list position in `trials`, not looked up by trial number.
params = trials[final_model_trial[model_type]].params
params

In [None]:
# def format_number(number):
#     if type(number) is int:
#         return str(number)
#     else:
#         return "{:.6f}".format(number)
# Emit the selected trial's hyper-parameters as a one-row LaTeX tabular.
# Underscores in parameter names are escaped for LaTeX; the backslash is written
# as "\\_" because "\_" is an invalid escape sequence in a normal string.
print("\\begin{tabular}{|" + "r|"*len(params.keys()) + "}\n\\hline")
print(" & ".join([key.replace("_", "\\_") for key in params.keys()]) + "\\\\\n\\hline")
print(" & ".join([str(value) for value in params.values()]) + "\\\\\n\\hline")
print("\\end{tabular}")

In [None]:
import math

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of a scalar.

    The negative branch uses the algebraically equivalent form e^x / (1 + e^x),
    so ``math.exp`` is never called with a large positive argument and strongly
    negative inputs return 0.0 instead of raising ``OverflowError``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [None]:
# Working copies of the loaded prediction arrays, one per split.
val_predictions = np.array(predictions["val"])
test_predictions = np.array(predictions["test"])
train_predictions = np.array(predictions["train"])
# Class names matching the label set selected by `drop_low_support`.
emotions = MulTweEmoDataset.get_labels(drop_low_support=drop_low_support)

In [None]:
# Model-specific alignment between the datasets and the prediction arrays.
# Every array is rebuilt from the raw `predictions` dict, so re-running this
# cell cannot apply the sigmoid or the row selection twice.
if model_type == "bert":
    test = test.drop_duplicates(subset=["id"])
    val = val.drop_duplicates(subset=["id"])
    train = train.drop_duplicates(subset=["id"])
    # Keep only the prediction rows of the de-duplicated test samples.
    # NOTE(review): val/train predictions are not re-indexed the same way —
    # presumably they were already produced on de-duplicated data; confirm.
    test_predictions = np.array(predictions["test"])[test.index]
    # The stored "bert" outputs are passed through a sigmoid before thresholding.
    val_predictions = np.vectorize(sigmoid)(np.array(predictions["val"]))
    test_predictions = np.vectorize(sigmoid)(test_predictions)
    train_predictions = np.vectorize(sigmoid)(np.array(predictions["train"]))
if model_type == "text_only":
    test = test.drop_duplicates(subset=["id"])
    val = val.drop_duplicates(subset=["id"])
    train = train.drop_duplicates(subset=["id"])
if model_type == "base_augment":
    # Keep only as many training predictions as there are rows in `train`.
    train_predictions = np.array(predictions["train"])[:train.shape[0]]

In [None]:
# Stack the per-sample "labels" column of each split into one array per split.
val_labels = np.array(val["labels"].to_list())
test_labels = np.array(test["labels"].to_list())
train_labels = np.array(train["labels"].to_list())

# Model evaluation on validation and test data

In [None]:
# Binarize every split with the same default threshold of 0.5.
val_default = threshold_predictions(val_predictions, 0.5)
test_default = threshold_predictions(test_predictions, 0.5)
train_default = threshold_predictions(train_predictions, 0.5)

In [None]:
# Display the raw validation prediction scores.
val_predictions

In [None]:
# Accuracy score and metric table on the validation split at threshold 0.5.
# The thresholded array recomputed here equals `val_default`.
print(skm.accuracy_score(val_labels, threshold_predictions(val_predictions, 0.5)))
get_metrics(val_labels, val_default, target_names=emotions)

In [None]:
# Bar chart of the validation metrics at the default threshold.
plot_metrics(val_labels, val_default, target_names=emotions)

In [None]:
# LaTeX source of the validation metric table at the default threshold.
print(metrics_to_latex(val_labels, val_default, emotions))

In [None]:
# Accuracy score and metric table on the test split at threshold 0.5.
# The thresholded array recomputed here equals `test_default`.
print(skm.accuracy_score(test_labels, threshold_predictions(test_predictions, 0.5)))
get_metrics(test_labels, test_default, target_names=emotions)

In [None]:
# Per-class positive counts in the test labels, followed by their total repeated
# four times so the vector also covers the four trailing rows of the metric table.
test_supports = np.array(test_labels).sum(axis=0).astype(int)
test_supports = np.append(test_supports,[test_supports.sum()]*4)
default_threshold_results = get_metrics(test_labels, test_default, target_names=emotions)
# Appends a trailing space to every column name of the freshly built table.
default_threshold_results.columns = default_threshold_results.columns.map(lambda x: x+" ")
default_threshold_results["Support"] = test_supports
print(default_threshold_results.to_latex(float_format="%.4f", column_format="l|rrr|r"))

In [None]:
# Hard-coded test loss per model type.
# NOTE(review): presumably copied from separate evaluation runs — confirm source.
test_losses = {
"bert": 0.38037171959877014,
"base": 0.3544072210788727,
"base_captions": 0.37323105335235596,
"base_augment": 0.4171484708786011,
"high_support": 0.42006585001945496,
"text_only": 0.3724941909313202}

# Keys (and order) of the metrics selected for the summary table.
metrics = ['loss', 'exact_match', 'precision', 'recall', 'f1_score']

with open(f"checkpoint_metrics/{model_type}.json", "r") as fp:
    training_results = json.load(fp)
summary_results = {"train": {}, "val": {}, "test": {}}

# Train/validation rows are the entry stored under the last key of the JSON.
# NOTE(review): presumably the last key is the final checkpoint — confirm.
summary_results["train"] = training_results["train"][list(training_results["train"].keys())[-1]]
summary_results["val"] = training_results["val"][list(training_results["val"].keys())[-1]]

# summary_results["val"]["loss"] =  training_results["val"][list(training_results["val"].keys())[-1]]["loss"]
# summary_results["val"]["exact_match"] =  skm.accuracy_score(val_labels, val_default)
# summary_results["val"]["precision"] =  skm.precision_score(val_labels, val_default, zero_division=0, average="samples")
# summary_results["val"]["recall"] =  skm.recall_score(val_labels, val_default, zero_division=0, average="samples")
# summary_results["val"]["f1_score"] =  skm.f1_score(val_labels, val_default, zero_division=0, average="samples")

# The test row is computed here from the 0.5-thresholded test predictions,
# using sample-averaged precision/recall/F1.
summary_results["test"]["loss"] = test_losses[model_type]
summary_results["test"]["exact_match"] = skm.accuracy_score(test_labels, threshold_predictions(test_predictions, 0.5))
summary_results["test"]["precision"] =  skm.precision_score(test_labels, test_default, zero_division=0, average="samples")
summary_results["test"]["recall"] =  skm.recall_score(test_labels, test_default, zero_division=0, average="samples")
summary_results["test"]["f1_score"] =  skm.f1_score(test_labels, test_default, zero_division=0, average="samples")

# From here on `summary_results` is a DataFrame (one row per split), no longer a dict.
summary_results = pd.DataFrame(summary_results)
summary_results.columns = ["Training", "Validation", "Test"]
summary_results = summary_results.T[metrics]
# Display names for the columns; "exact_match" is shown as "Accuracy".
cols = ["Loss", "Accuracy", "Precision", "Recall", "F1-score"]
summary_results.columns = cols
print(summary_results.to_latex(float_format="%.4f", column_format="l|rrrrr"))
summary_results

In [None]:
# Bar chart of the test metrics at the default threshold.
plot_metrics(test_labels, test_default, target_names=emotions)

In [None]:
# LaTeX source of the test metric table at the default threshold.
print(metrics_to_latex(test_labels, test_default, emotions))

In [None]:
# Number of positive predictions per class on the test split (default threshold).
test_default.sum(axis=0)

In [None]:
# Mean number of labels per sample in the test ground truth
# (the labels are binarized at 0.5 before being summed per row).
threshold_predictions(np.array(test_labels), 0.5).sum(axis=1).mean()

In [None]:
# Heatmap of the pairwise correlation between the predicted class columns on the
# test split at threshold 0.5; the colour scale is fixed to [-1, 1].
fig = plt.figure(figsize=(12, 7))
sns.heatmap(pd.DataFrame(threshold_predictions(test_predictions, 0.5)).corr(), annot = True, fmt = '.3f', xticklabels=emotions, yticklabels=emotions, vmin=-1, vmax=1)
plt.show()
plt.close()

# Threshold analysis

In [None]:
# One precision-recall curve per emotion, computed on the training split.
plt.figure(0, figsize=(10,5)).clf()

for i, emotion in enumerate(emotions):
    # The curve's threshold array is discarded so the notebook-level
    # `thresholds` variable used by later cells is not overwritten.
    precision, recall, _ = skm.precision_recall_curve(train_labels[:, i], train_predictions[:, i])
    # Recall goes on x and precision on y, matching the axis labels below.
    plt.plot(recall, precision, label=f"{emotion.capitalize()}")

plt.xlabel("recall")
plt.ylabel("precision")
plt.legend(loc=0)
plt.show()

In [None]:
# One ROC curve per emotion on the test split, with its AUC in the legend.
plt.figure(0, figsize=(10,5)).clf()

for i, emotion in enumerate(emotions):
    # The curve's threshold array is discarded so the notebook-level
    # `thresholds` variable used by later cells is not overwritten.
    fpr, tpr, _ = skm.roc_curve(test_labels[:,i], test_predictions[:,i])
    auc = skm.auc(fpr, tpr)
    plt.plot(fpr,tpr,label=f"{emotion.capitalize()}, auc="+"{:.4f}".format(auc))

plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend(loc=0)
plt.show()
# display = skm.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,

#                                 estimator_name='example estimator')
# display.plot()

In [None]:
class Objective(object):
    """Optuna objective that searches one decision threshold per emotion.

    Each trial samples a threshold per emotion inside ``ranges``, binarizes the
    training and validation scores with them, and returns the sample-averaged
    precision and recall on the training split. A trial is pruned when its
    validation precision or recall falls below the given baseline.

    Args:
        train_logits: Training scores compared against the thresholds (copied).
        train_labels: Training label indicators.
        val_logits: Validation scores compared against the thresholds (copied).
        val_labels: Validation label indicators.
        base_prec: Minimum validation precision a trial must reach.
        base_rec: Minimum validation recall a trial must reach.
        ranges: Mapping from emotion name to ``[low, high]`` threshold bounds.
    """

    def __init__(self, train_logits, train_labels, val_logits, val_labels, base_prec, base_rec, ranges):
        self.train_logits = train_logits.copy()
        self.train_labels = train_labels
        self.val_logits = val_logits.copy()
        self.val_labels = val_labels
        self.base_prec = base_prec
        self.base_rec = base_rec
        self.ranges = ranges

    def __call__(self, trial):
        """Evaluate one trial and return ``(train_precision, train_recall)``.

        Raises:
            optuna.exceptions.TrialPruned: If the validation precision or recall
                is below the baseline.
        """
        # Reads the notebook-level `drop_low_support` flag to pick the label set.
        emotions = MulTweEmoDataset.get_labels(drop_low_support=drop_low_support)
        thresholds = np.array([
            trial.suggest_float(emotion, self.ranges[emotion][0], self.ranges[emotion][1])
            for emotion in emotions
        ])

        # Broadcast the per-class thresholds over all rows at once.
        preds = (self.train_logits > thresholds).astype(float)
        train_precision = skm.precision_score(self.train_labels, preds, zero_division=0, average="samples")
        train_recall = skm.recall_score(self.train_labels, preds, zero_division=0, average="samples")

        val_preds = (self.val_logits > thresholds).astype(float)
        val_precision = skm.precision_score(self.val_labels, val_preds, zero_division=0, average="samples")
        val_recall = skm.recall_score(self.val_labels, val_preds, zero_division=0, average="samples")

        if val_precision < self.base_prec:
            raise optuna.exceptions.TrialPruned()
        if val_recall < self.base_rec:
            raise optuna.exceptions.TrialPruned()

        trial.set_user_attr("Precision", val_precision)
        trial.set_user_attr("Recall", val_recall)

        # The full validation report is only built for trials that were not pruned.
        metrics = skm.classification_report(self.val_labels, val_preds, output_dict=True, zero_division=0, target_names=emotions)
        for key, value in metrics.items():
            trial.set_user_attr(key, value)

        # Number of training samples that receive no label with these thresholds.
        count = int(np.count_nonzero(preds.sum(axis=1) == 0))
        trial.set_user_attr("no_prediction_samples", count)

        return train_precision, train_recall


In [None]:
# Every emotion's threshold is searched in the same interval [0.15, 0.6].
ranges = {e: [0.15, 0.6] for e in emotions}

# Validation metrics at the default threshold; their sample-averaged precision
# and recall are the pruning baselines passed to the objective.
val_metrics = get_metrics(val_labels, val_default, target_names=emotions)

objective = Objective(train_predictions, train_labels, val_predictions, val_labels,
                       base_prec=val_metrics["Precision"]["Samples avg"], base_rec=val_metrics["Recall"]["Samples avg"], ranges=ranges)
# Note: `storage_name`, `study` and `trials` are rebound here; they no longer
# refer to the final-model study loaded earlier in the notebook.
storage_name = f"sqlite:///threshold_training_study.db"
study = optuna.create_study(study_name=f"{model_type}", storage=storage_name, load_if_exists=True, directions=["maximize", "maximize"])
study.set_metric_names(["precision", "recall"])

# Only optimize when the stored study is empty; otherwise reuse its trials.
if len(study.trials) == 0:
    study.optimize(objective,n_trials=500)
trials = study.get_trials()

In [None]:
class Objective_val(object):
    """Optuna objective that replays validation metrics stored on earlier trials.

    The trial with number ``k`` of the new study reports the "Precision" and
    "Recall" user attributes of ``trials[k]``; source trials that were pruned
    are pruned again.

    Args:
        trials: Sequence of trials carrying the stored user attributes.
    """

    def __init__(self, trials):
        self.trials = trials

    def __call__(self, trial):
        """Return ``(precision, recall)`` stored on the matching source trial.

        Raises:
            optuna.exceptions.TrialPruned: If the source trial was pruned.
        """
        # Use the trials handed to the constructor, not the notebook-level
        # `trials` variable, which is rebound right after this study runs.
        source_trial = self.trials[trial.number]
        if source_trial.state == optuna.trial.TrialState.PRUNED:
            raise optuna.exceptions.TrialPruned()
        precision = source_trial.user_attrs["Precision"]
        recall = source_trial.user_attrs["Recall"]
        return precision, recall
    
# Replay the training-study trials into a second study whose objective values
# are the stored validation precision/recall.
objective = Objective_val(trials)
storage_name = f"sqlite:///threshold_val_study.db"
study_val = optuna.create_study(study_name=f"{model_type}", storage=storage_name, load_if_exists=True, directions=["maximize", "maximize"])
study_val.set_metric_names(["precision", "recall"])

# Only replay when the stored study is empty. The number of replayed trials is
# hard-coded to 500 — it must not exceed the number of source trials.
if len(study_val.trials) == 0:
    study_val.optimize(objective,n_trials=500)
# From here on `trials` refers to the validation study's trials.
trials = study_val.get_trials()

In [None]:
# Pareto front of the validation study (x: precision, y: recall).
fig = optuna.visualization.plot_pareto_front(study=study_val, targets=lambda x:(x.values[0], x.values[1]), target_names=["Precision", "Recall"])

# Fixed figure size, no title, tight margins.
fig.update_layout(
    autosize=False,
    width=1100,
    height=400,
    title=None,
    margin=dict(l=20, r=20, t=20, b=20),
    font_size=14,
)
fig.show()

In [None]:
# Hard-coded position, per model type, of the training-study trial whose
# thresholds are used for the "optimized" evaluation below.
best_treshold_trials = {
    "bert":497,
    "base":323,
    "base_captions":158,
    "base_augment":474,
    "high_support":184,
    "text_only": 414
}
trial = study.trials[best_treshold_trials[model_type]]
# Look the thresholds up by emotion name so they are aligned with the class
# columns explicitly rather than through the iteration order of `trial.params`.
thresholds = [trial.params[emotion] for emotion in emotions]
# thresholds = [thresholds[i] for i in classes]
# NOTE(review): the first threshold is overridden with a hard-coded value for
# every model type — confirm this is intended and where the number comes from.
thresholds[0]=0.09033203

In [None]:
# Same Pareto front as before, with the selected trial marked by a star and the
# figure written to the model's output folder.
fig = optuna.visualization.plot_pareto_front(study=study_val, targets=lambda x:(x.values[0], x.values[1]), target_names=["Precision", "Recall"])

fig.update_layout(
    autosize=False,
    width=1100,
    height=400,
    title=None,
    margin=dict(l=20, r=20, t=20, b=20),
    font_size=14,
)
# The marker position is the validation precision/recall stored on the trial.
fig.add_scatter(x=[trial.user_attrs["Precision"]], y=[trial.user_attrs["Recall"]], marker_size=12, marker_symbol="star", marker_color="Yellow", 
                marker_line_width=1, marker_line_color="black", showlegend=False)
fig.write_image(f"final_models/{model_type}/pareto_front.png")
fig.show()

In [None]:
# Print a LaTeX table with one column per emotion and a single row holding the
# selected thresholds rounded to 5 decimals.
print("\\begin{table}\n\t\\centering")
print("\t\\begin{adjustbox}{width=\\textwidth,center=\\textwidth}")
print("\t\\begin{tabular}{l|" + "r"*len(emotions) + "}")
print("\t\t\\toprule")
print("\t\tEmotion & " + " & ".join([e.capitalize() for e in emotions]) + "\\\\ \n\t\t\\midrule")
print("\t\tThreshold & " + " & ".join([str(round(t, 5)) for t in thresholds]) + "\\\\")
# print(" & ".join([key.replace("_", "\_") for key in params.keys()]) + "\\\\\n\\hline")
# print(" & ".join([str(value) for value in params.values()]) + "\\\\\n\\hline")
print("\t\t\\bottomrule")
print("\t\\end{tabular}")
print("\t\\end{adjustbox}")
# The caption text starts with an empty "()" placeholder.
print(f"\t\\caption{{() Optimal thresholds}}\n\t\\label{{tab:{model_type}_thresholds}}")
print("\\end{table}")

In [None]:
# Binarize validation and test scores with the per-emotion thresholds.
val_custom = threshold_predictions(val_predictions, thresholds)
test_custom =  threshold_predictions(test_predictions, thresholds)

In [None]:
# Accuracy score on the validation split with the per-emotion thresholds.
print(skm.accuracy_score(val_labels, val_custom))
# `unique` is sorted, so the first count is the number of samples with zero
# predicted labels only when the smallest per-sample label count is 0.
unique, counts = np.unique((val_custom).sum(axis=1), return_counts=True)
print(0 if unique[0]!=0 else counts[0])
get_metrics(val_labels, val_custom, emotions)

In [None]:
# Number of positive predictions per class on the test split (per-emotion thresholds).
test_custom.sum(axis=0)

In [None]:
# Accuracy score on the test split with the per-emotion thresholds.
print(skm.accuracy_score(test_labels, test_custom))
# `unique` is sorted, so the first count is the number of samples with zero
# predicted labels only when the smallest per-sample label count is 0.
unique, counts = np.unique((test_custom).sum(axis=1), return_counts=True)
print(0 if unique[0]!=0 else counts[0])
get_metrics(test_labels, test_custom, emotions)

In [None]:
# Capitalized emotion names, as used for row labels and tick labels.
print([e.capitalize() for e in emotions])

In [None]:
def plot_metrics(labels, data, target_names):
    """Print and plot the per-class rows of the ``get_metrics`` table.

    Only the rows named after ``target_names`` (capitalized) are kept, so the
    averaged rows of the report are left out of the chart. Nothing is returned.

    NOTE: this redefines the ``plot_metrics`` declared near the top of the
    notebook; cells executed after this one use this version.
    """
    class_rows = [name.capitalize() for name in target_names]
    per_class = get_metrics(labels, data, target_names).loc[class_rows]
    print(per_class.loc[class_rows])
    tick_values = [step / 10 for step in range(0, 11)]
    ax = pd.DataFrame(per_class).plot(kind="bar", figsize=(6,4), yticks=tick_values)
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    # Keep the horizontal grid lines behind the bars.
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)

In [None]:
# Larger tick labels; plt.rc changes the global matplotlib defaults, so this
# also applies to figures created afterwards.
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)
plot_metrics(test_labels, test_custom, emotions)
plt.legend(fontsize=12)

In [None]:
# Test metric tables for the default (0.5) and the per-emotion thresholds.
default_threshold_results = get_metrics(test_labels, test_default, emotions)

custom_threshold_results = get_metrics(test_labels, test_custom, emotions)


In [None]:
# Display the default-threshold metric table.
default_threshold_results

In [None]:
# Grouped bar chart comparing default and optimized thresholds for every row of
# the metric table; each metric contributes a "(Default)" and an "(Optimized)" bar.
metrics = ['Precision', 'Recall', 'F1-score']
comparison_dict = {key:{} for key in default_threshold_results.T.keys()}
for e in comparison_dict.keys():
    for metric in metrics:
        comparison_dict[e][metric.capitalize() + " (Default)"] = default_threshold_results[metric][e]
        comparison_dict[e][metric.capitalize() + " (Optimized)"] = custom_threshold_results[metric][e]
# One light/dark colour pair per metric, in the order the columns are created.
ax = pd.DataFrame(comparison_dict).T.plot(kind='bar', figsize=(13,4.5),
                                           color=["#ff0000", "#990000", "#0066ff", "#003399", "#33cc33", "#196619"], yticks=[x / 10 for x in range(0,11)])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_axisbelow(True)
ax.yaxis.grid(True)

# box = ax.get_position()
# ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
# # Put a legend to the right of the current axis
# ax.legend(loc='lower left', bbox_to_anchor=(1, 0.62))
ax.legend(loc='upper right')
plt.savefig(f"final_models/{model_type}/emotion_threshold_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

#### Number of samples in test set with no label assigned

In [None]:
# Number of test samples with no predicted label: first for the default
# threshold, then for the per-emotion thresholds. The first count only refers
# to zero-label samples when the smallest per-sample label count is 0.
unique, counts = np.unique(test_default.sum(axis=1), return_counts=True)
print(0 if unique[0]!=0 else counts[0])
unique, counts = np.unique(test_custom.sum(axis=1), return_counts=True)
print(0 if unique[0]!=0 else counts[0])

#### Average number of labels

In [None]:
# Mean number of predicted labels per test sample: default, then per-emotion thresholds.
print(test_default.sum(axis=1).mean())
print(test_custom.sum(axis=1).mean())

In [None]:
# Per-class positive counts in the test labels, followed by their total repeated
# four times so the vector also covers the four trailing rows of the metric tables.
test_supports = np.array(test_labels).sum(axis=0).astype(int)
test_supports = np.append(test_supports,[test_supports.sum()]*4)
# A trailing space keeps the default-threshold column names distinct from the
# optimized ones after the join. The rename is done on a copy, so
# `default_threshold_results` keeps its original column names and both this cell
# and the comparison-plot cell can be re-run safely.
default_results_suffixed = default_threshold_results.rename(columns=lambda x: x+" ")
joined_results = pd.concat([default_results_suffixed, custom_threshold_results], axis=1, join="inner")
joined_results["Support"] = test_supports
print(joined_results.to_latex(float_format="%.4f", column_format="l|rrr|rrr|r"))

In [None]:
# Difference between the class correlation matrix of the optimized test
# predictions and that of the ground-truth labels.
pd.DataFrame(test_custom).corr()-pd.DataFrame(test_labels).corr()

In [None]:
# Heatmap of the class correlation matrix of the optimized test predictions.
fig = plt.figure(figsize=(10, 7))
sns.heatmap(pd.DataFrame(test_custom).corr(), annot = True, fmt = '.3f',
             xticklabels=[e.capitalize() for e in emotions], yticklabels=[e.capitalize() for e in emotions],
             square=True)
plt.xticks(rotation=45, ha='right') 
plt.show()
plt.close()

In [None]:
# Heatmap of the class correlation matrix of the ground-truth test labels.
fig = plt.figure(figsize=(10, 7))
sns.heatmap(pd.DataFrame(test_labels).corr(), annot = True, fmt = '.3f',
             xticklabels=[e.capitalize() for e in emotions], yticklabels=[e.capitalize() for e in emotions],
             square=True)
plt.xticks(rotation=45, ha='right') 
plt.show()
plt.close()

In [None]:
# print(skm.classification_report(test_labels, test_predictions>0.5, target_names=emotions))

In [None]:
# Two groups of class indices whose co-occurrence in a prediction is counted below.
# NOTE(review): presumably positive vs. negative emotions — confirm against the
# label order returned by the dataset.
pos_classes=[1,4,7,8]
neg_classes=[0,2,3,6]
# pos_classes=[1,2,3,4,6,7,8]
# neg_classes=[5]

In [None]:
# Count, over all optimized test predictions, the (i, j) pairs of active classes
# with i in `pos_classes` and j in `neg_classes`. For one sample this is the
# number of active classes from the first group times the number from the second.
# The loop variable is named `row` so the `test` DataFrame is not overwritten.
kek = 0
for row in test_custom:
    active = np.flatnonzero(row)
    n_pos = sum(1 for i in active if i in pos_classes)
    n_neg = sum(1 for j in active if j in neg_classes)
    kek += n_pos * n_neg
kek

In [None]:
# Display the ground-truth test label matrix.
test_labels

In [None]:
def _multilabel_confusion_counts(labels, predictions):
    """Return the raw confusion counts and the per-class missed-label counts."""
    true_mask = np.asarray(labels) == 1
    pred_mask = np.asarray(predictions) == 1

    # Predicted classes that are not among the sample's true classes.
    spurious = pred_mask & ~true_mask
    # counts[t, p]: number of samples where t is a true class and p is a
    # spurious prediction. The diagonal of this product is always zero.
    counts = true_mask.astype(float).T @ spurious.astype(float)
    # Diagonal: true classes that were also predicted.
    counts[np.diag_indices_from(counts)] += (true_mask & pred_mask).sum(axis=0)

    # True classes that were not predicted, per class.
    missed = (true_mask & ~pred_mask).sum(axis=0).astype(float)
    return counts, missed


def confusion_matrix(labels, predictions, class_names, supports, title="Confusion Matrix", save_name=None):
    """Plot a row-normalized confusion matrix for multilabel predictions.

    For every sample and every true class ``t``: the diagonal cell ``(t, t)``
    is incremented when ``t`` was predicted, and the cell ``(t, p)`` is
    incremented for every predicted class ``p`` that is not a true class of the
    sample. Each row is then divided by the matching entry of ``supports``.
    The per-class count of true labels that were not predicted is printed.

    Args:
        labels: 2-D 0/1 indicator array, samples x classes.
        predictions: 2-D 0/1 indicator array with the same shape as ``labels``.
        class_names: Tick labels, one per class column.
        supports: Per-class divisor for the matching row; rows whose support is
            zero are left at 0 instead of dividing by zero.
        title: Title of the plot.
        save_name: When given, path the figure is saved to.
    """
    confusion_mtx, npl = _multilabel_confusion_counts(labels, predictions)

    row_support = np.asarray(supports, dtype=float)[:confusion_mtx.shape[0]].reshape(-1, 1)
    confusion_mtx = np.divide(confusion_mtx, row_support,
                              out=np.zeros_like(confusion_mtx), where=row_support != 0)

    def plot_confusion_matrix(cm, class_names):
        fig, ax = plt.subplots(figsize=(7,7))
        im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)

        # Colour bar in its own axes, placed just right of the matrix.
        cax = fig.add_axes([ax.get_position().x1+0.01,ax.get_position().y0,0.02,ax.get_position().height])
        ax.figure.colorbar(im, cax=cax)

        ax.set(
            xticks=np.arange(cm.shape[1]),
            yticks=np.arange(cm.shape[0]),
            xticklabels=class_names,
            yticklabels=class_names,
            xlabel='Predicted Label',
            ylabel='True Label',
            title=title,
        )
        plt.rc('font', size=11)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        # Cells darker than half the maximum get white text for contrast.
        thresh = cm.max() / 2
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(
                    j, i,  "{:.3f}".format(cm[i, j]),
                    ha='center', va='center',
                    color='white' if cm[i, j] > thresh else 'black', size=11
                )
        if save_name:
            plt.savefig(save_name, dpi=300, bbox_inches='tight')
        plt.show()

    print(npl)
    plot_confusion_matrix(confusion_mtx, class_names)



In [None]:
# Per-class positive counts in the test labels, used to normalize the matrix rows.
test_supports = np.array(test_labels).sum(axis=0)
test_supports

In [None]:
# Confusion matrix of the default-threshold test predictions, saved to the model folder.
confusion_matrix(np.array(test_labels), test_default, [e.capitalize() for e in emotions],
                  test_supports, "Default thresholds", save_name=f"final_models/{model_type}/conf_matrix_default.png")

In [None]:
# Confusion matrix of the optimized-threshold test predictions, saved to the model folder.
confusion_matrix(np.array(test_labels), test_custom, [e.capitalize() for e in emotions],
                  test_supports, "Optimized thresholds", save_name=f"final_models/{model_type}/conf_matrix_custom.png")

In [None]:
# Small hand-made example (4 samples, 3 classes) to sanity-check confusion_matrix.
true_example = np.array([[0, 1, 0],
                [1, 0, 1],
                [1, 0, 0],
                [0, 1, 1]])
pred_example = np.array([[1, 1, 0],
                [0, 1, 1],
                [1, 0, 0],
                [0, 0, 1]])
# Rows are normalized by true-class support, so the supports come from the
# ground truth, as in the real calls (here both sums happen to be [2, 2, 2]).
confusion_matrix(true_example, pred_example, class_names=[0,1,2], supports=true_example.sum(axis=0))

In [None]:
# skm.multilabel_confusion_matrix(test_labels, test_custom)

In [None]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# f, axes = plt.subplots(2, 5, figsize=(15, 10))
# axes = axes.ravel()
# for i, e in enumerate(emotions):
#     disp = ConfusionMatrixDisplay(confusion_matrix(np.array(test_labels)[:, i],
#                                                    test_custom[:, i]),
#                                   display_labels=[0, 1])
#     disp.plot(ax=axes[i])
#     disp.ax_.set_title(e.capitalize())
#     if i<5:
#         disp.ax_.set_xlabel('')
#     if i%5!=0:
#         disp.ax_.set_ylabel('')
#     disp.im_.colorbar.remove()
# plt.subplots_adjust(bottom=0.17, top=0.7, hspace=0.1)
# # f.colorbar(disp.im_, ax=axes)
# # plt.tight_layout()
# f.delaxes(axes[-1])
# # plt.rcParams.update({'font.size': 22})
# plt.show()


# Zero-shot LLaVA

In [None]:
# Reload the validation and test splits for the zero-shot evaluation. The
# `drop_low_support` argument is not passed here, so the loader's default applies.
# NOTE(review): `emotions` is still the list built earlier from `model_type`;
# if that list was built with drop_low_support=True it may not match these
# labels — confirm before running this section with model_type="high_support".
val, _ = MulTweEmoDataset.load(csv_path="./dataset/val_MulTweEmo.csv", drop_something_else=True, test_split=None)
test, _ = MulTweEmoDataset.load(csv_path="./dataset/test_MulTweEmo.csv", drop_something_else=True, test_split=None)
# Labels are kept as plain lists here (not numpy arrays as in the first section).
val_labels = val["labels"].to_list()
test_labels = test["labels"].to_list()

In [None]:
# Legend names for prompts 0-3, in index order.
prompt_names = ["Base",
                "Context",
                "Posting",
                "Expert"]

In [None]:
# F1-score of every report row for each of the four prompts, using the
# predictions stored under "list", evaluated on the validation split.
f1_scores = {}
for i in range(4):
    f1_scores[f"Prompt {i}"] = {}
    llava_results_path = f"./zero_shot_results/list/results_{i}.np"
    with open(llava_results_path, "rb") as f:
        val_predictions = np.load(f)
    results = skm.classification_report(val_labels, val_predictions, zero_division=0, target_names=emotions, output_dict=True)
    for key in results.keys():
        f1_scores[f"Prompt {i}"][key.capitalize()] = results[key]["f1-score"] 
    print(f"Average number of labels: {val_predictions.sum(axis=1).mean()}")
    print(skm.accuracy_score(val_labels, val_predictions))
ax = pd.DataFrame(f1_scores).plot(kind="bar", figsize=(7,4), yticks=[x / 10 for x in range(0,11)])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right');
ax.set_axisbelow(True)
ax.yaxis.grid(True)
# Replace the "Prompt i" legend entries with the descriptive prompt names.
ax.legend(prompt_names)
# box = ax.get_position()
# ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
# # Put a legend to the right of the current axis
# ax.legend(loc='lower left', bbox_to_anchor=(1, 0.7))
plt.savefig(f"final_models/llava/list_prompts_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Same comparison as the previous cell, for the predictions stored under
# "binary"; only the input folder and the output file name differ.
f1_scores = {}
for i in range(4):
    f1_scores[f"Prompt {i}"] = {}
    llava_results_path = f"./zero_shot_results/binary/results_{i}.np"
    with open(llava_results_path, "rb") as f:
        val_predictions = np.load(f)
    results = skm.classification_report(val_labels, val_predictions, zero_division=0, target_names=emotions, output_dict=True)
    for key in results.keys():
        f1_scores[f"Prompt {i}"][key.capitalize()] = results[key]["f1-score"] 
    print(f"Average number of labels: {val_predictions.sum(axis=1).mean()}")
    print(skm.accuracy_score(val_labels, val_predictions))
ax = pd.DataFrame(f1_scores).plot(kind="bar", figsize=(7,4), yticks=[x / 10 for x in range(0,11)])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right');
ax.set_axisbelow(True)
ax.yaxis.grid(True)
# Replace the "Prompt i" legend entries with the descriptive prompt names.
ax.legend(prompt_names)

# box = ax.get_position()
# ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
# # Put a legend to the right of the current axis
# ax.legend(loc='lower left', bbox_to_anchor=(1, 0.7))
plt.savefig(f"final_models/llava/binary_prompts_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Summary metrics per prompting method and prompt on the validation split.
# "Method 1" reads the predictions stored under "binary", "Method 2" those under "list".
results_table = {}
for j in range(1, 3):
    method_key = "Method "+str(j)
    results_table[method_key] = {}
    for i in range(4):
        if j == 1:
            llava_results_path = f"./zero_shot_results/binary/results_{i}.np"
        else:
            llava_results_path = f"./zero_shot_results/list/results_{i}.np"
        with open(llava_results_path, "rb") as f:
            val_predictions = np.load(f)

        model_results = get_metrics(val_labels, val_predictions, emotions)

        # Distribution of the number of predicted labels per sample.
        labels_per_sample = val_predictions.sum(axis=1)
        unique, count = np.unique(labels_per_sample, return_counts=True)
        print(unique,count)

        results_table[method_key]["Prompt "+str(i)] = {
            "Precision": model_results["Precision"]["Samples avg"],
            "Recall": model_results["Recall"]["Samples avg"],
            "F1-score": model_results["F1-score"]["Samples avg"],
            "Accuracy": skm.accuracy_score(val_labels, val_predictions),
            "Hamming": skm.hamming_loss(val_labels, val_predictions),
            # `unique` is sorted: the first count is the zero-label count only
            # when the smallest per-sample label count is 0.
            "No labels": 0 if unique[0]!=0 else count[0],
            "Average labels": labels_per_sample.mean(),
        }

results_table_1 = pd.DataFrame(results_table["Method 1"]).T
results_table_2 = pd.DataFrame(results_table["Method 2"]).T
results_table_1["No labels"] = results_table_1["No labels"].astype(int)
results_table_2["No labels"] = results_table_2["No labels"].astype(int)
# Only the "Method 1" table is printed here; `results_table_2` is built but not shown.
print(results_table_1.to_latex(float_format="%.4f", column_format="l|rrr|rr|rr"))

In [None]:
# Zero-shot predictions on the test split for prompt 3.
llava_results_path = "./zero_shot_results/test/results_3.np"
with open(llava_results_path, "rb") as f:
    test_predictions = np.load(f)

unique, count = np.unique(test_predictions.sum(axis=1), return_counts=True)
# `unique` is sorted, so count[0] is the number of samples with no predicted
# label only when the smallest per-sample label count is 0; otherwise no sample
# is unlabelled (same guard as in the other cells of this notebook).
count = 0 if unique[0] != 0 else count[0]
print(f"Average number of labels: {test_predictions.sum(axis=1).mean()}")
print(skm.accuracy_score(test_labels, test_predictions))

print(count, "samples with no label\n\n")
print(metrics_to_latex(test_labels, test_predictions, emotions))

In [None]:
# Side-by-side LaTeX table of the prompt-3 zero-shot metrics on validation
# ("list" predictions) and test.
with open("./zero_shot_results/list/results_3.np", "rb") as f:
    val_predictions = np.load(f)

with open("./zero_shot_results/test/results_3.np", "rb") as f:
    test_predictions = np.load(f)

val_results = get_metrics(val_labels, val_predictions, emotions)
test_results = get_metrics(test_labels, test_predictions, emotions)

# Per-class positive counts followed by their total repeated four times, so the
# vectors also cover the four trailing rows of the metric tables.
test_supports = np.array(test_labels).sum(axis=0).astype(int)
test_supports = np.append(test_supports,[test_supports.sum()]*4)

val_supports = np.array(val_labels).sum(axis=0).astype(int)
val_supports = np.append(val_supports,[val_supports.sum()]*4)

val_results["Support"] = val_supports
test_results["Support"] = test_supports
# A trailing space keeps the validation column names distinct from the test ones.
val_results.columns = val_results.columns.map(lambda x: x+" ")
print(pd.concat([val_results, test_results], axis=1, join="inner").to_latex(float_format="%.4f", column_format="l|rrrr|rrrr"))


# default_threshold_results.columns = default_threshold_results.columns.map(lambda x: x+" ")
# joined_results = pd.concat([default_threshold_results, custom_threshold_results], axis=1, join="inner")
# print(joined_results.to_latex(float_format="%.4f", column_format="l|rrrr|rrrr"))

In [None]:
# Reset to the plain per-class positive counts (without the appended totals).
test_supports = np.array(test_labels).sum(axis=0)

In [None]:
# Number of positive zero-shot predictions per class on the test split.
test_predictions.sum(axis=0)

In [None]:
# Confusion matrix of the zero-shot test predictions, saved to the llava folder.
confusion_matrix(np.array(test_labels), test_predictions, ([e.capitalize() for e in emotions]),
                  supports = test_supports, title = "Zero-shot LLaVA", save_name=f"final_models/llava/conf_matrix_custom.png")