In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from collections import Counter,defaultdict

# Raise pandas' display limits so wide/long result tables are shown in full.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 150),
):
    pd.set_option(_option_name, _option_value)

import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# %matplotlib inline
# The project root is the parent of the current working directory; the CSV
# inputs are read from its `data` sub-directory.
# NOTE(review): this assumes the kernel is started one level below the project root — confirm.
PROJECT_ROOT = Path.cwd().parent
data_path = PROJECT_ROOT / 'data'
print(PROJECT_ROOT)
import json

In [None]:
# Raw `group` identifier (dataset[split]) -> short display name.
group_name_map = {
    'anli[dev_r1]': 'ANLI R1',
    'anli[dev_r2]': 'ANLI R2',
    'anli[dev_r3]': 'ANLI R3',
    'AQuA[validation]': 'AQuA',
    'CommitmentBank[validation]': 'CB',
    'craigslist_bargains[validation]': 'CraigslistBargains',
    'RecognizingTextualEntailment[validation]': 'RTE',
    'WordsinContext[validation]': 'WiC',
}

In [None]:
# Baseline scores: keep only the T0/T5 runs on the groups listed in
# `group_name_map`, then replace the raw group ids with their display names.
baselines = pd.read_csv(data_path.joinpath('baselines.csv'))
baselines = baselines[
    baselines['group'].isin(group_name_map) & baselines['run_name'].isin(['T0', 'T5'])
].copy()
# Every remaining group is a key of the map (see the filter above), so a
# plain `.map` is equivalent to the per-row lambda lookup.
baselines['group'] = baselines['group'].map(group_name_map)

# Cross-task results, restricted to the CTBase run.
all_cross_task = pd.read_csv(data_path.joinpath('cross_task.csv'))
cross_task = all_cross_task[all_cross_task['run_name'] == 'CTBase'].copy()

# One median row per (name, prompt_id). The frame still carries text columns
# (`run_name`, `group`, `prompt_task`), which pandas >= 2.0 no longer drops
# silently in `median()`, so the numeric-only behaviour is requested explicitly.
cross_task_prompts = (
    cross_task
    .groupby(['name', 'prompt_id'])
    .median(numeric_only=True)
    .reset_index()
)

In [None]:
# Rank the values of every remaining column within each (group, prompt_task) bucket.
# NOTE(review): `ranks_df` is not read by any other visible cell — confirm it is still needed.
ranks_df = cross_task.groupby(['group','prompt_task']).rank()


In [None]:
# Rows flagged `training_task` vs. the remaining rows (the "No Prompt" rows
# are left out of the unseen set).
training_prompts_df = cross_task.loc[cross_task['training_task']].copy()
unseen_prompts_df = cross_task.loc[
    ~cross_task['training_task'] & cross_task['prompt_task'].ne("No Prompt")
].copy()

# Rows flagged `choices_in_prompt` vs. the rest.
with_choices = cross_task.loc[cross_task['choices_in_prompt']].copy()
no_choices = cross_task.loc[~cross_task['choices_in_prompt']].copy()

# The MCQ split is taken inside the `with_choices` subset only.
is_mcq = with_choices.loc[with_choices['is_mcq']].copy()
not_mcq = with_choices.loc[~with_choices['is_mcq']].copy()

# CTBase vs. CTNoText runs from the diff file.
diff_added_text = pd.read_csv(data_path / 'diff.csv')
diff_added_text = diff_added_text.loc[
    diff_added_text['run_name'].isin(["CTBase","CTNoText"])
]
no_text = diff_added_text.loc[diff_added_text['run_name'].eq('CTNoText')]
with_text = diff_added_text.loc[diff_added_text['run_name'].eq('CTBase')]


def get_ablation_stats(name, df):
    """Summarise the rank distribution of one ablation subset.

    The rows of ``df`` are first reduced to one median per
    ``(name, prompt_id)`` pair; the mean, median and quartiles of the
    resulting ``accuracy_rank`` and ``f1_rank`` columns are then reported.

    Args:
        name: Label of the ablation; copied into the result under ``'name'``.
        df: Frame with at least the columns ``name``, ``prompt_id``,
            ``accuracy_rank`` and ``f1_rank``.

    Returns:
        dict with the key ``'name'`` plus ``'<Metric> Mean'``,
        ``'<Metric> Median'``, ``'<Metric> Q1'`` and ``'<Metric> Q3'`` for
        the metrics ``Accuracy`` and ``F1``.
    """
    # `numeric_only=True` keeps the old "ignore text columns" behaviour that
    # pandas >= 2.0 no longer applies by default. The pipeline is evaluated
    # once and shared by both metrics (the former name-based branches were
    # identical).
    per_prompt = df.groupby(['name', 'prompt_id']).median(numeric_only=True)
    summary = per_prompt[['accuracy_rank', 'f1_rank']].describe()
    acc_stats = summary['accuracy_rank']
    f1_stats = summary['f1_rank']
    return {
        'name': name,
        'Accuracy Mean': acc_stats['mean'],
        'Accuracy Median': acc_stats['50%'],
        'Accuracy Q1': acc_stats['25%'],
        'Accuracy Q3': acc_stats['75%'],
        'F1 Mean': f1_stats['mean'],
        'F1 Median': f1_stats['50%'],
        'F1 Q1': f1_stats['25%'],
        'F1 Q3': f1_stats['75%'],
    }


# Label -> subset for every ablation that gets a row in the summary table.
ablations = {
    "Training Prompts": training_prompts_df,
    "Unseen Prompts": unseen_prompts_df,
    "With Choices": with_choices,
    "No Choices": no_choices,
    "Is MCQ": is_mcq,
    "Not MCQ": not_mcq,
    "Extra Text": with_text,
    "No Extra Text": no_text,
}
# One record per ablation, indexed by its label.
all_stats = pd.DataFrame.from_records(
    [get_ablation_stats(label, subset) for label, subset in ablations.items()],
    index=["name"],
)
all_stats

In [None]:
# Correlation of the prompt properties with the two rank metrics (CTBase rows).
# `DataFrame.corr()` no longer skips text columns on pandas >= 2.0, so the
# frame is restricted to numeric/bool columns first; this matches what older
# pandas versions did implicitly.
corr_df = (
    cross_task[cross_task['run_name'] == "CTBase"]
    .select_dtypes(include=['number', 'bool'])
    .corr()
)
# corr_df = cross_task_prompts.corr()
corr_df = corr_df.loc[
    ["choices_in_prompt", "is_mcq", "training_task", "prompt_tokens"],
    ["accuracy_rank", "f1_rank"]
]
# Disabled: extra row for the with/without extra-text comparison.
# text_vs_no_text = pd.read_csv(data_path.joinpath('diff.csv'))
# text_vs_no_text = text_vs_no_text[text_vs_no_text['run_name'].isin(['CTBase','CTNoText'])]
# text_vs_no_text['extra_text'] = text_vs_no_text['run_name'].apply(lambda l: 1 if l=="CTBase" else 0)

# corr_df = corr_df.append(text_vs_no_text.corr().loc[['extra_text'],['accuracy_rank','f1_rank']])
print(corr_df.to_latex(float_format='{:0.2f}'.format))

In [None]:
# No executed statement in the visible notebook assigns `text_vs_no_text`, so
# on a fresh kernel this cell raised NameError. Build the frame here: the
# CTBase/CTNoText rows of diff.csv plus a 0/1 `extra_text` flag (1 = CTBase).
text_vs_no_text = pd.read_csv(data_path.joinpath('diff.csv'))
text_vs_no_text = text_vs_no_text[
    text_vs_no_text['run_name'].isin(['CTBase', 'CTNoText'])
].copy()
text_vs_no_text['extra_text'] = (text_vs_no_text['run_name'] == "CTBase").astype(int)
text_vs_no_text[['run_name','extra_text','accuracy_rank']]

# Training Prompts Results

In [None]:
# Rows flagged `training_task` vs. the remaining rows without "No Prompt".
training_prompts_df = cross_task.loc[cross_task['training_task']].copy()
unseen_prompts_df = cross_task.loc[
    ~cross_task['training_task'] & cross_task['prompt_task'].ne("No Prompt")
].copy()

# Median rank per metric for each of the two subsets.
for header, subset in (
    (f"Training Prompts:", training_prompts_df),
    (f"Unseen Prompts:", unseen_prompts_df),
):
    print(header)
    print(f"\tF1 Rank:{subset['f1_rank'].median():0.2f}")
    print(f"\tAccuracy Rank:{subset['accuracy_rank'].median():0.2f}")

# Logit Range

In [None]:
# Two side-by-side scatter plots of the per-prompt median rank against the
# `logits/range_mean` column: accuracy on the left, F1 on the right.
fig = make_subplots(
    1,
    2,
    subplot_titles=["Accuracy","F1"],
)

acc_chart = go.Scatter(
    x=cross_task_prompts['logits/range_mean'].tolist(),
    y=cross_task_prompts['accuracy_rank'].tolist(),
    mode='markers',
    showlegend=False
)

f1_chart = go.Scatter(
    x=cross_task_prompts['logits/range_mean'].tolist(),
    y=cross_task_prompts['f1_rank'].tolist(),
    mode='markers',
    showlegend=False
)
# `append_trace` is deprecated in plotly; `add_trace` takes the same row/col
# arguments. The original insertion order (F1 first) is kept.
fig.add_trace(f1_chart, row=1, col=2)
fig.add_trace(acc_chart, row=1, col=1)
fig.update_layout(
    title_x=0.5,
    font=dict(size=15),
    template="plotly_white",
    legend_orientation='h',
    legend=dict(xanchor="center", x=0.5, bgcolor="rgba(0,0,0,0)"),
    width=1000,
    height=600,
)
fig['layout']['xaxis']['title']='Mean Range of the Log Probabilities'
fig['layout']['xaxis2']['title']='Mean Range of the Log Probabilities'
# Reversed y axes: smaller rank values are drawn towards the top.
fig['layout']['yaxis']['autorange'] = "reversed"
fig['layout']['yaxis2']['autorange'] = "reversed"
fig['layout']['yaxis']['title']='Median Rank out of 98 Prompts'
fig['layout']['yaxis2']['title']='Median Rank out of 98 Prompts'
# Request the same y range on both panels.
# NOTE(review): autorange="reversed" is also set on both y axes above — confirm
# which of the two settings the rendered figure actually honours.
fig.update_yaxes(range=[0,100])
# The file name contains no spaces, so the former space-to-dash substitution
# was a no-op and the literal is written directly.
fig.write_image('shared_ranks_graphs.png')
fig.show()

In [None]:
# Correlation of every per-prompt column with `logits/range_mean`.
# `cross_task_prompts` carries the grouping keys as ordinary columns after
# `reset_index()`; on pandas >= 2.0 `corrwith` no longer skips text columns,
# so only numeric/bool columns are passed in (what older versions did implicitly).
corr = (
    cross_task_prompts
    .select_dtypes(include=['number', 'bool'])
    .corrwith(cross_task_prompts['logits/range_mean'])
    .dropna()
)

# Leave out the other `logits/` columns (including the reference column itself).
corr = corr.drop([c for c in corr.index.values if 'logits/' in c])

corr.sort_values()

# Text vs No Text

In [None]:
# Reload the CTBase / CTNoText rows and reduce them to one median row per
# (run_name, group, prompt_task, prompt_id).
diff_added_text = pd.read_csv(data_path.joinpath('diff.csv'))
diff_added_text = diff_added_text[diff_added_text['run_name'].isin(["CTBase","CTNoText"])]
# `numeric_only=True`: pandas >= 2.0 raises on any text column that is not a
# grouping key instead of dropping it (presumably `name` here — verify against the CSV).
diff_added_text = (
    diff_added_text
    .groupby(["run_name", "group", 'prompt_task', 'prompt_id'])
    .median(numeric_only=True)
)

In [None]:
# Element-wise CTBase minus CTNoText; for accuracy and f1 the difference is
# then expressed as a percentage of the CTNoText value.
no_text_scores = diff_added_text.loc['CTNoText']
difference = diff_added_text.loc["CTBase"] - no_text_scores
difference = (
    difference
    .assign(
        accuracy=difference['accuracy'] / no_text_scores['accuracy'] * 100,
        f1=difference['f1'] / no_text_scores['f1'] * 100,
    )
    # A zero CTNoText score yields +/-inf; treat those as missing.
    .replace([np.inf, -np.inf], np.nan)
    .drop(["choices_in_prompt", "training_task"], axis=1)
)

In [None]:
# Describe every column per (group, prompt_task) pair, then average each
# summary statistic across those pairs. `prompt_id` becomes an ordinary column
# after `reset_index()`, so it is summarised as well.
difference.reset_index().groupby(["group","prompt_task"]).describe().mean()

# Choices in Prompt

In [None]:
# Rows flagged `choices_in_prompt` vs. the rest.
with_choices = cross_task.loc[cross_task['choices_in_prompt']].copy()
no_choices = cross_task.loc[~cross_task['choices_in_prompt']].copy()

# Median rank per metric for each of the two subsets.
for header, subset in (
    (f"With Choices:", with_choices),
    (f"No Choices:", no_choices),
):
    print(header)
    print(f"\tF1 Rank:{subset['f1_rank'].median():0.2f}")
    print(f"\tAccuracy Rank:{subset['accuracy_rank'].median():0.2f}")

In [None]:
# Correlation of every per-prompt column with `prompt_tokens`.
# Only numeric/bool columns are passed in: on pandas >= 2.0 `corrwith` no
# longer skips text columns such as the grouping keys restored by `reset_index()`.
corr = (
    cross_task_prompts
    .select_dtypes(include=['number', 'bool'])
    .corrwith(cross_task_prompts['prompt_tokens'])
    .dropna()
)
# Drop the self-correlation entry.
corr = corr.drop(["prompt_tokens"])
corr.sort_values()

In [None]:
# Correlation of every per-prompt column with `is_mcq`.
# Only numeric/bool columns are passed in: on pandas >= 2.0 `corrwith` no
# longer skips text columns such as the grouping keys restored by `reset_index()`.
corr = (
    cross_task_prompts
    .select_dtypes(include=['number', 'bool'])
    .corrwith(cross_task_prompts['is_mcq'])
    .dropna()
)
# Drop the self-correlation entry.
corr = corr.drop(["is_mcq"])
corr.sort_values()