In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import re

# Raise pandas' display limits (row count, column count, line width) for this session.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 150),
):
    pd.set_option(option_name, option_value)

import matplotlib.pyplot as plt

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# %matplotlib inline
# The project root is taken to be the parent of the current working directory;
# the CSV inputs used below are read from its `data/` sub-directory.
PROJECT_ROOT = Path.cwd().parent
data_path = PROJECT_ROOT / 'data'
print(PROJECT_ROOT)

In [None]:
# Baseline results table; later cells filter it by `run_name` and `group`.
data = pd.read_csv(data_path / 'baselines.csv')

In [None]:
# --- Accuracy on the two new tasks (AQuA, CraigslistBargains): T5 vs. T0 ---------
# Each subplot shows one bar per run: bar height is the median accuracy and the
# asymmetric error bar spans the 25th-75th percentile.

# Neither of these tables is plotted in this cell; they are still (re)loaded and
# patched here so the notebook-level names end up in the same state as before.
gpt_df = pd.read_csv(PROJECT_ROOT.joinpath('data', 'GPT3_align.csv'))
gpt_df.loc[gpt_df['name'] == 'Copa', 'name'] = 'COPA'
# NOTE(review): this overwrites every WiC `val` with 1 -- confirm this is intended.
gpt_df.loc[gpt_df['name'] == 'WiC', 'val'] = 1

t0_df = pd.read_csv(PROJECT_ROOT.joinpath('data', 't0_data.csv'))

# task category -> dataset `group` value in `data` -> subplot settings.
# 't0_name' is not used by this cell.
Boxplot_Datasets = {
    'Baselines-Accuracy': {
        'AQuA[validation]': {
            'display name': 'AQuA',
            'range': [0, 40],
            't0_name': 'super_glue_rte'
        },
        'craigslist_bargains[validation]': {
            'display name': 'CraigslistBargains',
            'range': [0, 60],
            't0_name': 'super_glue_wic'
        }
    },
}

# chart name -> `run_name` value in `data` -> legend label and bar colour.
chart_dict = {
    'Figure-1': {
        'T5': {
            'display name': 'T5 (3B)',
            'marker color': '#FFA500',
        },
        'T0': {
            'display name': 'T0 (3B)',
            'marker color': '#5bbd4a',
        }
    },
}

# Not used by this cell; kept so the notebook-level names are unchanged.
valid_metrics = ['f1', 'em', 'accuracy']
figure_object = {}

for task_category, datasets in Boxplot_Datasets.items():
    for chart_name, runs in chart_dict.items():
        chart = list(runs)
        chart_df = data[data['run_name'].isin(chart)]
        subplot_datasets = list(datasets)

        # One row, one column per dataset (two columns for the datasets above).
        fig = make_subplots(
            rows=1,
            cols=max(1, len(subplot_datasets)),
            subplot_titles=[datasets[name]['display name'] for name in subplot_datasets],
        )
        # Subplot titles are annotations: style them once instead of once per trace.
        fig.update_annotations(font_size=15, yshift=20)

        # Each run gets exactly one legend entry, attached to its first bar.
        legend_seen = set()

        for idx, dataset_name in enumerate(subplot_datasets):
            col = idx + 1
            fig.update_yaxes(range=datasets[dataset_name]['range'], row=1, col=col)
            plot_data = chart_df[chart_df['group'] == dataset_name]

            for run in chart:
                scores = plot_data.loc[plot_data['run_name'] == run, 'accuracy']
                if scores.empty:
                    # Previously this case died with a bare IndexError.
                    print('No rows for run {!r} on {!r}; bar skipped.'.format(run, dataset_name))
                    continue

                q25, median, q75 = scores.quantile([0.25, 0.5, 0.75])
                label = runs[run]['display name']
                fig.add_trace(
                    go.Bar(
                        x=[run],
                        y=[median],
                        error_y=dict(
                            type='data',
                            symmetric=False,
                            array=[q75 - median],
                            arrayminus=[median - q25],
                        ),
                        showlegend=label not in legend_seen,
                        name=label,
                        marker_color=runs[run]['marker color'],
                    ),
                    row=1, col=col,
                )
                legend_seen.add(label)

        fig.update_layout(
            title_x=0.5,
            font=dict(size=16),
            template="plotly_white",
            legend=dict(orientation='h', xanchor="center", x=0.5, bgcolor="rgba(0,0,0,0)"),
            width=500,
            height=300,
        )
        fig.update_xaxes(showticklabels=False)  # hide all the xticks
        # Fixed file name (the former `.format(...)` call had no placeholders to fill).
        fig.write_image('acc_new_tasks.png')
        fig.show()

In [None]:
# --- Accuracy on all eight datasets: T5 / T0 (3B), the 'xxl' runs, and GPT3 ------
# GPT3 bars are single values taken from GPT3_align.csv. Every other bar is the
# median score of a run, with an asymmetric error bar spanning the 25th-75th
# percentile.

gpt_df = pd.read_csv(PROJECT_ROOT.joinpath('data', 'GPT3_align.csv'))
gpt_df.loc[gpt_df['name'] == 'Copa', 'name'] = 'COPA'
# NOTE(review): this overwrites every WiC `val` with 1 -- confirm this is intended.
gpt_df.loc[gpt_df['name'] == 'WiC', 'val'] = 1

t0_df = pd.read_csv(PROJECT_ROOT.joinpath('data', 't0_data.csv'))

# task category -> dataset `group` value in `data` -> subplot settings.
#   'display name': subplot title, also matched against `gpt_df['name']`
#   'range':        y-axis range of the subplot
#   't0_name':      matching `dataset_name` value in `t0_df`
Boxplot_Datasets = {
    'Baselines-Accuracy': {
        'anli[dev_r1]': {
            'display name': 'ANLI R1',
            'range': [0, 50],
            't0_name': 'anli_r1'
        },
        'anli[dev_r2]': {
            'display name': 'ANLI R2',
            'range': [0, 50],
            't0_name': 'anli_r2'
        },
        'anli[dev_r3]': {
            'display name': 'ANLI R3',
            'range': [0, 50],
            't0_name': 'anli_r3'
        },
        'AQuA[validation]': {
            'display name': 'AQuA',
            'range': [0, 40],
            't0_name': 'super_glue_rte'
        },
        'CommitmentBank[validation]': {
            'display name': 'CB',
            'range': [0, 100],
            't0_name': 'super_glue_cb'
        },
        'craigslist_bargains[validation]': {
            'display name': 'CraigslistBargains',
            'range': [0, 60],
            't0_name': 'super_glue_wic'
        },
        'RecognizingTextualEntailment[validation]': {
            'display name': 'RTE',
            'range': [0, 100],
            't0_name': 'super_glue_rte'
        },
        'WordsinContext[validation]': {
            'display name': 'WiC',
            'range': [0, 80],
            't0_name': 'super_glue_wic'
        },
    },
}

# chart name -> run identifier -> legend label and bar colour. Run identifiers
# are looked up in `data['run_name']`, `t0_df['runs']` ('xxl' runs) or
# `gpt_df['exp']` (GPT3 entries).
chart_dict = {
    'Figure-1': {
        'T5': {
            'display name': 'T5 (3B)',
            'marker color': '#FFA500',
        },
        'xxl-lm-d4-091621': {
            'display name': 'T5 (11B)',
            'marker color': '#b07200',
        },
        'T0': {
            'display name': 'T0 (3B)',
            'marker color': '#5bbd4a',
        },
        'xxl-lm-d4-091621-512': {
            'display name': 'T0 (11B)',
            'marker color': '#2a781d',
        },
        'GPT3 (6.7B)': {
            'display name': 'GPT3 (6.7B)',
            'marker color': '#5DADE2',
        },
        'GPT3 (13B)': {
            'display name': 'GPT3 (13B)',
            'marker color': '#2E86C1',
        },
        'GPT3 (175B)': {
            'display name': 'GPT3 (175B)',
            'marker color': '#21618C',
        },
    },
}

# Not used by this cell; kept so the notebook-level names are unchanged.
valid_metrics = ['f1', 'em', 'accuracy']
figure_object = {}

# Subplot grid width; the number of rows follows from the number of datasets.
n_cols = 4
# 'xxl' bars are not drawn for these datasets. Their 't0_name' entries repeat the
# RTE / WiC names, so presumably no T0 results exist for them -- TODO confirm.
xxl_skip_datasets = ["AQuA[validation]", "craigslist_bargains[validation]"]

for task_category, datasets in Boxplot_Datasets.items():
    for chart_name, runs in chart_dict.items():
        chart = list(runs)
        chart_df = data[data['run_name'].isin(chart)]
        t0_chart_df = t0_df[t0_df['runs'].isin(chart)]
        subplot_datasets = list(datasets)
        n_rows = max(1, (len(subplot_datasets) + n_cols - 1) // n_cols)

        fig = make_subplots(
            rows=n_rows,
            cols=n_cols,
            subplot_titles=[datasets[name]['display name'] for name in subplot_datasets],
        )
        # Subplot titles are annotations: style them once instead of once per trace.
        fig.update_annotations(font_size=15, yshift=20)

        # Each series gets exactly one legend entry, attached to its first bar.
        legend_seen = set()

        for idx, dataset_name in enumerate(subplot_datasets):
            row, col = idx // n_cols + 1, idx % n_cols + 1
            dataset_cfg = datasets[dataset_name]
            # Set the y-range once per subplot. The former GPT3-branch call used
            # `col=idx+1`, which points outside the grid for the second row.
            fig.update_yaxes(range=dataset_cfg['range'], row=row, col=col)

            # GPT3: one bar per configured experiment that reports accuracy for
            # this dataset. Experiments missing from `chart_dict` are filtered out
            # (they used to raise KeyError on the colour lookup).
            gpt_score = gpt_df[
                (gpt_df['metric'] == 'acc')
                & (gpt_df['name'] == dataset_cfg['display name'])
                & gpt_df['exp'].isin(chart)
            ]
            for exp, score in zip(gpt_score['exp'].tolist(), gpt_score['val'].tolist()):
                label = runs[exp]['display name']
                fig.add_trace(
                    go.Bar(
                        x=[exp],
                        y=[score],
                        showlegend=label not in legend_seen,
                        name=label,
                        marker_color=runs[exp]['marker color'],
                    ),
                    row=row, col=col,
                )
                legend_seen.add(label)

            plot_data = chart_df[chart_df['group'] == dataset_name]
            t0_plot_data = t0_chart_df[t0_chart_df['dataset_name'] == dataset_cfg['t0_name']]

            for run in chart:
                # Pick the score series for this run; GPT3 entries were drawn above.
                if 'xxl' in run:
                    if dataset_name in xxl_skip_datasets:
                        continue
                    scores = t0_plot_data.loc[t0_plot_data['runs'] == run, 'score']
                elif 'GPT' in run:
                    continue
                else:
                    scores = plot_data.loc[plot_data['run_name'] == run, 'accuracy']

                if scores.empty:
                    # Previously this case died with a bare IndexError.
                    print('No rows for run {!r} on {!r}; bar skipped.'.format(run, dataset_name))
                    continue

                q25, median, q75 = scores.quantile([0.25, 0.5, 0.75])
                label = runs[run]['display name']
                fig.add_trace(
                    go.Bar(
                        x=[run],
                        y=[median],
                        error_y=dict(
                            type='data',
                            symmetric=False,
                            array=[q75 - median],
                            arrayminus=[median - q25],
                        ),
                        showlegend=label not in legend_seen,
                        name=label,
                        marker_color=runs[run]['marker color'],
                    ),
                    row=row, col=col,
                )
                legend_seen.add(label)

        fig.update_layout(
            title_x=0.5,
            font=dict(size=15),
            template="plotly_white",
            legend=dict(orientation='h', xanchor="center", x=0.5, bgcolor="rgba(0,0,0,0)"),
            width=1200,
            height=500,
        )
        fig.update_xaxes(showticklabels=False)  # hide all the xticks
        fig.write_image('{}-{}.png'.format(chart_name, task_category).replace(' ', '-'))
        fig.show()

In [None]:
# --- F1 on all eight datasets: T5 vs. T0 -----------------------------------------
# Each subplot shows one bar per run: bar height is the median `f1` and the
# asymmetric error bar spans the 25th-75th percentile.

# Neither of these tables is plotted in this cell; they are still (re)loaded and
# patched here so the notebook-level names end up in the same state as before.
gpt_df = pd.read_csv(PROJECT_ROOT.joinpath('data', 'GPT3_align.csv'))
gpt_df.loc[gpt_df['name'] == 'Copa', 'name'] = 'COPA'
# NOTE(review): this overwrites every WiC `val` with 1 -- confirm this is intended.
gpt_df.loc[gpt_df['name'] == 'WiC', 'val'] = 1

t0_df = pd.read_csv(PROJECT_ROOT.joinpath('data', 't0_data.csv'))

# task category -> dataset `group` value in `data` -> subplot settings.
# The 'Baseline-F1' mapping is also read by the median-table cell further down.
# 't0_name' is not used by this cell.
Boxplot_Datasets = {
    'Baseline-F1': {
        'anli[dev_r1]': {
            'display name': 'ANLI R1',
            'range': [0, 50],
            't0_name': 'anli_r1'
        },
        'anli[dev_r2]': {
            'display name': 'ANLI R2',
            'range': [0, 50],
            't0_name': 'anli_r2'
        },
        'anli[dev_r3]': {
            'display name': 'ANLI R3',
            'range': [0, 50],
            't0_name': 'anli_r3'
        },
        'AQuA[validation]': {
            'display name': 'AQuA',
            'range': [0, 50],
            't0_name': 'super_glue_rte'
        },
        'CommitmentBank[validation]': {
            'display name': 'CB',
            'range': [0, 100],
            't0_name': 'super_glue_cb'
        },
        'craigslist_bargains[validation]': {
            'display name': 'CraigslistBargains',
            'range': [0, 60],
            't0_name': 'super_glue_wic'
        },
        'RecognizingTextualEntailment[validation]': {
            'display name': 'RTE',
            'range': [0, 100],
            't0_name': 'super_glue_rte'
        },
        'WordsinContext[validation]': {
            'display name': 'WiC',
            'range': [0, 80],
            't0_name': 'super_glue_wic'
        },
    },
}

# chart name -> `run_name` value in `data` -> legend label and bar colour.
chart_dict = {
    'Figure-1': {
        'T5': {
            'display name': 'T5 (3B)',
            'marker color': '#FFA500',
        },
        'T0': {
            'display name': 'T0 (3B)',
            'marker color': '#5bbd4a',
        },
    },
}

# Not used by this cell; kept so the notebook-level names are unchanged.
valid_metrics = ['f1', 'em', 'accuracy']
figure_object = {}

# Subplot grid width; the number of rows follows from the number of datasets.
n_cols = 4

for task_category, datasets in Boxplot_Datasets.items():
    for chart_name, runs in chart_dict.items():
        chart = list(runs)
        chart_df = data[data['run_name'].isin(chart)]
        subplot_datasets = list(datasets)
        n_rows = max(1, (len(subplot_datasets) + n_cols - 1) // n_cols)

        fig = make_subplots(
            rows=n_rows,
            cols=n_cols,
            subplot_titles=[datasets[name]['display name'] for name in subplot_datasets],
        )
        # Subplot titles are annotations: style them once instead of once per trace.
        fig.update_annotations(font_size=15, yshift=20)

        # Each run gets exactly one legend entry, attached to its first bar.
        legend_seen = set()

        for idx, dataset_name in enumerate(subplot_datasets):
            row, col = idx // n_cols + 1, idx % n_cols + 1
            fig.update_yaxes(range=datasets[dataset_name]['range'], row=row, col=col)
            plot_data = chart_df[chart_df['group'] == dataset_name]

            for run in chart:
                scores = plot_data.loc[plot_data['run_name'] == run, 'f1']
                if scores.empty:
                    # Previously this case died with a bare IndexError.
                    print('No rows for run {!r} on {!r}; bar skipped.'.format(run, dataset_name))
                    continue

                q25, median, q75 = scores.quantile([0.25, 0.5, 0.75])
                label = runs[run]['display name']
                fig.add_trace(
                    go.Bar(
                        x=[run],
                        y=[median],
                        error_y=dict(
                            type='data',
                            symmetric=False,
                            array=[q75 - median],
                            arrayminus=[median - q25],
                        ),
                        showlegend=label not in legend_seen,
                        name=label,
                        marker_color=runs[run]['marker color'],
                    ),
                    row=row, col=col,
                )
                legend_seen.add(label)

        fig.update_layout(
            title_x=0.5,
            font=dict(size=15),
            template="plotly_white",
            legend=dict(orientation='h', xanchor="center", x=0.5, bgcolor="rgba(0,0,0,0)"),
            width=1000,
            height=600,
        )
        fig.update_xaxes(showticklabels=False)  # hide all the xticks
        fig.write_image('{}-{}.png'.format(chart_name, task_category).replace(' ', '-'))
        fig.show()

In [None]:
# Median accuracy and F1 per (dataset display name, run) for the T0 and T5 runs.
# Relies on `Boxplot_Datasets['Baseline-F1']` as defined by the F1 figure cell.
display_names = {
    group: cfg['display name']
    for group, cfg in Boxplot_Datasets['Baseline-F1'].items()
}

is_baseline_run = data["run_name"].isin(["T0", "T5"])
is_excluded_group = data['group'].isin(['TheWinogradSchemaChallenge.Fixed[validation]'])
median_df = data[is_baseline_run & ~is_excluded_group].copy()

# A group missing from the mapping raises KeyError rather than being dropped silently.
median_df['group_name'] = median_df['group'].apply(lambda group: display_names[group])

# Select the metric columns *before* aggregating: with pandas >= 2.0 the groupby
# median no longer drops non-numeric columns (e.g. the string `group` column) and
# raises TypeError instead.
median_df = median_df.groupby(["group_name", "run_name"])[['accuracy', 'f1']].median()


In [None]:
# Render the median table as LaTeX, formatting floats with two decimals, and print it.
latex_table = median_df.to_latex(float_format="{:0.2f}".format)
print(latex_table)

In [None]:
# Last expression of the cell: shows the median table as the cell's output.
median_df