In [None]:
# Copyright 2020 Google LLC.                                                                                                                                   
#                                                                                                                                                              
# Licensed under the Apache License, Version 2.0 (the "License");                                                                                              
# you may not use this file except in compliance with the License.                                                                                             
# You may obtain a copy of the License at                                                                                                                      
#                                                                                                                                                              
#     http://www.apache.org/licenses/LICENSE-2.0                                                                                                               
#                                                                                                                                                              
# Unless required by applicable law or agreed to in writing, software                                                                                          
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                            
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.                                                                                     
# See the License for the specific language governing permissions and                                                                                          
# limitations under the License. 

In [None]:
import os,sys
import pandas as pd
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import tensorflow as tf
from tensorflow.python.summary.summary_iterator import summary_iterator
from scipy import stats

import seaborn as sns
sns.set()

# Define Helper functions

## Extract tf board data

In [None]:
# Extraction function
def sum_log(path,config,model,runlog):
    """Read one TensorBoard event file and add its scalar series to ``runlog``.

    Every distinct summary tag found in the file becomes one row with the
    columns ``Model``, ``config``, ``Metric`` (the tag), ``Values`` (list of
    ``simple_value`` readings) and ``Steps`` (list of the matching event steps).

    Args:
        path: Path of the event file handed to ``summary_iterator``.
        config: Value stored in the ``config`` column of every new row.
        model: Value stored in the ``Model`` column of every new row.
        runlog: DataFrame the new rows are appended to; it is not modified.

    Returns:
        A new DataFrame holding the rows of ``runlog`` followed by one row per
        tag, re-indexed from 0. ``runlog`` itself is returned when the file
        contains no summary values.
    """
    series = {}
    for event in summary_iterator(path):
        for value in event.summary.value:
            if value.tag not in series:
                series[value.tag] = {'Model': model, 'config': config, 'Metric': value.tag, 'Values': [], 'Steps': []}
            series[value.tag]['Values'].append(value.simple_value)
            series[value.tag]['Steps'].append(event.step)

    if not series:
        return runlog

    # DataFrame.append was removed in pandas 2.0; add all rows with a single concat.
    new_rows = pd.DataFrame(list(series.values()))
    return pd.concat([runlog, new_rows], ignore_index=True)

# load tf logs
def log_tflogs_from_path(tfboardroot, trim_config=None, trim_model=None):
    """Load every run found under ``tfboardroot/<model>/<config>/log``.

    For each model directory and each config directory inside it, the first
    entry of ``<config>/log`` whose name starts with ``events`` (in
    ``os.listdir`` order) is read with ``sum_log``.

    Args:
        tfboardroot: Root directory containing one sub-directory per model.
        trim_config: Optional callable applied to every value of the
            ``config`` column after loading.
        trim_model: Optional callable applied to every value of the ``Model``
            column after loading.

    Returns:
        ``(all_log, modellist, configlist)``: the combined DataFrame and the
        sets of model / config names. When a trim callable is given, the
        corresponding set is rebuilt from the trimmed column.

    Raises:
        FileNotFoundError: If a ``log`` directory contains no ``events*`` file.
    """
    all_log = pd.DataFrame(columns=['Model', 'config', 'Metric', 'Values', 'Steps'])

    modellist = set()
    configlist = set()

    for m in tqdm(os.listdir(tfboardroot)):
        modelroot = os.path.join(tfboardroot,m)
        modellist.add(m)
        for config in os.listdir(modelroot):
            configlist.add(config)
            logdir = os.path.join(modelroot,config,'log')
            event_files = [f for f in os.listdir(logdir) if f.startswith('events')]
            if not event_files:
                # Previously the directory path itself was passed on to the event reader.
                raise FileNotFoundError('no events file found in {}'.format(logdir))

            all_log = sum_log(os.path.join(logdir,event_files[0]),config,m,all_log)

    if trim_config is not None:
        all_log['config'] = all_log['config'].apply(trim_config)
        configlist = set(all_log['config'])
    if trim_model is not None:
        all_log['Model'] = all_log['Model'].apply(trim_model)
        modellist = set(all_log['Model'])
    return all_log, modellist, configlist

## Learning Curve plot helper

In [None]:
def get_plot_df(sub_human_stats, sub_log, metric, std_scale = 1.645/10):
    """Build a long-format frame with OPE learning curves and ground-truth lines.

    Args:
        sub_human_stats: DataFrame with a ``Model`` column plus
            ``<metric>-mean`` and ``<metric>-std`` columns. It is not modified.
        sub_log: DataFrame with ``Model``, ``Values`` and ``Steps`` columns,
            where ``Values`` and ``Steps`` hold equally long lists per row.
        metric: Prefix of the ``-mean`` / ``-std`` columns to use.
        std_scale: When > 0, two extra sets of ``Truth`` rows are added at
            ``mean + std * std_scale`` and ``mean - std * std_scale``.
            NOTE(review): the original comment called this a 90% confidence
            interval; 1.645 matches that z-value, the ``/10`` presumably is
            ``sqrt(n)`` for n = 100 -- confirm.

    Returns:
        DataFrame with columns ``Model``, ``Values``, ``Steps``, ``Evaluation``.
        Rows with ``Evaluation == 'OPE'`` are the exploded ``sub_log`` series;
        rows with ``Evaluation == 'Truth'`` repeat the human statistic at every
        step of the first row of ``sub_log``.

    Raises:
        IndexError: If ``sub_log`` has no rows.
    """
    columns = ['Model','Values','Steps','Evaluation']

    # The ground-truth lines are drawn over the steps of the first logged run.
    steps = sub_log.iloc[0]['Steps']

    # Work on a copy so the caller's frame does not gain 'Steps'/'Evaluation' columns.
    truth = sub_human_stats.copy()
    truth['Steps'] = [steps] * truth.shape[0]
    truth = truth.explode('Steps').reset_index(drop=True)
    truth['Evaluation'] = 'Truth'

    # One row per (model, step) instead of one row per model holding lists.
    ope = sub_log[['Model','Values','Steps']]
    ope = ope.set_index(['Model']).apply(pd.Series.explode).reset_index()
    ope = ope.infer_objects()
    ope['Evaluation'] = 'OPE'

    mean = truth[metric+'-mean']
    frames = [ope, truth.assign(Values=mean)[columns]]
    if std_scale > 0:
        band = truth[metric+'-std'] * std_scale
        frames.append(truth.assign(Values=mean + band)[columns])
        frames.append(truth.assign(Values=mean - band)[columns])

    # DataFrame.append was removed in pandas 2.0; concat gives the same row order.
    return pd.concat(frames, ignore_index=True)

## Calculate OPE helper

In [None]:
def gather_ope_logs_by_config(
    all_log, 
    plot_metrics, 
    config, 
    last_n = 100,
    trim_metric= lambda x: 'est_reward_dual_{}_normalized'.format(x),
    final_n = None,
    human_stats_df = None
):
    """Pair each model's human score with its OPE estimate for one config.

    Args:
        all_log: DataFrame with ``Model``, ``config``, ``Metric`` and ``Values``
            columns (``Values`` holds one list of readings per row).
        plot_metrics: Metric names; each needs ``<metric>-mean`` and
            ``<metric>-std`` columns in the human statistics table.
        config: Only rows of ``all_log`` with this ``config`` value are used.
        last_n: The OPE estimate is the mean of ``Values[-last_n:]``.
        trim_metric: Maps a metric name to the ``Metric`` tag to look up.
        final_n: When given, the window becomes ``Values[-last_n:-final_n]``.
        human_stats_df: Human statistics table with a ``Model`` column. When
            omitted, the module-level ``human_stats`` is used.

    Returns:
        DataFrame indexed by ``Model`` with columns ``Human-mean``,
        ``Human-std``, ``OPE``, ``Metric`` and ``Error``
        (``|OPE - Human-mean|``), stacked over all requested metrics. Models
        without a matching log row keep ``NaN`` in ``OPE``.

    Raises:
        ValueError: If ``plot_metrics`` is empty (raised by ``pd.concat``).
    """
    if human_stats_df is None:
        human_stats_df = human_stats

    window = slice(-last_n, None if final_n is None else -final_n)

    per_metric = []
    for metric in plot_metrics:
        plot_log = human_stats_df[['Model', metric+'-mean', metric+'-std']]
        plot_log = plot_log.rename(columns={metric+'-mean': 'Human-mean', metric+'-std': 'Human-std'})
        plot_log = plot_log.set_index('Model')

        mask = (all_log['config']==config) & (all_log['Metric']==trim_metric(metric))
        # Copy before adding a column so the caller's log frame is never written to.
        ope = all_log.loc[mask, ['Model','Values']].copy()
        ope['OPE'] = ope['Values'].apply(lambda x: np.array(x[window]).mean()).astype(float)
        ope = ope.set_index('Model')[['OPE']]

        plot_log = plot_log.join(ope)
        plot_log['Metric'] = metric
        per_metric.append(plot_log)

    # DataFrame.append was removed in pandas 2.0.
    all_plotlog = pd.concat(per_metric)
    all_plotlog['Error'] = (all_plotlog['OPE']-all_plotlog['Human-mean']).abs()

    return all_plotlog

# AirDialogue Rule Based

In [None]:
# Locations for the rule-based AirDialogue experiment: where figures are written
# and where the TensorBoard logs are read from.
plotroot="outputs/plotdir/air_ope/rule/"
os.makedirs(plotroot,exist_ok=True)
tfboardroot = 'outputs/syn_air_ope/syn_ope_data_500'
default_config = 'epoch_500'

# Ground-truth reward statistics, one entry per model L0..L5.
human_stats = pd.DataFrame(
    data={
        'Model': ['L0', 'L1', 'L2', 'L3', 'L4', 'L5'],
        'reward-mean': [0.4928, 0.5914, 0.6917, 0.7958, 0.8973, 1.0000],
        'reward-std': [0.3292, 0.3589, 0.3574, 0.3261, 0.2525, 0.0001],
    }
)

In [None]:
human_stats

In [None]:
# Load all rule-based runs. Config names are reduced to 'epoch_<N>' (the token
# following the last 'epoch_'), and the 'tgt_' substring is removed from model names.
all_log, modellist, configlist = log_tflogs_from_path(tfboardroot, 
                                                      trim_config = lambda x: 'epoch_'+x.split('epoch_')[-1].split('_')[0],
                                                      trim_model = lambda x: x.replace('tgt_', ''))

print(modellist)
print(configlist)

In [None]:
all_log

## Learning Curve

In [None]:
# Keep only the default config and the normalized dual reward estimate.
sub_log = all_log[all_log['config']==default_config]
sub_log = sub_log[sub_log['Metric']=='est_reward_dual_normalized']

# Pass a copy so the module-level human_stats table can never be modified by the
# helper; later cells read human_stats again.
plot_log = get_plot_df(human_stats.copy(), sub_log, 'reward', std_scale=0)

In [None]:
plot_log

In [None]:
# Sort by model name before plotting.
plot_log = plot_log.sort_values(by='Model')
# One line per model (colour); line style distinguishes 'OPE' from 'Truth'.
g = sns.lineplot(x='Steps', y='Values', hue='Model', style='Evaluation', data=plot_log)
# Put the legend outside the axes on the right and shrink the axes to make room for it.
g.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), ncol=1)
g.set_position([0.15,0.15,0.6,0.8])
g.figure.savefig(plotroot+'learning_curve.pdf')

## Human vs OPE

In [None]:
# The rule-based logs use one fixed tag, so trim_metric ignores the metric name.
all_plotlog = gather_ope_logs_by_config(all_log, ['reward'], default_config, trim_metric=lambda x:'est_reward_dual_normalized')

In [None]:
all_plotlog

In [None]:
def r2(x, y):
    """Return the squared Pearson correlation between ``x`` and ``y``."""
    return stats.pearsonr(x, y)[0] ** 2

all_plotlog = all_plotlog.rename(columns={'Human-mean': 'Reward', 'OPE': 'OPE'})
# Newer seaborn releases accept neither positional x/y nor `stat_func`; pass the
# columns by keyword and write the statistic onto the joint axes instead.
g = sns.jointplot(x='Reward', y='OPE', kind="reg", data=all_plotlog,
                  xlim=[0.4,1.1], ylim=[0.4,1.1])
paired = all_plotlog[['Reward', 'OPE']].dropna()
g.ax_joint.annotate('r2 = {:.2g}'.format(r2(paired['Reward'], paired['OPE'])),
                    xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
g.savefig(plotroot+'ope_vs_human.pdf')

# AirDialogue Model-Model

In [None]:
# Output directory for the AirDialogue model-vs-model figures.
plotroot="outputs/plotdir/air_ope/model/"
os.makedirs(plotroot,exist_ok=True)

# load stats
statspath = 'data/selfplay_opedata/orig/stats.csv'

# The CSV's 'model_name' column becomes the 'Model' index.
human_stats = pd.read_csv(statspath)
human_stats = human_stats.rename(columns={'model_name': 'Model'}).set_index('Model')

# Hard-coded perplexity and BLEU per model; the second row of each list holds
# the values for the '*_w' model names.
human_stats_auto = {
    'Model': ['5K','10K','20K','30K','40K','50K','75K','100K','150K','200K','250K','full',
              '5K_w','10K_w','20K_w','30K_w','40K_w','50K_w','75K_w','100K_w','150K_w','200K_w','250K_w','full_w'], 
    'ppl': [2.671, 2.368, 2.141, 2.053, 2.121, 1.919, 1.843, 2.084, 2.021, 2.076, 1.949, 1.954,
            2.673, 2.369, 2.141, 2.053, 2.121, 1.919, 1.843, 2.084, 2.021, 2.076, 1.949, 1.954], 
    'BLEU': [12.53, 19.07, 23.72, 24.49, 19.64, 29.98, 31.70, 20.10, 21.95, 20.24, 26.04, 25.41,
             12.53, 19.07, 23.71, 24.49, 19.64, 29.98, 31.70, 20.11, 21.95, 20.23, 26.04, 25.41], 
}
human_stats_auto = pd.DataFrame(data=human_stats_auto).set_index('Model')
# Inner join on the Model index: only models present in both tables are kept,
# then 'Model' is turned back into a regular column.
human_stats = pd.concat([human_stats_auto, human_stats], axis=1, join='inner').reset_index()

metriclist = ['flight_score', 'reward', 'status_score']
print(metriclist)

In [None]:
human_stats

In [None]:
# Log root for the self-play experiment and the config directory name whose
# runs are analysed below.
tfboardroot = "outputs/selfplay_air_ope_all"
default_config = 'roberta-base_fix_false_share_true_freeze_true_epoch_300_invsqrt_adam_lr_2e-4_C_1_Q_2_L_10x100_BERT_1_warmup_30_mom_0.5_MAXNORM_1_WD_1e-4_BS_20x1_Linit_0_alphaR_0_C_1_Q_0_L_0_A_0_regfunC_square_Q_abs_cut20_L_square_actC_square_Q_no_tag__seed_0'
all_log, modellist, configlist = log_tflogs_from_path(tfboardroot)

print(modellist)
print(configlist)

In [None]:
all_log

## Learning Curve

In [None]:
plot_metrics = metriclist

num_models = human_stats.shape[0]
print('num_models: ', num_models)

# Row positions (after sorting by the human score) of the models to plot, and
# the display names they receive. NOTE(review): position 23 assumes at least 24
# models in human_stats.
selected_positions = [0,6,12,23]
display_names = ['Model 0%', 'Model 25%', 'Model 50%', 'Model 100%']

metric_frames = []
for metric in plot_metrics:
    # Rank models by their human score and pick the four reference models.
    sub_human_stats = human_stats[['Model', metric+'-mean', metric+'-std']]
    sub_human_stats = sub_human_stats.sort_values(by=[metric+'-mean', metric+'-std'])
    sub_human_stats = sub_human_stats.iloc[selected_positions].copy()
    selected_models = list(sub_human_stats['Model'])

    # Logs of the default config for this metric, restricted to the selected
    # models. The copy avoids writing into a slice of all_log below.
    sub_log = all_log.loc[all_log['config']==default_config]
    sub_log = sub_log.loc[sub_log['Metric']=='est_reward_dual_{}_normalized'.format(metric)]
    sub_log = sub_log.set_index('Model', drop=False).loc[selected_models].copy()

    # change model name
    model_dict = dict(zip(selected_models, display_names))
    sub_log['Model'] = sub_log['Model'].map(model_dict)
    sub_human_stats['Model'] = sub_human_stats['Model'].map(model_dict)

    plot_log = get_plot_df(sub_human_stats, sub_log, metric)
    plot_log['Metric'] = metric
    metric_frames.append(plot_log)

# DataFrame.append was removed in pandas 2.0; stack all metrics with one concat.
all_plotlog = pd.concat(metric_frames) if metric_frames else None


In [None]:
# One facet per metric, wrapped after three columns; sharey=False gives every facet its own y-axis.
g = sns.FacetGrid(all_plotlog, col='Metric', col_wrap=3,
                  height=3, aspect=1.5,sharey=False)
def mylineplot(x, y, h, s, **kwargs):
    """Adapter for ``FacetGrid.map``: forward the four columns to ``sns.lineplot`` as x, y, hue and style."""
    sns.lineplot(x=x, y=y, hue=h, style=s, **kwargs)
# NOTE(review): `ci` is deprecated in recent seaborn releases in favour of
# `errorbar` -- confirm against the installed version.
g.map(mylineplot, 
      'Steps', 'Values','Model','Evaluation',
       ci='sd')
g.add_legend();
g.savefig(plotroot+'learning_curve.pdf')

## Human vs. OPE

In [None]:
# Human score vs. OPE estimate per model and metric, joined on the Model index
# with the BLEU / perplexity columns.
all_plotlog = gather_ope_logs_by_config(all_log, metriclist, default_config)
bleu_ppl_stats = human_stats[['Model', 'BLEU', 'ppl']].set_index('Model')
all_plotlog = all_plotlog.join(bleu_ppl_stats)
all_plotlog

In [None]:
# OPE estimate against the human score: one regression facet per metric, with independent axes.
g = sns.lmplot(x="Human-mean", y='OPE', col='Metric', hue='Metric',
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.savefig(plotroot+'ope_vs_human.pdf')

In [None]:
for m in all_plotlog.Metric.unique():
    # Models without an OPE estimate carry NaN; drop incomplete pairs first.
    paired = all_plotlog.loc[all_plotlog.Metric == m, ['Human-mean', 'OPE']].dropna()

    # The printed statistic is Pearson's r (not squared), so it is labelled as such.
    print('{} Pearson r: {:.4f}'.format(m, stats.pearsonr(paired['Human-mean'], paired['OPE'])[0] ))

In [None]:
# 
def r2(x, y):
    """Return the squared Pearson correlation between ``x`` and ``y``."""
    return stats.pearsonr(x, y)[0] ** 2

# Newer seaborn releases accept neither positional x/y nor `stat_func`; pass the
# columns by keyword and write the statistic onto the joint axes instead.
g = sns.jointplot(x='Human-mean', y='OPE', kind="reg", data=all_plotlog)
paired = all_plotlog[['Human-mean', 'OPE']].dropna()
g.ax_joint.annotate('r2 = {:.2g}'.format(r2(paired['Human-mean'], paired['OPE'])),
                    xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
g.savefig(plotroot+'ope_vs_human_all.pdf')

## vs. BLEU PPL

In [None]:
# (column compared with the human score, output file name, label of the printed line)
for y_col, pdf_name, label in [('BLEU', 'bleu_vs_human.pdf', 'BLEU'),
                               ('ppl', 'ppl_vs_human.pdf', 'ppl')]:
    g = sns.lmplot(x="Human-mean", y=y_col, col='Metric', hue='Metric',
                   data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
    g.savefig(plotroot+pdf_name)
    for m in all_plotlog.Metric.unique():
        paired = all_plotlog.loc[all_plotlog.Metric == m, ['Human-mean', y_col]].dropna()
        # The printed statistic is Pearson's r (not squared), so it is labelled as such.
        print('{} {} Pearson r: {:.4f}'.format(label, m, stats.pearsonr(paired['Human-mean'], paired[y_col])[0] ))

## Error Analysis

In [None]:
# Absolute OPE error against the human score (quadratic fit), one facet per metric.
g = sns.lmplot(x="Human-mean", y='Error', col='Metric', hue='Metric', order=2,
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.set(ylim=(-0.008, None))
# Pair every facet with its own metric through g.col_names: groupby('Metric')
# yields groups in sorted order, which need not match the facet order.
for ax, metric_name in zip(g.axes.flat, g.col_names):
    subdata = all_plotlog[all_plotlog['Metric'] == metric_name]
    # Overlay the distribution of human scores on a secondary, unlabelled y-axis.
    ax2=ax.twinx()
    sns.distplot(subdata["Human-mean"], ax=ax2,color='#95a5a6')
    plt.setp(ax2.get_yticklabels(), visible=False)
    plt.setp(ax2.get_yticklines(), visible=False)
    ax.patch.set_visible(True)

g.savefig(plotroot+'error_analysis.pdf')

# AirDialogue Model-Human

In [None]:
# Output directory for the AirDialogue model-vs-human figures.
plotroot="outputs/plotdir/air_ope/human/"
os.makedirs(plotroot,exist_ok=True)

# load stats
statspath = 'data/human_opedata/orig/stats.csv'

human_stats = pd.read_csv(statspath)
human_stats = human_stats.rename(columns={'model_name': 'Model'}).set_index('Model')

# load SP stats
# Every self-play column gets an 'SP-' prefix so it can sit next to the
# human-evaluation column of the same name.
statspath = 'data/selfplay_opedata/orig/stats.csv'
human_stats_sp = pd.read_csv(statspath).add_prefix('SP-')
human_stats_sp = human_stats_sp.rename(columns={'SP-model_name': 'Model'}).set_index('Model')
human_stats = pd.concat([human_stats, human_stats_sp], axis=1, join='inner')

# Hard-coded perplexity and BLEU per model; the second row of each list holds
# the values for the '*_w' model names.
human_stats_auto = {
    'Model': ['5K','10K','20K','30K','40K','50K','75K','100K','150K','200K','250K','full',
              '5K_w','10K_w','20K_w','30K_w','40K_w','50K_w','75K_w','100K_w','150K_w','200K_w','250K_w','full_w'], 
    'ppl': [2.671, 2.368, 2.141, 2.053, 2.121, 1.919, 1.843, 2.084, 2.021, 2.076, 1.949, 1.954,
            2.673, 2.369, 2.141, 2.053, 2.121, 1.919, 1.843, 2.084, 2.021, 2.076, 1.949, 1.954], 
    'BLEU': [12.53, 19.07, 23.72, 24.49, 19.64, 29.98, 31.70, 20.10, 21.95, 20.24, 26.04, 25.41,
             12.53, 19.07, 23.71, 24.49, 19.64, 29.98, 31.70, 20.11, 21.95, 20.23, 26.04, 25.41], 
}
human_stats_auto = pd.DataFrame(data=human_stats_auto).set_index('Model')
# Inner join on the Model index, then 'Model' becomes a regular column again.
human_stats = pd.concat([human_stats_auto, human_stats], axis=1, join='inner').reset_index()


metriclist = ['flight_score', 'reward', 'status_score']

In [None]:
human_stats

In [None]:
# Log root for the human-evaluation experiment and the config directory name
# whose runs are analysed below.
tfboardroot = "outputs/human_air_ope_all"
default_config = 'roberta-base_fix_false_share_true_freeze_true_epoch_500_invsqrt_adam_lr_1.5e-4_C_1_Q_2_L_10x100_BERT_1_warmup_30_mom_0.5_MAXNORM_1_WD_1e-4_BS_20x1_Linit_0_alphaR_0_C_1_Q_0_L_0_A_0_regfunC_square_Q_abs_cut20_L_square_actC_square_Q_no_tag__seed_0'
all_log, modellist, configlist = log_tflogs_from_path(tfboardroot)

print(modellist)
print(configlist)
all_log

## Learning Curve

In [None]:
plot_metrics = metriclist

num_models = human_stats.shape[0]
print('num_models: ', num_models)

# Row positions (after sorting by the human score) of the models to plot, and
# the display names they receive. NOTE(review): position 23 assumes at least 24
# models in human_stats.
selected_positions = [0,6,12,23]
display_names = ['Model 0%', 'Model 25%', 'Model 50%', 'Model 100%']

metric_frames = []
for metric in plot_metrics:
    # Rank models by their human score and pick the four reference models.
    sub_human_stats = human_stats[['Model', metric+'-mean', metric+'-std']]
    sub_human_stats = sub_human_stats.sort_values(by=[metric+'-mean', metric+'-std'])
    sub_human_stats = sub_human_stats.iloc[selected_positions].copy()
    selected_models = list(sub_human_stats['Model'])

    # Logs of the default config for this metric, restricted to the selected
    # models. The copy avoids writing into a slice of all_log below.
    sub_log = all_log.loc[all_log['config']==default_config]
    sub_log = sub_log.loc[sub_log['Metric']=='est_reward_dual_{}_normalized'.format(metric)]
    sub_log = sub_log.set_index('Model', drop=False).loc[selected_models].copy()

    # change model name
    model_dict = dict(zip(selected_models, display_names))
    sub_log['Model'] = sub_log['Model'].map(model_dict)
    sub_human_stats['Model'] = sub_human_stats['Model'].map(model_dict)

    plot_log = get_plot_df(sub_human_stats, sub_log, metric)
    plot_log['Metric'] = metric
    metric_frames.append(plot_log)

# DataFrame.append was removed in pandas 2.0; stack all metrics with one concat.
all_plotlog = pd.concat(metric_frames) if metric_frames else None


In [None]:
# One facet per metric, wrapped after three columns; sharey=False gives every facet its own y-axis.
g = sns.FacetGrid(all_plotlog, col='Metric', col_wrap=3,
                  height=3, aspect=1.5,sharey=False)
def mylineplot(x, y, h, s, **kwargs):
    """Adapter for ``FacetGrid.map``: forward the four columns to ``sns.lineplot`` as x, y, hue and style."""
    sns.lineplot(x=x, y=y, hue=h, style=s, **kwargs)
# NOTE(review): `ci` is deprecated in recent seaborn releases in favour of
# `errorbar` -- confirm against the installed version.
g.map(mylineplot, 
      'Steps', 'Values','Model','Evaluation',
       ci='sd')
g.add_legend();
g.savefig(plotroot+'learning_curve.pdf')

## Human vs. OPE

In [None]:
# Human score vs. OPE estimate per model and metric, plus BLEU / perplexity.
all_plotlog = gather_ope_logs_by_config(all_log, metriclist, default_config)
bleu_ppl_stats = human_stats[['Model', 'BLEU', 'ppl']].set_index('Model')
all_plotlog = all_plotlog.join(bleu_ppl_stats)

# Self-play statistics reshaped to one row per (model, metric), mirroring all_plotlog.
sp_frames = []
for m in metriclist:
    sp_log = human_stats[['Model', 'SP-'+m+'-mean', 'SP-'+m+'-std']].set_index('Model')
    sp_log = sp_log.rename(columns = {'SP-'+m+'-mean': 'SP-mean', 'SP-'+m+'-std': 'SP-std'})
    sp_log['Metric'] = m
    sp_frames.append(sp_log)
# DataFrame.append was removed in pandas 2.0.
all_splog = pd.concat(sp_frames)

# Join on (Model, Metric), then flatten the index back into columns.
all_plotlog = all_plotlog.reset_index().set_index(['Model', 'Metric'])
all_splog = all_splog.reset_index().set_index(['Model', 'Metric'])
all_plotlog = all_plotlog.join(all_splog).reset_index()
all_plotlog

In [None]:
# OPE estimate against the human score: one regression facet per metric, with independent axes.
g = sns.lmplot(x="Human-mean", y='OPE', col='Metric', hue='Metric',
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.savefig(plotroot+'ope_vs_human.pdf')

In [None]:
for m in all_plotlog.Metric.unique():
    # Models without an OPE estimate carry NaN; drop incomplete pairs first.
    paired = all_plotlog.loc[all_plotlog.Metric == m, ['Human-mean', 'OPE']].dropna()

    # The printed statistic is Pearson's r (not squared), so it is labelled as such.
    print('{} Pearson r: {:.4f}'.format(m, stats.pearsonr(paired['Human-mean'], paired['OPE'])[0] ))

In [None]:
# 
def r2(x, y):
    """Return the squared Pearson correlation between ``x`` and ``y``."""
    return stats.pearsonr(x, y)[0] ** 2

# Newer seaborn releases accept neither positional x/y nor `stat_func`; pass the
# columns by keyword and write the statistic onto the joint axes instead.
g = sns.jointplot(x='Human-mean', y='OPE', kind="reg", data=all_plotlog)
paired = all_plotlog[['Human-mean', 'OPE']].dropna()
g.ax_joint.annotate('r2 = {:.2g}'.format(r2(paired['Human-mean'], paired['OPE'])),
                    xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
g.savefig(plotroot+'ope_vs_human_all.pdf')

## vs. BLEU PPL

In [None]:
# (column compared with the human score, output file name, label of the printed line)
for y_col, pdf_name, label in [('BLEU', 'bleu_vs_human.pdf', 'BLEU'),
                               ('ppl', 'ppl_vs_human.pdf', 'ppl'),
                               ('SP-mean', 'selfplay_vs_human.pdf', 'Selfplay')]:
    g = sns.lmplot(x="Human-mean", y=y_col, col='Metric', hue='Metric',
                   data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
    g.savefig(plotroot+pdf_name)
    for m in all_plotlog.Metric.unique():
        paired = all_plotlog.loc[all_plotlog.Metric == m, ['Human-mean', y_col]].dropna()
        # The printed statistic is Pearson's r (not squared), so it is labelled as such.
        print('{} {} Pearson r: {:.4f}'.format(label, m, stats.pearsonr(paired['Human-mean'], paired[y_col])[0] ))

## vs. Selfplay flight_score > 0.50, reward > 0.65, status_score > 0.7

In [None]:
# Keep, per metric, only the models whose human score exceeds the threshold.
# NOTE(review): the section heading states flight_score > 0.50 while the code
# uses 0.51; the code value is kept -- confirm which one is intended.
top_thresholds = {'flight_score': 0.51, 'reward': 0.65, 'status_score': 0.7}
# DataFrame.append was removed in pandas 2.0; concat keeps the same row order.
sub_plotlog = pd.concat([
    all_plotlog[(all_plotlog['Metric']==metric_name) & (all_plotlog['Human-mean']>threshold)]
    for metric_name, threshold in top_thresholds.items()
])


In [None]:
# (column compared with the human score, output file name, label of the printed line)
for y_col, pdf_name, label in [('BLEU', 'bleu_vs_human_top.pdf', 'BLEU'),
                               ('ppl', 'ppl_vs_human_top.pdf', 'ppl'),
                               ('OPE', 'ope_vs_human_top.pdf', 'OPE'),
                               ('SP-mean', 'selfplay_vs_human_top.pdf', 'Selfplay')]:
    g = sns.lmplot(x="Human-mean", y=y_col, col='Metric', hue='Metric',
                   data=sub_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
    g.savefig(plotroot+pdf_name)
    for m in sub_plotlog.Metric.unique():
        paired = sub_plotlog.loc[sub_plotlog.Metric == m, ['Human-mean', y_col]].dropna()
        # The printed statistic is Pearson's r (not squared), so it is labelled as such.
        print('{} {} Pearson r: {:.4f}'.format(label, m, stats.pearsonr(paired['Human-mean'], paired[y_col])[0] ))

## Error Analysis

In [None]:
# Absolute OPE error against the human score (quadratic fit), one facet per metric.
g = sns.lmplot(x="Human-mean", y='Error', col='Metric', hue='Metric', order=2,
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.set(ylim=(-0.008, None))
# Pair every facet with its own metric through g.col_names: groupby('Metric')
# yields groups in sorted order, which need not match the facet order.
for ax, metric_name in zip(g.axes.flat, g.col_names):
    subdata = all_plotlog[all_plotlog['Metric'] == metric_name]
    # Overlay the distribution of human scores on a secondary, unlabelled y-axis.
    ax2=ax.twinx()
    sns.distplot(subdata["Human-mean"], ax=ax2,color='#95a5a6')
    plt.setp(ax2.get_yticklabels(), visible=False)
    plt.setp(ax2.get_yticklines(), visible=False)
    ax.patch.set_visible(True)

g.savefig(plotroot+'error_analysis.pdf')

# ConvAI2

In [None]:

# Alternative locations for the "hard" dataset (disabled):
# plotroot="outputs/plotdir/convai2/hard/"
# tfboardroot = 'outputs/convai2_ope_all_hard'

plotroot="outputs/plotdir/convai2/all/"
tfboardroot = 'outputs/convai2_ope_all'
default_config = 'roberta-base_fix_false_share_true_freeze_true_epoch_300_invsqrt_adam_lr_1e-4_C_1_Q_2_L_10x100_BERT_1_warmup_30_mom_0.5_MAXNORM_1_WD_1e-4_BS_20x1_Linit_-0.01_alphaR_0_C_1_Q_0_L_0_A_0_regfunC_square_Q_abs_cut20_L_square_actC_square_Q_no_tag__seed_0'

os.makedirs(plotroot,exist_ok=True)

# load stats
statspath = 'data/convai2/orig/stats.csv'

human_stats = pd.read_csv(statspath)
human_stats = human_stats.rename(columns={'model_name': 'Model'})
# Drop the row whose model name is 'human_eval'.
human_stats = human_stats.loc[human_stats['Model'] != 'human_eval']
# Metric names are taken from the columns ending in '-mean', in column order.
metriclist = [i.replace('-mean','') for i in list(human_stats.columns) if i.endswith('-mean')]
print(metriclist)

In [None]:
human_stats

In [None]:
# Load every ConvAI2 run under tfboardroot; names are kept untrimmed.
all_log, modellist, configlist = log_tflogs_from_path(tfboardroot)

print(modellist)
print(configlist)

In [None]:
all_log

## Plot Training Curves

In [None]:
plot_metrics = metriclist

num_models = human_stats.shape[0]
print('num_models: ', num_models)

# Row positions (after sorting by the human score) of the models to plot, and
# the display names they receive. NOTE(review): position 27 assumes at least 28
# models in human_stats.
selected_positions = [0,6,13,27]
display_names = ['Model 0%', 'Model 25%', 'Model 50%', 'Model 100%']

metric_frames = []
for metric in plot_metrics:
    # Rank models by their human score and pick the four reference models.
    sub_human_stats = human_stats[['Model', metric+'-mean', metric+'-std']]
    sub_human_stats = sub_human_stats.sort_values(by=[metric+'-mean', metric+'-std'])
    sub_human_stats = sub_human_stats.iloc[selected_positions].copy()
    selected_models = list(sub_human_stats['Model'])

    # Logs of the default config for this metric, restricted to the selected
    # models. The copy avoids writing into a slice of all_log below.
    sub_log = all_log.loc[all_log['config']==default_config]
    sub_log = sub_log.loc[sub_log['Metric']=='est_reward_dual_{}_normalized'.format(metric)]
    sub_log = sub_log.set_index('Model', drop=False).loc[selected_models].copy()

    # change model name
    model_dict = dict(zip(selected_models, display_names))
    sub_log['Model'] = sub_log['Model'].map(model_dict)
    sub_human_stats['Model'] = sub_human_stats['Model'].map(model_dict)

    plot_log = get_plot_df(sub_human_stats, sub_log, metric)
    plot_log['Metric'] = metric
    metric_frames.append(plot_log)

# DataFrame.append was removed in pandas 2.0; stack all metrics with one concat.
all_plotlog = pd.concat(metric_frames) if metric_frames else None


In [None]:
# One facet per metric, wrapped after five columns; sharey=False gives every facet its own y-axis.
g = sns.FacetGrid(all_plotlog, col='Metric', col_wrap=5,
                  height=3, aspect=1.5,sharey=False)
def mylineplot(x, y, h, s, **kwargs):
    """Adapter for ``FacetGrid.map``: forward the four columns to ``sns.lineplot`` as x, y, hue and style."""
    sns.lineplot(x=x, y=y, hue=h, style=s, **kwargs)
# NOTE(review): `ci` is deprecated in recent seaborn releases in favour of
# `errorbar` -- confirm against the installed version.
g.map(mylineplot, 
      'Steps', 'Values','Model','Evaluation',
       ci='sd')
g.add_legend();
g.savefig(plotroot+'learning_curve.pdf')

## Human vs OPE

In [None]:
# Human score vs. OPE estimate for every metric, using the default window (last_n=100).
all_plotlog = gather_ope_logs_by_config(all_log, metriclist, default_config)

In [None]:
# OPE estimate against the human score: one regression facet per metric, with independent axes.
g = sns.lmplot(x="Human-mean", y='OPE', col='Metric', hue='Metric',
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.savefig(plotroot+'ope_vs_human.pdf')

In [None]:
for m in all_plotlog.Metric.unique():
    # Models without an OPE estimate carry NaN; drop incomplete pairs first.
    paired = all_plotlog.loc[all_plotlog.Metric == m, ['Human-mean', 'OPE']].dropna()

    # The printed statistic is Pearson's r (not squared), so it is labelled as such.
    print('{} Pearson r: {:.4f}'.format(m, stats.pearsonr(paired['Human-mean'], paired['OPE'])[0] ))

In [None]:
# 
def r2(x, y):
    """Return the squared Pearson correlation between ``x`` and ``y``."""
    return stats.pearsonr(x, y)[0] ** 2

# Newer seaborn releases accept neither positional x/y nor `stat_func`; pass the
# columns by keyword and write the statistic onto the joint axes instead.
g = sns.jointplot(x='Human-mean', y='OPE', kind="reg", data=all_plotlog)
paired = all_plotlog[['Human-mean', 'OPE']].dropna()
g.ax_joint.annotate('r2 = {:.2g}'.format(r2(paired['Human-mean'], paired['OPE'])),
                    xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
g.savefig(plotroot+'ope_vs_human_all.pdf')

## Error Analysis

In [None]:
# Absolute OPE error against the human score (quadratic fit), one facet per metric.
g = sns.lmplot(x="Human-mean", y='Error', col='Metric', hue='Metric', order=2,
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.set(ylim=(-0.008, None))
# Pair every facet with its own metric through g.col_names: groupby('Metric')
# yields groups in sorted order, while the facets follow the order in which the
# metrics appear in the data, so zipping the two could mislabel the overlays.
for ax, metric_name in zip(g.axes.flat, g.col_names):
    subdata = all_plotlog[all_plotlog['Metric'] == metric_name]
    # Overlay the distribution of human scores on a secondary, unlabelled y-axis.
    ax2=ax.twinx()
    sns.distplot(subdata["Human-mean"], ax=ax2,color='#95a5a6')
    plt.setp(ax2.get_yticklabels(), visible=False)
    plt.setp(ax2.get_yticklines(), visible=False)
    ax.patch.set_visible(True)

g.savefig(plotroot+'error_analysis.pdf')

## vs. Auxiliary Loss

In [None]:
default_plotlog = gather_ope_logs_by_config(all_log, metriclist, default_config)
default_plotlog['config'] = 'default'
# This config name differs from default_config only in the 'A_1' token (default: 'A_0').
aux_plotlog = gather_ope_logs_by_config(all_log, metriclist, config = 'roberta-base_fix_false_share_true_freeze_true_epoch_300_invsqrt_adam_lr_1e-4_C_1_Q_2_L_10x100_BERT_1_warmup_30_mom_0.5_MAXNORM_1_WD_1e-4_BS_20x1_Linit_-0.01_alphaR_0_C_1_Q_0_L_0_A_1_regfunC_square_Q_abs_cut20_L_square_actC_square_Q_no_tag__seed_0')
aux_plotlog['config'] = 'with Aux. Loss'
# DataFrame.append was removed in pandas 2.0.
all_plotlog = pd.concat([default_plotlog, aux_plotlog])

In [None]:
# Absolute OPE error against the human score; colour separates the two configs.
g = sns.lmplot(x="Human-mean", y='Error', col='Metric', hue="config", order=2,
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.set(ylim=(-0.008, None))
g.savefig(plotroot+'error_vs_auxloss.pdf')

Auxiliary loss is useless.

## vs Data Size

In [None]:
# Runs stored under the 'half' log root; the OPE estimate averages the last 100 readings.
all_log_half, _, _ = log_tflogs_from_path('outputs/convai2_ope_all_half/')
config_half = "roberta-base_fix_false_share_true_freeze_true_epoch_300_invsqrt_adam_lr_1e-4_C_1_Q_2_L_10x100_BERT_1_warmup_30_mom_0.5_MAXNORM_1_WD_1e-4_BS_20x1_Linit_-0.01_alphaR_0_C_1_Q_0_L_0_A_0_regfunC_square_Q_abs_cut20_L_square_actC_square_Q_no_tag__seed_0"
plotlog_half = gather_ope_logs_by_config(all_log_half, metriclist, config_half, last_n=100)

In [None]:
# OPE vs. human score for the 'half' runs, fitted with seaborn's robust regression option.
g = sns.lmplot(x="Human-mean", y='OPE', col='Metric', hue='Metric',
               data=plotlog_half, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False,robust=True)
g.savefig(plotroot+'ope_vs_human_half.pdf')

In [None]:
for m in plotlog_half.Metric.unique():
    tempdf = plotlog_half[plotlog_half.Metric == m]
    # Skip models that have no OPE estimate.
    tempdf = tempdf[~tempdf.OPE.isnull()]
    X = np.array(tempdf['Human-mean'])
    y = np.array(tempdf['OPE'])

    # The printed statistic is Pearson's r (not squared), so it is labelled as such.
    print('{} Pearson r: {:.4f}'.format(m, stats.pearsonr(y, X)[0] ))

In [None]:
# Runs stored under the 'small' log root (config name says epoch_1000); the OPE
# estimate averages the last 500 readings.
all_log_small, _, _ = log_tflogs_from_path('outputs/convai2_ope_all_small/')
config_small = "roberta-base_fix_false_share_true_freeze_true_epoch_1000_invsqrt_adam_lr_1e-4_C_1_Q_2_L_10x100_BERT_1_warmup_30_mom_0.5_MAXNORM_1_WD_1e-4_BS_20x1_Linit_-0.01_alphaR_0_C_1_Q_0_L_0_A_0_regfunC_square_Q_abs_cut20_L_square_actC_square_Q_no_tag__seed_0"
plotlog_small = gather_ope_logs_by_config(all_log_small, metriclist, config_small, last_n=500)

In [None]:
# OPE vs. human score for the 'small' runs, fitted with seaborn's robust regression option.
g = sns.lmplot(x="Human-mean", y='OPE', col='Metric', hue='Metric',
               data=plotlog_small, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False,robust=True)
g.savefig(plotroot+'ope_vs_human_small.pdf')

In [None]:
for m in plotlog_small.Metric.unique():
    # Select rows with a mask built from plotlog_small itself; the mask used to
    # come from plotlog_half, a different frame.
    tempdf = plotlog_small[plotlog_small.Metric == m]
    # Skip models that have no OPE estimate.
    tempdf = tempdf[~tempdf.OPE.isnull()]
    X = np.array(tempdf['Human-mean'])
    y = np.array(tempdf['OPE'])

    # The printed statistic is Pearson's r (not squared), so it is labelled as such.
    print('{} Pearson r: {:.4f}'.format(m, stats.pearsonr(y, X)[0] ))

In [None]:
# Stack the full-data, 'half' and 'small' results, tagging each with a 'Data' label.
plotlog_100 = gather_ope_logs_by_config(all_log, metriclist, default_config)
plotlog_100['Data'] = '100%'
plotlog_50 = plotlog_half.copy()
plotlog_50['Data'] = '50%'
plotlog_10 = plotlog_small.copy()
plotlog_10['Data'] = '10%'
# DataFrame.append was removed in pandas 2.0.
all_plotlog = pd.concat([plotlog_100, plotlog_50, plotlog_10])

In [None]:
# Absolute OPE error against the human score; colour separates the 'Data' labels.
g = sns.lmplot(x="Human-mean", y='Error', col='Metric', hue='Data', order=2,
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.set(ylim=(-0.008, None))
g.savefig(plotroot+'error_vs_datasize.pdf')

## Hard Dataset

In [None]:
# From here on figures go to the 'hard' output directory; plotroot is rebound.
plotroot="outputs/plotdir/convai2/hard/"
all_log_hard, _, _ = log_tflogs_from_path('outputs/convai2_ope_all_hard/')
os.makedirs(plotroot,exist_ok=True)

In [None]:
# Same config name as the normal dataset, but averaging the last 300 readings.
plotlog_hard = gather_ope_logs_by_config(all_log_hard, metriclist, default_config, last_n=300)

In [None]:
# OPE vs. human score on the hard dataset, fitted with seaborn's robust regression option.
g = sns.lmplot(x="Human-mean", y='OPE', col='Metric', hue='Metric',
               data=plotlog_hard, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False,robust=True)
g.savefig(plotroot+'ope_vs_human.pdf')

In [None]:
for m in plotlog_hard.Metric.unique():
    tempdf = plotlog_hard[plotlog_hard.Metric == m]
    # Skip models that have no OPE estimate.
    tempdf = tempdf[~tempdf.OPE.isnull()]
    X = np.array(tempdf['Human-mean'])
    y = np.array(tempdf['OPE'])

    # The printed statistic is Pearson's r (not squared), so it is labelled as such.
    print('{} Pearson r: {:.4f}'.format(m, stats.pearsonr(y, X)[0] ))

In [None]:
def r2(x, y):
    """Return the squared Pearson correlation between ``x`` and ``y``."""
    return stats.pearsonr(x, y)[0] ** 2

# Newer seaborn releases accept neither positional x/y nor `stat_func`; pass the
# columns by keyword and write the statistic onto the joint axes instead.
g = sns.jointplot(x='Human-mean', y='OPE', kind="reg", data=plotlog_hard)
paired = plotlog_hard[['Human-mean', 'OPE']].dropna()
g.ax_joint.annotate('r2 = {:.2g}'.format(r2(paired['Human-mean'], paired['OPE'])),
                    xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
g.savefig(plotroot+'ope_vs_human_all.pdf')

In [None]:
# Absolute OPE error against the human score on the hard dataset, one facet per metric.
g = sns.lmplot(x="Human-mean", y='Error', col='Metric', hue='Metric', order=2,
               data=plotlog_hard, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.set(ylim=(-0.008, None))
# Pair every facet with its own metric through g.col_names: groupby('Metric')
# yields groups in sorted order, while the facets follow the order in which the
# metrics appear in the data, so zipping the two could mislabel the overlays.
for ax, metric_name in zip(g.axes.flat, g.col_names):
    subdata = plotlog_hard[plotlog_hard['Metric'] == metric_name]
    # Overlay the distribution of human scores on a secondary, unlabelled y-axis.
    ax2=ax.twinx()
    sns.distplot(subdata["Human-mean"], ax=ax2,color='#95a5a6')
    plt.setp(ax2.get_yticklabels(), visible=False)
    plt.setp(ax2.get_yticklines(), visible=False)
    ax.patch.set_visible(True)

g.savefig(plotroot+'error_analysis.pdf')

In [None]:
default_plotlog = gather_ope_logs_by_config(all_log, metriclist, default_config)
default_plotlog['Data'] = 'Normal'
# NOTE(review): this rebinds plotlog_hard using the default window (last_n=100),
# replacing the last_n=300 version computed earlier -- confirm that is intended.
plotlog_hard = gather_ope_logs_by_config(all_log_hard, metriclist, default_config)
plotlog_hard['Data'] = 'Hard'
# DataFrame.append was removed in pandas 2.0.
all_plotlog = pd.concat([default_plotlog, plotlog_hard])

In [None]:
# Absolute OPE error against the human score; colour separates the 'Data' labels.
g = sns.lmplot(x="Human-mean", y='Error', col='Metric', hue='Data', order=2,
               data=all_plotlog, col_wrap=5, height=3,sharey=False,sharex=False,truncate=False)
g.set(ylim=(-0.008, None))
# Pair every facet with its own metric through g.col_names: groupby('Metric')
# yields groups in sorted order, while the facets follow the order in which the
# metrics appear in the data, so zipping the two could mislabel the overlays.
for ax, metric_name in zip(g.axes.flat, g.col_names):
    subdata = all_plotlog[all_plotlog['Metric'] == metric_name]
    # Overlay the distribution of human scores on a secondary, unlabelled y-axis.
    ax2=ax.twinx()
    sns.distplot(subdata["Human-mean"], ax=ax2,color='#95a5a6')
    plt.setp(ax2.get_yticklabels(), visible=False)
    plt.setp(ax2.get_yticklines(), visible=False)
    ax.patch.set_visible(True)
g.savefig(plotroot+'error_normal_vs_hard_data.pdf')