In [None]:
import pandas as pd
import os
from typing import Dict
import json
from IPython.display import display
# Notebook-wide pandas settings:
#  - switch off the chained-assignment check (set to None instead of the default);
#  - put no limit on the number of columns shown when a frame is displayed.
pd.set_option("mode.chained_assignment", None)
pd.options.display.max_columns = None

from parlai_internal.projects.light.lightqa.results.utils import get_eval_directories, df_from_eval_runs, save_to_csv, load_all_exps, get_view

# Load Data

In [None]:
# Experiment name -> directory path passed to load_all_exps.
# The keys are the names later cells use to index `dfs` (e.g. dfs['wow_k2r']).
eval_paths = {
    # WizWiki
    'wow_baseline': '/checkpoint/light/projects/lightqa/wow/evals/wow_baseline',
    'wow_k2r': '/checkpoint/light/projects/lightqa/wow/evals/wow_k2r',
    
    # NQ
    'nq_k2r': '/checkpoint/light/projects/lightqa/nq_open/evals/nq_k2r',
    'nq_baseline': '/checkpoint/light/projects/lightqa/nq_open/evals/nq_baseline',
    
    # LIGHT
    'lightqa_baseline': '/checkpoint/light/projects/lightqa/lightqa/evals/lightqa_baseline',
    'lightqa_k2r': '/checkpoint/light/projects/lightqa/lightqa/evals/lightqa_k2r',
    
    # WizInt (Wizard of Internet)
    # NOTE(review): the key says "k2r" but the directory is named "woi_fidgold" -
    # confirm this mapping is intended.
    'woi_k2r': '/checkpoint/light/projects/lightqa/woi/evals/woi_fidgold',
}

# load_all_exps returns a pair: `dfs` is indexed by the eval_paths keys in the
# cells below. NOTE(review): `df` is presumably a combined frame - it is not
# used by any of the cells visible here; confirm it is still needed.
dfs, df = load_all_exps(eval_paths)

# Results

## Light Results

### LightWild test

In [None]:
# LightWild test split: every table in this cell is selected with
# dt='test' and t='LightTeacherPlus'.

# Baseline runs.
baseline_view = get_view(
    dfs['lightqa_baseline'],
    conditions={'dt': 'test', 't': 'LightTeacherPlus'},
)
display(baseline_view)
print('Table: Baseline LightWild test')

# K2R runs, leaving out the 'BART conf-score' response model, which is shown
# in its own table below.
df_view = get_view(
    dfs['lightqa_k2r'],
    conditions={'dt': 'test', 't': 'LightTeacherPlus'},
)
display(df_view.loc[df_view['response-model'].ne('BART conf-score')])
print('Table: K2R LightWild test')

# 'BART conf-score' runs only, requested with sort_key='ppl'; only rows whose
# km-train-data is 'Light' or '-' are displayed.
df_view = get_view(
    dfs['lightqa_k2r'],
    conditions={
        'dt': 'test',
        't': 'LightTeacherPlus',
        'response-model': 'BART conf-score',
    },
    sort_key='ppl',
)
display(df_view.loc[df_view['km-train-data'].isin(['Light', '-'])])
print('Table: Confidence-Score Exps LightWild test')

### LightWild valid

In [None]:
# LightWild valid split: every table in this cell is selected with
# dt='valid' and t='LightTeacherPlus'.

# Baseline runs.
baseline_view = get_view(
    dfs['lightqa_baseline'],
    conditions={'dt': 'valid', 't': 'LightTeacherPlus'},
)
display(baseline_view)
print('Table: Baseline LightWild valid')

# K2R runs, leaving out the 'BART conf-score' response model, which is shown
# in its own table below.
df_view = get_view(
    dfs['lightqa_k2r'],
    conditions={'dt': 'valid', 't': 'LightTeacherPlus'},
)
display(df_view.loc[df_view['response-model'].ne('BART conf-score')])
print('Table: K2R LightWild valid')

# 'BART conf-score' runs only, requested with sort_key='ppl'; only rows whose
# km-train-data is 'Light' or '-' are displayed.
df_view = get_view(
    dfs['lightqa_k2r'],
    conditions={
        'dt': 'valid',
        't': 'LightTeacherPlus',
        'response-model': 'BART conf-score',
    },
    sort_key='ppl',
)
display(df_view.loc[df_view['km-train-data'].isin(['Light', '-'])])
print('Table: Confidence-Score Exps LightWild valid')

### LightQA test

In [None]:
# LightQA test split: both tables are selected with dt='test' and
# t='SummaryQATeacher'.

# Baseline runs.
baseline_view = get_view(
    dfs['lightqa_baseline'],
    conditions={'dt': 'test', 't': 'SummaryQATeacher'},
)
display(baseline_view)
print('Table: Baseline LightQA test')

# K2R runs; rows with the 'BART conf-score' response model are not displayed.
df_view = get_view(
    dfs['lightqa_k2r'],
    conditions={'dt': 'test', 't': 'SummaryQATeacher'},
)
display(df_view.loc[df_view['response-model'].ne('BART conf-score')])
print('Table: K2R LightQA test')

### LightQA valid

In [None]:
# LightQA valid split: both tables are selected with dt='valid' and
# t='SummaryQATeacher'.

# Baseline runs.
baseline_view = get_view(
    dfs['lightqa_baseline'],
    conditions={'dt': 'valid', 't': 'SummaryQATeacher'},
)
display(baseline_view)
print('Table: Baseline LightQA valid')

# K2R runs; rows with the 'BART conf-score' response model are not displayed.
df_view = get_view(
    dfs['lightqa_k2r'],
    conditions={'dt': 'valid', 't': 'SummaryQATeacher'},
)
display(df_view.loc[df_view['response-model'].ne('BART conf-score')])
print('Table: K2R LightQA valid')

## NQ Results

### NQ test

In [None]:
# NQ test split (dt='test').

# Baseline runs, restricted to beam-size '3'.
baseline_view = get_view(
    dfs['nq_baseline'],
    conditions={'dt': 'test', 'beam-size': '3'},
)
display(baseline_view)
print('Table: Baseline NQ test')

# K2R runs whose knowledge model is anything but 'Oracle'.
nq_k2r_test = get_view(dfs['nq_k2r'], conditions={'dt': 'test'})
df_view = nq_k2r_test.loc[nq_k2r_test['knowledge-model'].ne('Oracle')]
display(df_view)
print('Table: K2R NQ test')

# K2R runs with the 'Oracle' knowledge model, selected through the conditions.
df_oracle = get_view(
    dfs['nq_k2r'],
    conditions={'dt': 'test', 'knowledge-model': 'Oracle'},
)
display(df_oracle)
print('Table: Oracle K2R NQ test')

### NQ valid

In [None]:
# NQ valid split (dt='valid').

# Baseline runs, restricted to beam-size '3'.
baseline_view = get_view(
    dfs['nq_baseline'],
    conditions={'dt': 'valid', 'beam-size': '3'},
)
display(baseline_view)
print('Table: Baseline NQ valid')

# K2R runs whose knowledge model is anything but 'Oracle'.
nq_k2r_valid = get_view(dfs['nq_k2r'], conditions={'dt': 'valid'})
df_view = nq_k2r_valid.loc[nq_k2r_valid['knowledge-model'].ne('Oracle')]
display(df_view)
print('Table: K2R NQ valid')

# K2R runs with the 'Oracle' knowledge model, selected through the conditions.
df_oracle = get_view(
    dfs['nq_k2r'],
    conditions={'dt': 'valid', 'knowledge-model': 'Oracle'},
)
display(df_oracle)
print('Table: Oracle K2R NQ valid')

## WizWiki Results

### WizWiki test seen

In [None]:
# WizWiki test, seen split.
# Note: the baseline frame is selected on 'datatype' / 'random_split', while
# the K2R frame is selected on 'dt' / 'WoW seen'.

# Baseline runs.
df_view = get_view(
    dfs['wow_baseline'],
    conditions={'datatype': 'test', 't': 'random_split'},
)
display(df_view)
print('Table: Baseline WizWiki test seen')

# K2R runs: no 'BART conf-score' response model, no 'Oracle' knowledge model.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'test', 't': 'WoW seen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].ne('Oracle')
]
display(df_view)
print('Table: K2R WizWiki test seen')

# K2R runs: no 'BART conf-score' response model, 'Oracle' knowledge model only.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'test', 't': 'WoW seen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].eq('Oracle')
]
display(df_view)
print('Table: K2R-Oracle WizWiki test seen')

### WizWiki confidence-score conditioned Exps

In [None]:
# Confidence-score conditioned K2R runs on the WizWiki seen split, restricted
# to krm-beam-min-length '10' and the 'BART conf-score' response model.
#
# The rows are selected with dt='valid', so both captions say "valid seen";
# they previously said "test seen", which contradicted the filter.
# NOTE(review): if the test split was intended instead, change 'dt' below and
# the captions together.
conf_runs = get_view(
    dfs['wow_k2r'],
    conditions={
        'dt': 'valid',
        't': 'WoW seen',
        'krm-beam-min-length': '10',
    })
conf_runs = conf_runs[conf_runs['response-model'] == 'BART conf-score']

# Non-Oracle knowledge models, ordered by add-fixed-confidence.
# astype() and sort_values() return new frames, so nothing is assigned into or
# sorted in place on a filtered slice of another frame.
df_view = (
    conf_runs[conf_runs['knowledge-model'] != 'Oracle']
    # Cast to int before sorting (the values are presumably stored as strings,
    # like the condition values above - verify against the loader).
    .astype({'add-fixed-confidence': int})
    .sort_values('add-fixed-confidence')
)
display(df_view)
print('Table: K2R confidence-conditioned WizWiki valid seen')

# Oracle knowledge model, same cast and ordering.
df_view = (
    conf_runs[conf_runs['knowledge-model'] == 'Oracle']
    .astype({'add-fixed-confidence': int})
    .sort_values('add-fixed-confidence')
)
display(df_view)
print('Table: K2R-Oracle confidence-conditioned WizWiki valid seen')

### WizWiki test unseen

In [None]:
# WizWiki test, unseen split.
# Note: the baseline frame is selected on 'datatype' / 'topic_split', while
# the K2R frame is selected on 'dt' / 'WoW unseen'.

# Baseline runs.
df_view = get_view(
    dfs['wow_baseline'],
    conditions={'datatype': 'test', 't': 'topic_split'},
)
display(df_view)
print('Table: Baseline WizWiki test unseen')

# K2R runs: no 'BART conf-score' response model, no 'Oracle' knowledge model.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'test', 't': 'WoW unseen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].ne('Oracle')
]
display(df_view)
print('Table: K2R WizWiki test unseen')

# K2R runs: no 'BART conf-score' response model, 'Oracle' knowledge model only.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'test', 't': 'WoW unseen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].eq('Oracle')
]
display(df_view)
print('Table: K2R-Oracle WizWiki test unseen')

### WizWiki valid seen

In [None]:
# WizWiki valid, seen split.
# Note: the baseline frame is selected on 'datatype' / 'random_split', while
# the K2R frame is selected on 'dt' / 'WoW seen'.

# Baseline runs.
df_view = get_view(
    dfs['wow_baseline'],
    conditions={'datatype': 'valid', 't': 'random_split'},
)
display(df_view)
print('Table: Baseline WizWiki valid seen')

# K2R runs: no 'BART conf-score' response model, no 'Oracle' knowledge model.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'valid', 't': 'WoW seen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].ne('Oracle')
]
display(df_view)
print('Table: K2R WizWiki valid seen')

# K2R runs: no 'BART conf-score' response model, 'Oracle' knowledge model only.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'valid', 't': 'WoW seen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].eq('Oracle')
]
display(df_view)
print('Table: K2R-Oracle WizWiki valid seen')

### WizWiki valid unseen

In [None]:
# WizWiki valid, unseen split.
# Note: the baseline frame is selected on 'datatype' / 'topic_split', while
# the K2R frame is selected on 'dt' / 'WoW unseen'.

# Baseline runs.
df_view = get_view(
    dfs['wow_baseline'],
    conditions={'datatype': 'valid', 't': 'topic_split'},
)
display(df_view)
print('Table: Baseline WizWiki valid unseen')

# K2R runs: no 'BART conf-score' response model, no 'Oracle' knowledge model.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'valid', 't': 'WoW unseen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].ne('Oracle')
]
display(df_view)
print('Table: K2R WizWiki valid unseen')

# K2R runs: no 'BART conf-score' response model, 'Oracle' knowledge model only.
df_view = get_view(dfs['wow_k2r'], conditions={'dt': 'valid', 't': 'WoW unseen'})
df_view = df_view.loc[
    df_view['response-model'].ne('BART conf-score')
    & df_view['knowledge-model'].eq('Oracle')
]
display(df_view)
print('Table: K2R-Oracle WizWiki valid unseen')

## Wizard of Internet

In [None]:
# Wizard of Internet runs selected with dt='valid' and krm-beam-min-length '10';
# three columns are passed to get_view as removed_cols.
woi_valid_view = get_view(
    dfs['woi_k2r'],
    conditions={'dt': 'valid', 'krm-beam-min-length': '10'},
    removed_cols=['rm-train-data', 'km-train-data', 'complete_percentage'],
)
display(woi_valid_view)