In [1]:
# Notebook parameters (see the papermill example further down for overriding them).

# Base location of the datasets; joined with each dataset path when building
# the result paths. May be a local path or a gs:// URL.
data_path = '../example-data'
# data_path = 'gs://elife-public-data/sciencebeam/datasets'
# Dataset directories, relative to data_path.
dataset_relative_paths = ['pmc-sample-1943-cc-by-subset']
# Converted to a dict later and looked up by dataset relative path; each value
# is iterated as the suffixes for that dataset (default: a single empty suffix).
dataset_suffixes = []
# Appended to the dataset relative path to form the results directory name.
output_path_suffix = '-results'
# Tools whose evaluation results are loaded and compared pairwise.
tool_names = ['grobid-tei', 'cermine']
# tool_names = ['grobid-tei', 'cermine', 'scienceparse-v1', 'scienceparse-v2']
affiliation_field_names = ['affiliation_strings', 'affiliation_institution']
reference_field_names = [
    'first_reference_fields', 'first_reference_title',
    'reference_fields', 'reference_title', 'reference_year', 'reference_source', 'reference_volume', 'reference_fpage', 'reference_lpage'
]
table_field_names = ['tables', 'table_strings', 'table_labels', 'table_captions', 'table_label_captions']
figure_field_names = ['figure_labels', 'figure_captions', 'figure_label_captions']
body_field_names = ['section_titles']
# All fields for which comparisons are rendered, in display order.
field_names = [
    'title',
    'first_author_full_name', 'author_full_names',
    'abstract'
] + body_field_names + affiliation_field_names + reference_field_names + table_field_names + figure_field_names
# Evaluation methods for which comparisons are rendered.
evaluation_methods = ['levenshtein', 'exact']
# Name of the details CSV file expected inside every result directory.
details_filename = 'results-00000-of-00001.csv'
# When True, the first rows of every loaded details file are displayed.
show_tables = False

# Conversion Results Details

In [2]:
# By default, this Conversion Results notebook uses the very small example dataset.
# You may pass in other parameters, e.g.:
#   $papermill conversion-results-tools.ipynb conversion-results-tools-configured.ipynb -p data_path '/my/other/data/path'

In [3]:
%matplotlib inline

In [4]:
import matplotlib
# Select the 'ggplot' style sheet for matplotlib figures.
matplotlib.style.use('ggplot')

In [5]:
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime
import time
import os
import subprocess
import re
from hashlib import sha1

import pandas as pd
from markdown import markdown
from IPython.display import display, HTML

warnings.filterwarnings('default')

In [6]:
# Inject CSS for the <details>/<summary> elements that the comparison cell
# at the end of the notebook emits (one collapsible box per tool pair).
display(HTML(
'''
<!-- vertical accordion CSS -->
<style>
details {
  padding: 15px;
  border: 1px solid rgba(0,0,0,.1);
}

details + details {
  margin-top: 20px;
}

details[open] {
  min-height: 30px;
}

summary {
  font-weight: 500;
  cursor: pointer;
}

summary:focus {
  outline: none;
}
</style>
'''))

In [7]:
def parse_list(s):
    """Normalise a parameter to a list.

    A list is returned as-is (the same object). Anything else is treated as a
    comma-separated string: it is split on ',', every item is stripped of
    surrounding whitespace, and items that are empty after stripping are
    dropped.
    """
    if isinstance(s, list):
        return s
    stripped_items = (item.strip() for item in s.split(','))
    return [item for item in stripped_items if item]

# Parameters may be given either as Python lists or as comma-separated
# strings (presumably when overridden via `papermill -p`); normalise to lists.
dataset_relative_paths = parse_list(dataset_relative_paths)
tool_names = parse_list(tool_names)
field_names = parse_list(field_names)
evaluation_methods = parse_list(evaluation_methods)
# Only convert once: after the first run `dataset_suffixes` is already a dict,
# and passing a dict to parse_list would fail (dicts have no `.split`), which
# made this cell crash when re-run without restarting the kernel.
if not isinstance(dataset_suffixes, dict):
    dataset_suffixes = dict(parse_list(dataset_suffixes))

In [8]:
# One (display name, result directory) pair per dataset / suffix / tool.
# The display name has the form "<dataset><suffix> <tool>"; the directory is
# "<data_path>/<dataset><output_path_suffix>/evaluation-results/<tool><suffix>".
all_result_paths = [
    (
        '%s%s %s' % (dataset_relative_path, dataset_suffix, tool),
        os.path.join(
            data_path,
            '%s%s/evaluation-results/%s%s' % (
                dataset_relative_path,
                output_path_suffix,
                tool,
                dataset_suffix
            )
        )
    )
    for dataset_relative_path in dataset_relative_paths
    # datasets without configured suffixes get a single empty suffix
    for dataset_suffix in dataset_suffixes.get(dataset_relative_path, [''])
    for tool in tool_names
]

# The "<dataset><suffix>" part of the display names above, one entry per
# dataset / suffix combination (i.e. without the tool).
prefix_list = [
    '%s%s' % (dataset_relative_path, dataset_suffix)
    for dataset_relative_path in dataset_relative_paths
    for dataset_suffix in dataset_suffixes.get(dataset_relative_path, [''])
]

In [9]:
# Display names only, in the same order as all_result_paths.
all_result_names = [name for name, _ in all_result_paths]

In [10]:
def printmd(s):
    """Render markdown text as HTML in the notebook output.

    If `s` has a `decode` method (e.g. a byte string) it is decoded using the
    'unicode_escape' codec first; otherwise it is used unchanged.
    """
    try:
        markdown_text = s.decode('unicode_escape')
    except AttributeError:
        # no `decode` attribute: treat the value as text already
        markdown_text = s
    rendered_html = markdown(markdown_text)
    display(HTML(rendered_html))

In [11]:
def gs_file_time(file_url):
    """Return the timestamp reported by `gsutil ls -l` for `file_url`.

    The first token of the listing that looks like `YYYY-...` is parsed as
    `%Y-%m-%dT%H:%M:%SZ` and returned as seconds since the epoch (float).

    Raises:
        subprocess.CalledProcessError: if gsutil exits with a non-zero status.
        ValueError: if the listing contains no timestamp, or it cannot be parsed.
    """
    # Local import: `calendar` is not among the notebook's top-level imports.
    import calendar

    # Pass an argument list rather than a shell string so that quotes or other
    # shell metacharacters in the URL cannot break (or inject into) the command.
    # NOTE(review): on Windows, where gsutil is a .cmd wrapper, this may need
    # the full executable name - confirm if Windows is a supported platform.
    listing = subprocess.check_output(['gsutil', 'ls', '-l', file_url]).decode('utf-8')
    timestamp_match = re.search(r'\s(\d{4}-[^ ]+)\s', listing)
    if timestamp_match is None:
        raise ValueError('no timestamp found in gsutil listing for %s: %r' % (file_url, listing))
    utc_time = datetime.strptime(timestamp_match.group(1), "%Y-%m-%dT%H:%M:%SZ")
    # The trailing "Z" marks the timestamp as UTC. time.mktime() would interpret
    # the struct as local time and shift the result by the local UTC offset;
    # calendar.timegm() is the UTC counterpart.
    return float(calendar.timegm(utc_time.timetuple()))

def gs_cp(file_url, target_file):
    """Copy `file_url` to `target_file` by running `gsutil cp -P`.

    Raises:
        subprocess.CalledProcessError: if gsutil exits with a non-zero status.
    """
    # Argument list instead of an interpolated shell string: paths containing
    # quotes, `$` or backticks are passed through verbatim instead of being
    # interpreted by the shell.
    # NOTE(review): on Windows, where gsutil is a .cmd wrapper, this may need
    # the full executable name - confirm if Windows is a supported platform.
    subprocess.check_output(['gsutil', 'cp', '-P', file_url, target_file])


def retrieve_local_copy_if_remote(file_url):
    """Return a local path for `file_url`, downloading it first if it is a gs:// URL.

    Anything not starting with 'gs://' is returned unchanged. Remote files are
    cached under '../.temp' as '<sha1 of the URL>-<basename>'; the file is
    (re-)downloaded when no cached copy exists or when the remote timestamp is
    newer than the cached file's modification time.
    """
    if not file_url.startswith('gs://'):
        return file_url

    file_time = gs_file_time(file_url)

    cache_dir = '../.temp'
    local_filename = os.path.join(cache_dir, '%s-%s' % (
        sha1(file_url.encode('utf-8')).hexdigest(), os.path.basename(file_url)
    ))
    needs_download = (
        not os.path.isfile(local_filename) or
        file_time > os.path.getmtime(local_filename)
    )
    if needs_download:
        # The cache directory does not exist on a fresh checkout; without this
        # the copy below fails because its target directory is missing.
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        gs_cp(file_url, local_filename)
        # Stamp the local copy with the remote time so the freshness check
        # above compares like with like on the next run.
        os.utime(local_filename, (file_time, file_time))
    return local_filename

def load_result_map(all_result_paths, filename):
    """Load `filename` from every result directory into a dict of DataFrames.

    Args:
        all_result_paths: iterable of (name, result_path) pairs.
        filename: CSV file name, joined onto each result_path.

    Returns:
        dict mapping each name to the DataFrame read from its CSV file.

    For every file the modification time of the (possibly downloaded) local
    file is printed; when the global `show_tables` is truthy the first three
    rows are displayed as well.
    """
    result_frames = {}
    for label, result_dir in all_result_paths:
        csv_path = retrieve_local_copy_if_remote(os.path.join(result_dir, filename))
        modified_at = datetime.fromtimestamp(os.path.getmtime(csv_path))
        printmd('**Timestamp:** %s (%s)' % (modified_at, label))
        frame = pd.read_csv(csv_path)
        result_frames[label] = frame
        if show_tables:
            display(frame.head(3))
    return result_frames

In [12]:
# Load the details CSV of every dataset / tool combination, keyed by display name.
printmd('## Details Files')
details_df_map = load_result_map(all_result_paths, details_filename)

In [13]:
# Summarise which evaluation methods and field names occur in the details
# file. Only the first result set is inspected.
printmd('### Evaluation File States')
# dropna() mirrors the field_name handling below: a missing value would be a
# float NaN, which can neither be sorted together with strings nor joined.
printmd('**Evaluation methods:** %s' % ', '.join(sorted(set(
    details_df_map[all_result_names[0]]['evaluation_method'].dropna()
))))
printmd('**Field names:** %s' % ', '.join(sorted(set(
    details_df_map[all_result_names[0]]['field_name'].dropna()
))))

In [14]:
def short_name(full_name):
    """Return the part of `full_name` after its last space.

    The whole string is returned when it contains no space.
    """
    _, _, last_token = full_name.rpartition(' ')
    return last_token

In [15]:
def get_title_map(df):
    """Build a dict mapping each target file to its expected title.

    Only rows with field_name 'title' and evaluation_method 'exact' are
    considered; per target_file the first `expected` value of the group is used.
    """
    is_exact_title = (df['field_name'] == 'title') & (df['evaluation_method'] == 'exact')
    title_rows = df[is_exact_title][['target_file', 'expected']]
    first_per_file = title_rows.groupby('target_file').first()
    return first_per_file['expected'].to_dict()

# Expected titles are taken from the first result set only; they are used to
# label the example rows in the comparisons below.
title_map = get_title_map(details_df_map[all_result_names[0]])

In [16]:
def filter_by_field_name_and_evaluation_method(df, field_name, evaluation_method):
    """Return the rows of `df` matching both `field_name` and `evaluation_method`.

    The comparison is by equality against the 'field_name' and
    'evaluation_method' columns; the original index is kept.
    """
    matches_field = df['field_name'] == field_name
    matches_method = df['evaluation_method'] == evaluation_method
    return df[matches_field & matches_method]

def _comparison_section_md(section_df, heading, summary, shown_columns, limit):
    """Return the markdown paragraphs for one comparison section.

    Args:
        section_df: rows belonging to the section.
        heading: markdown heading line for the section.
        summary: sentence stating how many samples fall into the section.
        shown_columns: list of (label, column) pairs to show after the expected
            value, e.g. [('A', 'actual')].
        limit: maximum number of example rows to include.

    Returns:
        list of markdown paragraphs: heading, summary, up to `limit` examples
        and, if rows were left out, a final '<n> more' paragraph.
    """
    paragraphs = [heading, summary]
    # 'records' is the documented orient for a list of per-row dicts; the
    # abbreviation 'rows' used previously is rejected by pandas 2.x.
    for row in section_df[:limit].to_dict(orient='records'):
        parts = [
            '> `%s` "%s"' % (row['target_file'], title_map.get(row['target_file'])),
            '**E**: %s' % row['expected']
        ]
        parts.extend('**%s**: %s' % (label, row[column]) for label, column in shown_columns)
        paragraphs.append('<br/>'.join(parts))
    if len(section_df) > limit:
        paragraphs.append('%d more' % (len(section_df) - limit))
    return paragraphs


def show_comparative_examples(inspect_name, compare_with_name, field_name, evaluation_method, limit=3):
    """Build markdown comparing two result sets for one field and evaluation method.

    The details of `inspect_name` (A) and `compare_with_name` (B), both looked
    up in the global `details_df_map`, are outer-merged on target_file,
    field_name, evaluation_method and expected; B's remaining columns get the
    suffix '_other'. Three sections are produced:

    * A wrong, B right:  tp == 0 and tp_other == 1
    * A right, B wrong:  tp == 1 and tp_other == 0
    * both wrong:        tn == 0, tp == 0 and tp_other == 0

    (tp / tn are presumably true-positive / true-negative flags - confirm
    against the evaluation output format.) Rows present for only one of the
    two result sets have missing values after the merge and match no section.

    Args:
        inspect_name: display name of result set A.
        compare_with_name: display name of result set B.
        field_name: field to compare.
        evaluation_method: evaluation method to compare.
        limit: maximum number of examples listed per section.

    Returns:
        The markdown text, paragraphs separated by blank lines.
    """
    inspect_df = filter_by_field_name_and_evaluation_method(
        details_df_map[inspect_name], field_name, evaluation_method
    ).merge(
        filter_by_field_name_and_evaluation_method(
            details_df_map[compare_with_name], field_name, evaluation_method
        ),
        how='outer',
        on=['target_file', 'field_name', 'evaluation_method', 'expected'],
        suffixes=['', '_other']
    )

    short_inspect_name = short_name(inspect_name)
    short_other_name = short_name(compare_with_name)

    md = ['**E**: expected, **A**: %s, **B**: %s (%s %s)' % (
        inspect_name, compare_with_name, field_name, evaluation_method
    )]

    incorrect_inspect_df = inspect_df[(inspect_df['tp'] == 0) & (inspect_df['tp_other'] == 1)]
    md.extend(_comparison_section_md(
        incorrect_inspect_df,
        '#### %s [0 : 1] %s (%d)' % (short_inspect_name, short_other_name, len(incorrect_inspect_df)),
        'There are %d samples where %s got it wrong, but %s got it right.' % (
            len(incorrect_inspect_df), short_inspect_name, short_other_name
        ),
        [('A', 'actual')],
        limit
    ))

    correct_inspect_df = inspect_df[(inspect_df['tp'] == 1) & (inspect_df['tp_other'] == 0)]
    md.extend(_comparison_section_md(
        correct_inspect_df,
        '#### %s [1 : 0] %s (%d)' % (short_inspect_name, short_other_name, len(correct_inspect_df)),
        'There are %d samples where %s got it right, but %s got it wrong.' % (
            len(correct_inspect_df), short_inspect_name, short_other_name
        ),
        [('B', 'actual_other')],
        limit
    ))

    both_incorrect_inspect_df = inspect_df[
        (inspect_df['tn'] == 0) & (inspect_df['tp'] == 0) & (inspect_df['tp_other'] == 0)
    ]
    md.extend(_comparison_section_md(
        both_incorrect_inspect_df,
        '#### %s [0 : 0] %s (%d)' % (short_inspect_name, short_other_name, len(both_incorrect_inspect_df)),
        'There are %d samples where both %s and %s got it wrong.' % (
            len(both_incorrect_inspect_df), short_inspect_name, short_other_name
        ),
        [('A', 'actual'), ('B', 'actual_other')],
        limit
    ))
    return '\n\n'.join(md)

# For every field, evaluation method and dataset, render one collapsible
# comparison per unordered pair of tools.
printmd('# Comparing results between tools')
for field_name in field_names:
    printmd('<hr/>')
    printmd('# Field: %s' % field_name)
    for evaluation_method in evaluation_methods:
        for prefix in prefix_list:
            printmd('## Dataset: %s (%s %s)' % (prefix, field_name, evaluation_method))
            # Result names have the form "<prefix> <tool>". Include the
            # separating space in the match, otherwise a dataset whose name
            # merely starts with another dataset's name (e.g. "ds-v2" vs "ds")
            # would be mixed into that dataset's comparisons.
            names_with_prefix = [name for name in all_result_names if name.startswith(prefix + ' ')]
            # pair every result with each of the results that follow it
            for i, inspect_name in enumerate(names_with_prefix[:-1]):
                for other_name in names_with_prefix[(1 + i):]:
                    title = 'Comparison: %s vs %s' % (short_name(inspect_name), short_name(other_name))
                    content = markdown(show_comparative_examples(
                        inspect_name, other_name,
                        field_name=field_name, evaluation_method=evaluation_method
                    ))
                    display(HTML('<details><summary>%s</summary>%s</details>' % (title, content)))