In [None]:
import difflib
import os
from collections import Counter
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from Bio import pairwise2
from Bio.Seq import Seq

from processing import load_results, generate_summary_stats
from processing import plot_cer, plot_cer_confidence_interval, plot_fuzzy_match, plot_close_matches
from processing import process_oboes

# Load predictions

In [None]:
# Collect every prediction CSV for this experiment. The paths are sorted because
# glob yields files in a filesystem-dependent order, and later cells select
# models by position (e.g. models[0], plain_model_labels[1]); sorting keeps
# "first" and "second" model stable across machines and re-runs.
filenames = sorted(Path('predictions/bayes_tuner_1-transfer_learned').glob('*.csv'))
models, results = load_results(filenames)
summary_results = generate_summary_stats(models, results)
# Each entry of `models` is indexable; element [1] is the label used for that
# model's prediction column in `results` (see the f'{model_label}-...' lookups
# in the later cells).
print([a[1] for a in models])
results.head()

# View summary stats

In [None]:
# Show only the 'mean' statistic of every CER and matchratio variant.
# Columns are addressed as (metric, statistic) tuples.
mean_metric_columns = [
    (metric, 'mean')
    for metric in (
        'CER', 'CER_insensitive', 'CER_nopunc',
        'matchratio', 'matchratio_insensitive', 'matchratio_nopunc',
    )
]
summary_results[mean_metric_columns]

In [None]:
# Match counts alongside the bounds stored in the cer_95_error_* columns.
match_and_interval_columns = ['exact_matches', 'oboe_matches', 'cer_95_error_min', 'cer_95_error_max']
summary_results[match_and_interval_columns]

# Display graphs

In [None]:
# Build a 2x2 grid of axes and draw one summary chart on each.
# `fig` is kept at notebook scope because the final cell saves it to disk.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 15))
# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
graph1, graph2, graph3, graph4 = axes.flat
# To save an individual chart instead, create a single axes:
# fig, graph1 = plt.subplots(1, 1, figsize=(10, 7))

plot_cer(models, results, graph1)
plot_cer_confidence_interval(models, summary_results, graph2)
plot_fuzzy_match(models, results, graph3)
plot_close_matches(models, results, summary_results, graph4)

fig.tight_layout()

# Off-by-one (OBO) string comparisons: predictions exactly one edit away from the ground truth

In [None]:
# process_oboes returns the plain model labels plus three per-model lookups
# (substitutions, deletions, insertions), each indexed by model label.
plain_model_labels, substitution_errors, deletion_errors, insertion_errors = process_oboes(models, results)
# The first model is the one summarised here; the next cell also reads display_model.
display_model = plain_model_labels[0]

print(f'=========== {display_model} Summary: ===========')

# One (heading, per-model errors) pair per error category, printed in this order.
error_sections = (
    ('Top 10 Substitutions:', substitution_errors),
    ('\nTop 10 Deletions:', deletion_errors),
    ('\nTop 10 Insertions:', insertion_errors),
)
for heading, errors_by_model in error_sections:
    print(heading)
    model_errors = errors_by_model[display_model]
    # Only the first ten keys, in the order the mapping yields them.
    for key in list(model_errors)[:10]:
        print(f'{key}: {model_errors[key]}')

In [None]:
comparison_model = plain_model_labels[1]


def plot_error_comparison(errors_by_model, title, first_model, second_model):
    """Draw a grouped bar chart comparing two models' error counts.

    Both models are aligned on one shared list of x-axis labels: the first
    model's keys in their existing order, followed by any keys that only the
    second model has. A key missing from one model is drawn with height 0.
    Without this alignment the second model's bars would be positioned by its
    own key order while labelled with the first model's keys.

    Args:
        errors_by_model: mapping from model label to a dict-like of
            {error key: occurrence count} (as used via .keys()/.values()
            on the process_oboes outputs).
        title: chart title.
        first_model: label of the model drawn on the left of each bar pair.
        second_model: label of the model drawn on the right of each bar pair.
    """
    first_counts = errors_by_model[first_model]
    second_counts = errors_by_model[second_model]

    labels = list(first_counts)
    labels.extend(key for key in second_counts if key not in first_counts)
    positions = np.arange(len(labels))

    bar_width = 0.45
    half_width = bar_width / 2  # shifts each bar so the pair is centred on its tick

    # A dedicated figure/axes is used so the notebook-level `fig` (saved in the
    # final cell) is left untouched.
    _, ax = plt.subplots(figsize=(24, 8))
    ax.bar(positions - half_width, [first_counts.get(key, 0) for key in labels],
           bar_width, label=first_model)
    ax.bar(positions + half_width, [second_counts.get(key, 0) for key in labels],
           bar_width, label=second_model)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    # Tick label size is set on this axes only, rather than through plt.rc,
    # so it does not leak into other figures in the kernel.
    ax.tick_params(axis='both', labelsize=12)
    ax.set_title(title)
    ax.set_ylabel("Number of Occurrences")
    ax.legend()
    plt.show()


plot_error_comparison(deletion_errors, "Deletion Errors", display_model, comparison_model)
plot_error_comparison(insertion_errors, "Insertion Errors", display_model, comparison_model)

In [None]:
# Character-level diff of every prediction that is exactly one edit away from
# its ground truth. Differ.compare on two strings yields one entry per
# character, prefixed with '- ' (only in the ground truth), '+ ' (only in the
# prediction), '  ' (common) or '? ' (hint line); only '+'/'-' entries are kept.
differ = difflib.Differ()

list_of_changes = []
for _ground_truth_label, model_label in models:
    truth_column = f'{model_label}-ground_truth'
    is_oboe = results[f'{model_label}-edit_distance'] == 1
    oboes = results.loc[is_oboe, [truth_column, model_label]]
    # Plain lists (rather than DataFrame.apply) so an empty selection simply
    # produces an empty list of diffs.
    list_of_changes.append([
        [entry for entry in differ.compare(truth, predicted) if entry[0] in ('+', '-')]
        for truth, predicted in zip(oboes[truth_column], oboes[model_label])
    ])

model_labels = [model_label for _ground_truth_label, model_label in models]

# First ten diffs per model, for every loaded model (not just the first two).
for model_label, changes in zip(model_labels, list_of_changes):
    print(f'Top 10 changes in {model_label}: {changes[:10]}')

# Tally, per model, which characters were dropped from the ground truth
# ('-' entries -> missing) and which were introduced by the prediction
# ('+' entries -> added). entry[2:] strips the two-character diff prefix.
missing = []
added = []
for changes in list_of_changes:
    missing_counts = Counter()
    added_counts = Counter()
    for change_list in changes:
        for change in change_list:
            if change[0] == '+':
                added_counts[change[2:]] += 1
            elif change[0] == '-':
                missing_counts[change[2:]] += 1
    # most_common() orders by descending count and keeps first-seen order for
    # ties; dict() preserves that order for display.
    missing.append(dict(missing_counts.most_common()))
    added.append(dict(added_counts.most_common()))

for model_label, missing_this_model in zip(model_labels, missing):
    print(f'Top missing in {model_label}: {missing_this_model}')
for model_label, added_this_model in zip(model_labels, added):
    print(f'Top added in {model_label}: {added_this_model}')

In [None]:
# Persist the summary table and `fig` (the summary figure created in the
# "Display graphs" cell) under one shared timestamp, so the CSV and PNG
# produced by the same run can be matched up.
results_folder = 'results'
# exist_ok=True creates the folder only if it is missing, in a single call,
# instead of a separate exists() check followed by makedirs().
os.makedirs(results_folder, exist_ok=True)
timestamp = datetime.now().strftime('%Y_%m_%d-%H_%M_%S')
summary_results.to_csv(Path(results_folder, f'summary-{timestamp}.csv'))
fig.savefig(Path(results_folder, f'summary-{timestamp}.png'))