In [51]:
'''
Evaluate case triage against gold standard
'''

import json
import os
import sys
import time
import traceback
import re
import pandas as pd
import requests

# --- Configuration -----------------------------------------------------------
# Which split to evaluate; used to build both input file names below.
TRAIN_TEST = "test"
# When True, a later cell re-queries the bot instead of relying on the CSV contents.
IS_USE_LIVE_BOT = False

print(f"Dataset: {TRAIN_TEST}")

# --- Load inputs -------------------------------------------------------------
# Both files are tab-separated despite the .csv extension.
gold_standard_path = f"{TRAIN_TEST}_cases_gold_standard.csv"
bot_output_path = f"output_{TRAIN_TEST}_insolvency_bot_with_gpt-4.csv"

df_gold_standard = pd.read_csv(gold_standard_path, encoding="utf-8", sep="\t")

df = pd.read_csv(bot_output_path, encoding="utf-8", sep="\t")

Dataset: test


In [52]:
if IS_USE_LIVE_BOT:
    # Make the bot package importable from its sibling directory, then pull in
    # its helpers. The import stays local so the notebook runs without the bot
    # installed when IS_USE_LIVE_BOT is False.
    sys.path.append("../insolvency/")
    from insolvency_bot import get_matching_sources, construct_prompt, embeddings_lookup

    # One pipe-separated string of retrieved case names per question, in row order.
    bot_cases = []
    for question in df.question_text:
        matching_sources = get_matching_sources(question)
        # NOTE(review): the prompt is built but never used in this notebook; the
        # call is kept as-is because its side effects are not visible from here.
        dialogue_history = construct_prompt(question, matching_sources)
        bot_cases.append("|".join(matching_sources["human_readable_c"]))

    df["bot_cases"] = bot_cases

In [60]:
import re

# Matches any token containing a digit (years, report numbers, citations).
re_contains_num = re.compile(r'\d')

# Filler words that never identify a case on their own.
_CASE_NAME_STOPWORDS = frozenset(
    {"in", "re", "h.r.", "petitioner", "vs", "v", "company", "a", "an"}
)


def normalise_case_name(c):
    """Reduce a case name to a single lowercase key for fuzzy matching.

    The key is the first whitespace-separated token that contains no digit
    and is not one of the filler words in ``_CASE_NAME_STOPWORDS``.

    Parameters
    ----------
    c : str
        Human-readable case name, e.g. ``"Re Casa Estates [2014] EWCA Civ 383"``.

    Returns
    -------
    str or None
        The lowercase key, or ``None`` when no token qualifies (empty string,
        only citations/filler words) or when ``c`` is not a string (e.g. a
        NaN read from an empty CSV cell).
    """
    if not isinstance(c, str):
        return None
    # str.split() with no argument collapses runs of whitespace and drops
    # leading/trailing whitespace, so no empty token can be returned as a key.
    for w in c.split():
        if re_contains_num.search(w):
            continue
        lowered = w.lower()
        if lowered not in _CASE_NAME_STOPWORDS:
            return lowered
    return None

In [61]:
# Map each question number to its pipe-separated string of gold-standard cases.
gold_cases_by_question = df_gold_standard.set_index("question_no")["cases"]
question_to_key_cases = dict(gold_cases_by_question)

In [62]:
# Keep only the questions that have a gold-standard entry. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning or depend on view-vs-copy semantics.
df = df[df.question_no.isin(question_to_key_cases)].copy()

In [63]:
def _normalised_case_set(raw_cases):
    """Turn a pipe-separated string of case names into a set of normalised keys.

    Names for which ``normalise_case_name`` yields nothing usable (``None`` or
    an empty string) are dropped, so they neither inflate the counts nor break
    ``sorted()`` by mixing ``None`` with strings. A non-string input (e.g. the
    NaN pandas produces for an empty cell) is treated as "no cases".
    """
    if not isinstance(raw_cases, str):
        return set()
    keys = (normalise_case_name(c) for c in raw_cases.split("|"))
    return {key for key in keys if key}


# Per-question tallies, positionally aligned with the rows of df.
total_cases_gt = [0] * len(df)
total_cases_retrieved = [0] * len(df)
total_cases_correct = [0] * len(df)

for idx in range(len(df)):
    q_no = df.question_no.iloc[idx]

    y_gt_normalised = _normalised_case_set(question_to_key_cases[q_no])
    y_pred_normalised = _normalised_case_set(df.bot_cases.iloc[idx])

    print("Ground truth:\t\t", sorted(y_gt_normalised))
    print("\tPredicted:\t", sorted(y_pred_normalised))

    total_cases_gt[idx] = len(y_gt_normalised)
    # A retrieved case is correct when its normalised key also appears in the gold set.
    total_cases_correct[idx] = len(y_pred_normalised & y_gt_normalised)
    total_cases_retrieved[idx] = len(y_pred_normalised)

Ground truth:		 ['bny', 'byblos', 'casa', 'cheyne', 'paramount']
	Predicted:	 ['bny', 'bti', 'byblos', 'casa', 'cheyne', 'ewhc', 'wluk']
Ground truth:		 ['avanti', 'belmont', 'buchler', 'jackson', 'mistral']
	Predicted:	 ['bti', 'johnson', 'lo-line', 'mctear', 'morris', 'official', 'pathania', 'rowntree', 'singer', 'wluk']
Ground truth:		 ['capital', 'clydesdale', 'discovery', 'halliwells', 'hellas', 'kayley', 'moss']
	Predicted:	 ['belmont', 'bny', 'bti', 'byblos', 'casa', 'cheyne', 'wluk']
Ground truth:		 ['amicus', 'fitness', 'good', 'houst', 'hurricane', 'prezzo', 'virgin']
	Predicted:	 ['belmont', 'bny', 'bti', 'byblos', 'casa', 'cheyne', 'wluk']
Ground truth:		 ['discovery', 'fitness', 'newco', 'oceanfill', 'virgin']
	Predicted:	 ['belmont', 'bti', 'oceanfill']
Ground truth:		 ['bny', 'byblos', 'casa', 'cheyne', 'paramount']
	Predicted:	 ['bny', 'byblos', 'casa', 'cheyne', 'ewhc', 'one', 'rushbrooke']
Ground truth:		 ['bti', 'lo-line', 'mctear', 'sevenoaks']
	Predicted:	 ['adams'

In [64]:
# Attach the per-question tallies computed by the scoring loop as new columns.
for column_name, per_question_counts in (
    ("cases_ground_truth", total_cases_gt),
    ("cases_retrieved", total_cases_retrieved),
    ("cases_correct", total_cases_correct),
):
    df[column_name] = per_question_counts

In [65]:
# Per-question summary: gold-standard size, number retrieved, number correct.
summary_columns = [
    "question_no",
    "cases_ground_truth",
    "cases_retrieved",
    "cases_correct",
]
df[summary_columns]

Unnamed: 0,question_no,cases_ground_truth,cases_retrieved,cases_correct
0,Q1,5,7,4
1,Q2,5,10,0
2,Q3,7,7,0
3,Q4,7,7,0
4,Q5,5,3,1
5,Q6,5,7,4
6,Q7,4,13,1
7,Q8,4,7,0
8,Q9,4,7,4
9,Q10,3,7,1


In [66]:
# Micro-averaged metrics: counts are summed over all questions before dividing.
precision = df.cases_correct.sum() / df.cases_retrieved.sum()
print(f"Precision: {precision:.0%}")
recall = df.cases_correct.sum() / df.cases_ground_truth.sum()
print(f"Recall: {recall:.0%}")

Precision: 24%
Recall: 33%
