In [10]:
'''
Evaluate case triage against gold standard
'''

import json
import os
import sys
import time
import traceback
import re
import pandas as pd
import requests

# Name of the dataset split; it is substituted into both CSV file names below.
TRAIN_TEST = "train"
# When True, the following cell re-queries the bot instead of relying on the
# bot_cases column already present in the loaded output CSV.
IS_USE_LIVE_BOT = False

print(f"Dataset: {TRAIN_TEST}")

# Both input files are read as tab-separated UTF-8 text.
csv_format = {"encoding": "utf-8", "sep": "\t"}

df_gold_standard = pd.read_csv(f"{TRAIN_TEST}_cases_gold_standard.csv", **csv_format)

df = pd.read_csv(f"output_{TRAIN_TEST}_insolvency_bot_with_gpt-4.csv", **csv_format)

Dataset: train


In [11]:
if IS_USE_LIVE_BOT:
    # Make the bot package importable from its sibling directory, then
    # recompute the retrieved cases for every question in df.
    sys.path.append("../insolvency/")
    from insolvency_bot import get_matching_sources, construct_prompt, embeddings_lookup

    questions = df["question_text"]
    bot_cases = []

    for position in range(len(questions)):
        question = questions.iloc[position]
        matching_sources = get_matching_sources(question)
        # The prompt is still constructed for each question, although its
        # result is not read again in this cell.
        dialogue_history = construct_prompt(question, matching_sources)
        # Store the retrieved case names as one "|"-separated string per row.
        bot_cases.append("|".join(matching_sources["human_readable_c"]))

    df["bot_cases"] = bot_cases

In [12]:
import re
re_contains_num = re.compile(r'\d')
# Procedural words and connectives that never serve as the key for a case.
_CASE_NAME_STOPWORDS = frozenset({"in", "re", "h.r.", "petitioner", "vs", "v", "company", "a", "an"})


def normalise_case_name(c):
    """Reduce a case name to a single lowercase key word used for matching.

    The key is the first whitespace-separated word that contains no digit
    and is not one of the stopwords in ``_CASE_NAME_STOPWORDS``.

    Parameters
    ----------
    c : str
        Case name or citation, e.g. ``"In re Bradcrown Ltd [2001] 1 BCLC 547"``.

    Returns
    -------
    str
        The lowercase key word, or ``""`` when no word qualifies (empty
        input, or only stopwords and words containing digits). A string is
        always returned so that results can be sorted together.
    """
    # str.split() with no argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so padded input never produces an empty
    # first token that would be mistaken for the key.
    for w in c.split():
        if re_contains_num.search(w):
            # Words containing a digit (years, report numbers) are skipped.
            continue
        key = w.lower()
        if key not in _CASE_NAME_STOPWORDS:
            return key
    return ""

In [13]:
# Look-up from gold-standard question number to that question's raw "cases"
# cell (the scoring loop splits this value on "|").
gold_cases_by_question = df_gold_standard.set_index("question_no")["cases"]
question_to_key_cases = dict(gold_cases_by_question)

In [14]:
# Keep only the questions that have a gold-standard entry. The explicit copy
# makes df an independent frame, so that adding the count columns to it later
# does not raise pandas' SettingWithCopyWarning on a filtered slice.
df = df[df.question_no.isin(question_to_key_cases)].copy()

In [15]:
def _case_keys(pipe_separated_names):
    """Return the set of normalised keys for a "|"-separated string of case names.

    Anything that is not a string (for example a missing value) yields an
    empty set. Each name is stripped before normalisation so that padding
    around the separator does not matter, and names that normalise to an
    empty/None key are dropped: they are neither counted as a case nor mixed
    into the sorted print-out.
    """
    if not isinstance(pipe_separated_names, str):
        return set()
    keys = (normalise_case_name(name.strip()) for name in pipe_separated_names.split("|"))
    return {key for key in keys if key}


# Per-question counts, in the same row order as df.
total_cases_gt = [0] * len(df)
total_cases_retrieved = [0] * len(df)
total_cases_correct = [0] * len(df)

for idx in range(len(df)):
    q_no = df.question_no.iloc[idx]

    y_gt_normalised = _case_keys(question_to_key_cases[q_no])
    y_pred_normalised = _case_keys(df.bot_cases.iloc[idx])

    print("Ground truth:\t\t", sorted(y_gt_normalised))
    print("\tPredicted:\t", sorted(y_pred_normalised))

    total_cases_gt[idx] = len(y_gt_normalised)
    # A prediction is correct when its key also appears in the gold standard.
    total_cases_correct[idx] = len(y_pred_normalised & y_gt_normalised)
    total_cases_retrieved[idx] = len(y_pred_normalised)

Ground truth:		 ['harmer', 'john', 'revenue', 'secretary']
	Predicted:	 ['bny', 'bradcrown', 'byblos', 'casa', 'cheyne', 'lancefield']
Ground truth:		 ['adams', 'bny', 'buchan', 'byblos', 'casa', 'cheyne', 'dhn', 'fg', 'gilford', 'macdonald', 'prest', 'salomon']
	Predicted:	 ['adams', 'belmont', 'city', 'dhn', 'fg', 'gencor', 'gilford', 'kaytech', 'macdonald', 'prest', 'rowntree', 'salomon', 'wluk']
Ground truth:		 ['bellgroup', 'bny', 'byblos', 'casa', 'cheyne', 'lancefield', 'paramount', 'sandell', 'windsor']
	Predicted:	 ['bny', 'byblos', 'casa', 'cheyne', 'eurofood', 'hlc', 'maxwell']
Ground truth:		 ['adams', 'city', 'dhn', 'fg', 'gencor', 'gilford', 'kaytech', 'macdonald', 'prest', 'salomon']
	Predicted:	 ['adams', 'city', 'dhn', 'fg', 'gencor', 'gilford', 'glam', 'kaytech', 'macdonald', 'prest', 'psv', 'salomon']
Ground truth:		 ['adams', 'bny', 'buchan', 'byblos', 'casa', 'cheyne', 'dhn', 'fg', 'gilford', 'macdonald', 'prest', 'salomon']
	Predicted:	 ['belmont', 'bny', 'bti', '

In [16]:
# Attach the per-question counts collected by the scoring loop as new columns,
# in the same order as before.
per_question_counts = {
    "cases_ground_truth": total_cases_gt,
    "cases_retrieved": total_cases_retrieved,
    "cases_correct": total_cases_correct,
}
for column_name, counts in per_question_counts.items():
    df[column_name] = counts

In [17]:
# Show the per-question counts side by side.
df.loc[:, ["question_no", "cases_ground_truth", "cases_retrieved", "cases_correct"]]

Unnamed: 0,question_no,cases_ground_truth,cases_retrieved,cases_correct
0,Q1,4,6,0
1,Q2,12,13,7
2,Q3,9,7,4
3,Q4,10,12,10
4,Q5,12,7,4
6,Q7,5,7,4
8,Q9,5,13,1
9,Q10,8,10,8
11,Q12,9,10,4


In [18]:
# Scores are pooled over all questions: the counts are summed first and the
# ratios are taken from the totals.
n_correct = df["cases_correct"].sum()
precision = n_correct / df["cases_retrieved"].sum()
recall = n_correct / df["cases_ground_truth"].sum()

print(f"Precision: {precision:.0%}")
print(f"Recall: {recall:.0%}")

Precision: 49%
Recall: 57%
