In [1]:
'''
Evaluate statute triage against gold standard
'''

import json
import os
import sys
import time
import traceback
import re
import pandas as pd
import requests

# Which split to evaluate; it selects both input files read below.
TRAIN_TEST = "train"
# When True, a later cell re-queries the bot and overwrites the saved
# `bot_statutes` column instead of using the values stored in the CSV.
IS_USE_LIVE_BOT = False


print(f"Dataset: {TRAIN_TEST}")

# Gold-standard statutes per question (tab-separated, UTF-8).
df_gold_standard = pd.read_csv(
    f"{TRAIN_TEST}_statute_gold_standard.csv",
    sep="\t",
    encoding="utf-8",
)

# Saved bot output for the same split (tab-separated, UTF-8).
df = pd.read_csv(
    f"output_{TRAIN_TEST}_insolvency_bot_with_gpt-4.csv",
    sep="\t",
    encoding="utf-8",
)

Dataset: train


In [2]:
if IS_USE_LIVE_BOT:
    # The bot module is imported from a relative directory, so extend the
    # import path before importing from it.
    sys.path.append("../insolvency/")
    from insolvency_bot import get_matching_sources, construct_prompt, embeddings_lookup

    bot_statutes = []
    for question in df.question_text:
        matching_sources = get_matching_sources(question)
        # The prompt is still built as in the saved run, although its result
        # is not used any further in this cell.
        dialogue_history = construct_prompt(question, matching_sources)
        # Store the retrieved statute names as one pipe-separated string.
        bot_statutes.append("|".join(matching_sources["human_readable_s"]))
    df["bot_statutes"] = bot_statutes

In [3]:
# Matches a standalone run of digits with an optional single capital letter on
# either side (e.g. "124", "A16", "251G"). Years such as "1986" match as well;
# normalise_statute_name skips those explicitly.
re_num = re.compile(r'\b[A-Z]?\d+[A-Z]?\b')

In [4]:
def normalise_statute_name(c):
    """Reduce a free-text statute citation to a short comparable key.

    Everything from the first opening bracket to the end of the line is
    discarded. The key is the first number found by ``re_num`` that is not
    one of the Act years ("2006", "1986"), followed by "CA" if the text
    mentions the Companies Act, else "IA" if it mentions the Insolvency Act,
    else no suffix.

    Parameters
    ----------
    c : str
        Citation text, e.g. "Insolvency Act 1986 section 124 (...)".

    Returns
    -------
    str
        The section key (e.g. "124IA"), or the bracket-stripped input when no
        usable number is present.
    """
    # Bracketed headings may themselves name another Act, so remove them
    # before looking for the Act name or the section number.
    c = re.sub(r'\(.+', '', c)
    lowered = c.lower()

    # Companies Act wins when both Acts are mentioned.
    if "companies" in lowered or "CA" in c:
        suffix = "CA"
    elif "insolvency" in lowered or "IA" in c:
        suffix = "IA"
    else:
        suffix = ""

    for candidate in re_num.findall(c):
        # The Act years are numbers too; they are never the section.
        if candidate not in ("2006", "1986"):
            return candidate + suffix
    return c

In [5]:
# Sanity check: the bracketed heading mentions "Companies", but it is removed
# before the Act is detected, so the key keeps the IA suffix.
normalise_statute_name('Insolvency Act 1986 section 124 (Winding Up of Companies Registered under the Companies Acts)')

'124IA'

In [6]:
# Lookup from gold-standard question number to its "statutes" cell
# (later split on "|" into individual citations).
question_to_key_statutes = dict(df_gold_standard.set_index("question_no")["statutes"])

In [7]:
# Keep only the questions that have a gold-standard entry. Take an explicit
# copy: later cells add columns to this frame, and assigning to a filtered
# slice can raise SettingWithCopyWarning.
df = df[df.question_no.isin(question_to_key_statutes)].copy()

In [8]:
from collections import Counter


def _normalised_statute_set(field):
    """Turn a pipe-separated statute cell into a set of normalised keys.

    A missing cell (pandas reads an empty CSV field as NaN, which is not a
    string) yields an empty set instead of raising on ``.split``. Entries
    that normalise to an empty or whitespace-only name are dropped, so an
    empty cell is not counted as one statute.
    """
    if not isinstance(field, str):
        return set()
    normalised = (normalise_statute_name(part) for part in field.split("|"))
    return {name for name in normalised if name.strip()}


# How often each normalised statute occurs in the ground truth.
ctr = Counter()

# Per-question counts, positionally aligned with the rows of `df`.
total_statutes_gt = [0] * len(df)
total_statutes_retrieved = [0] * len(df)
total_statutes_correct = [0] * len(df)

for idx in range(len(df)):
    q_no = df.question_no.iloc[idx]

    y_gt_normalised = _normalised_statute_set(question_to_key_statutes[q_no])
    y_pred_normalised = _normalised_statute_set(df.bot_statutes.iloc[idx])

    ctr.update(y_gt_normalised)

    print ("Ground truth:\t\t", sorted(y_gt_normalised))
    print ("\tPredicted:\t", sorted(y_pred_normalised))

    total_statutes_gt[idx] = len(y_gt_normalised)
    # A prediction is correct when its normalised key also appears in the gold set.
    total_statutes_correct[idx] = len(y_pred_normalised.intersection(y_gt_normalised))
    total_statutes_retrieved[idx] = len(y_pred_normalised)

Ground truth:		 ['154CA', '250CA']
	Predicted:	 ['123IA', '124IA', '154CA', '234IA', '371IA']
Ground truth:		 ['124', '166', '6IA']
	Predicted:	 ['123IA', '124IA', '154CA', '222IA', '43CA']
Ground truth:		 ['122IA', '123IA', '125IA', '143IA']
	Predicted:	 ['122IA', '123IA', '124IA', '95IA', 'A16IA']
Ground truth:		 ['171CA', '250IA']
	Predicted:	 ['1187CA', '123IA', '124IA', '15', '373IA']
Ground truth:		 ['122IA']
	Predicted:	 ['122IA', '123IA', '124IA', '251CA', 'A16IA']
Ground truth:		 ['122IA', '123IA', '125IA']
	Predicted:	 ['122IA', '123IA', '124IA', '3CA', '58CA']
Ground truth:		 ['171CA', '212IA', '238IA', '244IA', '423IA']
	Predicted:	 ['122IA', '123IA', '124IA', '154CA', '155CA']
Ground truth:		 ['171CA', '212IA', '213IA', '214IA', '238IA', '244IA', '423IA']
	Predicted:	 ['123IA', '124IA', '393CA', '414CA', '472CA']
Ground truth:		 ['122IA', '123IA', '171CA']
	Predicted:	 ['122IA', '123IA', '124IA', '251GIA', 'A18IA']


In [9]:
# Attach the per-question counts to the frame so they can be tabulated and
# summed; each list has one entry per row of `df`, in row order.
df["statutes_ground_truth"] = total_statutes_gt
df["statutes_retrieved"] = total_statutes_retrieved
df["statutes_correct"] = total_statutes_correct

In [10]:
# Per-question view: number of gold statutes, number retrieved, and their overlap.
df[["question_no","statutes_ground_truth", "statutes_retrieved", "statutes_correct" ]]

Unnamed: 0,question_no,statutes_ground_truth,statutes_retrieved,statutes_correct
0,Q1,2,5,1
1,Q2,3,5,0
2,Q3,4,5,2
3,Q4,2,5,0
4,Q5,1,5,1
6,Q7,3,5,2
8,Q9,5,5,0
9,Q10,7,5,0
11,Q12,3,5,2


In [11]:
# Micro-averaged scores: counts are pooled over all questions before dividing.
n_correct = df.statutes_correct.sum()
precision = n_correct / df.statutes_retrieved.sum()
recall = n_correct / df.statutes_ground_truth.sum()

print (f"Precision: {precision:.0%}")
print (f"Recall: {recall:.0%}")

Precision: 18%
Recall: 27%
