In [37]:
import sys
import os
import datasets
from dotenv import dotenv_values
from pathlib import Path
from scipy.special import softmax
import pandas as pd
import numpy as np
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
# Seed NumPy's global random number generator.
np.random.seed(19950808)

# Read the project settings from the .env file (path is relative to the
# notebook's working directory).
config = dotenv_values("./../../config/.env")
# Root directory of the project, taken from the BASE_PATH entry of the .env file.
base_path = Path(config["BASE_PATH"])
# Directory that receives the generated LaTeX tables.
writing_path = base_path/"writing"/"MSc-Thesis-Emerging-Risks"
table_path = writing_path/"tables"
# Put <base_path>/code on the module search path.
sys.path.append(str(base_path/"code"))

# KW Baseline

In [2]:
# Load the active-learning frame and keep only the rows flagged as labeled.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
al_path = base_path/"data/labeling/active-learning-iteration-2.pkl"
df_al = pd.read_pickle(al_path)
df_al = df_al[df_al["labeled"]]

# Boolean target vectors used by the keyword search below.
loss = df_al["loss"].astype(bool)
unexpected = df_al["unexpected"].astype(bool)

In [3]:
from itertools import chain, combinations

def powerset(iterable):
    """
    Lazily enumerate every subset of ``iterable`` as a tuple.

    Subsets come out ordered by size (the empty tuple first, the full set
    last); within one size they follow ``itertools.combinations`` order.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)

    Returns an iterator, not a list: there are 2**n subsets for n items.
    """
    items = tuple(iterable)
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)

## Inspecting the true labeled paragraphs

In [None]:
# Show the text of every paragraph labeled as loss.
df_al.loc[df_al["loss"] == 1, "text"].tolist()

In [None]:
# Show the text of every paragraph labeled as unexpected.
df_al.loc[df_al["unexpected"] == 1, "text"].tolist()

## Loss

In [95]:
# Candidate keywords for the *loss* keyword baseline. A paragraph is predicted
# positive when it contains at least one keyword (case-insensitive substring).
#
# Every entry ends with a comma on purpose: Python silently concatenates
# adjacent string literals, so a missing comma fuses two keywords into a single
# phrase that never matches (previously "adverse influence" + "higher claims"
# and "higher cost" + "costs increased"). "impariment" was a typo for
# "impairment".
loss_labels = [
    "loss",
    "adverse development",
    "adverse effect",
    "adverse impact",
    "adverse influence",
    "higher claims",
    "higher loss",
    "higher cost",
    "costs increased",
    "impairment",
    "charge",
    "rising claims expenses",
    "negatively impacted",
    "burden",
]

In [None]:
# Exhaustive search: score every subset of `loss_labels` as a keyword classifier
# for the loss target. Columns of `l_scores`: F1, F2, precision, recall, ROC-AUC.
l_labels = list(powerset(loss_labels))
l_scores = np.zeros((len(l_labels), 5))

# Matching a keyword against the paragraphs does not depend on the subset, so
# each keyword is matched once up front; a subset's prediction is then the
# element-wise OR of its keywords' masks (the empty subset predicts all False).
texts_lower = df_al.text.apply(lambda x: x.lower())
keyword_hits = {
    y: texts_lower.apply(lambda x, y=y: y.lower() in x).to_numpy(dtype=bool)
    for y in set(loss_labels)
}

for i, labels in enumerate(l_labels):
    mask = np.zeros(len(df_al), dtype=bool)
    for y in labels:
        mask |= keyword_hits[y]
    kw = pd.Series(mask, index=df_al.index)
    l_scores[i] = (f1_score(loss, kw), fbeta_score(loss, kw, beta=2), precision_score(loss, kw, zero_division=0), recall_score(loss, kw), roc_auc_score(loss, kw))

In [96]:
# Print a report for the subset that wins each metric column of `l_scores`;
# the set collapses subsets that win more than one column.
best_rows = set(l_scores.argmax(axis=0))
for index in best_rows:
    chosen = l_labels[index]
    print(", ".join(chosen))
    print(" ")

    # Re-create the keyword prediction for this subset.
    kw = df_al.text.apply(lambda text: any(word.lower() in text.lower() for word in chosen))

    print("F1:", round(f1_score(loss, kw), 3))
    print("F2:", round(fbeta_score(loss, kw, beta=2), 3))

    # confusion_matrix rows are the true class, columns the predicted class.
    conf_mat = confusion_matrix(loss, kw)
    for cell_name, (r, c) in (("TP", (1, 1)), ("FP", (0, 1)), ("FN", (1, 0)), ("TN", (0, 0))):
        print(f"{cell_name}:", conf_mat[r, c])

    print(classification_report(loss, kw))
    print("#" * 40)
    print(" ")

rising claims expenses
 
F1: 0.004
F2: 0.003
TP: 1
FP: 0
FN: 450
TN: 1049
              precision    recall  f1-score   support

       False       0.70      1.00      0.82      1049
        True       1.00      0.00      0.00       451

    accuracy                           0.70      1500
   macro avg       0.85      0.50      0.41      1500
weighted avg       0.79      0.70      0.58      1500

########################################
 
loss, adverse development, adverse effect, charge, rising claims expenses, burden
 
F1: 0.561
F2: 0.66
TP: 338
FP: 417
FN: 113
TN: 632
              precision    recall  f1-score   support

       False       0.85      0.60      0.70      1049
        True       0.45      0.75      0.56       451

    accuracy                           0.65      1500
   macro avg       0.65      0.68      0.63      1500
weighted avg       0.73      0.65      0.66      1500

########################################
 
loss, adverse development, charge, rising claims ex

### Choice

In [4]:
# Final keyword list for the loss baseline; the evaluation cells further down
# read `loss_labels` from here.
loss_labels = [
    "loss",
    "adverse development",
    "charge",
    "rising claims expenses",
    "burden"
]

## Unexpected

### Initial List

In [None]:
# Full candidate keyword list for the *unexpected* keyword baseline.
#
# Fixes relative to the previous version of this list:
# - missing commas made Python concatenate "lower than expected",
#   "higher than expected ", "more than expected" and "below expectations"
#   into one phrase that never matches;
# - "more than expected" was listed twice (a duplicate only inflates the
#   power set);
# - "higher than expected " carried a trailing space, so it could not match
#   the phrase when followed by punctuation.
unexpected_labels = [
    "unexpected",
    "surprising",
    "surprised",
    "surpris",
    "not expected",
    "expected",
    "more than expected",
    "less than expected",
    "lower than expected",
    "higher than expected",
    "below expectations",
    "above expectations",
    "exceed expectations",
    "exceeded expectations",
    "not meet expectations",
    "not according to expectations",
    "not as expected",
    "estimated",
    "anticipated",
    "predicted",
]

### First Subset
The full list is too large to evaluate every combination (the power set grows exponentially), so I evaluated different subsets and refined them iteratively.

In [108]:
# First subset of the candidate keywords for *unexpected*.
#
# Every entry ends with a comma: the previous version missed three commas, so
# "lower than expected", "higher than expected ", "more than expected" and
# "below expectations" were concatenated into one never-matching phrase. The
# duplicate "more than expected" and the trailing space in
# "higher than expected " are removed as well.
unexpected_labels = [
    "unexpected",
    "surprising",
    "surprised",
    "surpris",
    "not expected",
    "expected",
    "more than expected",
    "less than expected",
    "lower than expected",
    "higher than expected",
    "below expectations",
    "above expectations",
    "exceed expectations",
    "exceeded expectations",
]

### Second Subset

In [112]:
# Second subset of the candidate keywords for *unexpected*.
#
# The comma after "expectation" was missing, so Python fused it with the next
# literal into "expectationsurpris", which never matches and dropped both
# intended keywords from the search.
unexpected_labels = [
    "unexpect",
    "expectation",
    "surpris",
    "expected",
    "below expectations",
    "above expectations",
    "exceed expectations",
    "exceeded expectations",
    "estimated",
    "anticipated",
    "predicted",
]

### Third Subset

In [None]:
# Third subset of candidate keywords for the unexpected target. In top-to-bottom
# order this is the assignment the subset search in the next cell reads.
unexpected_labels = [
    "expected",
    "surprised",
    "below expectations",
    "above expectations",
    "exceeded expectations",
    "anticipated",
    "predicted",
    "not meet expectations",
    "not according to expectations",
    "not as expected"
]

In [117]:
# Exhaustive search: score every subset of `unexpected_labels` as a keyword
# classifier for the unexpected target.
# Columns of `u_scores`: F1, F2, precision, recall, ROC-AUC.
u_labels = list(powerset(unexpected_labels))
u_scores = np.zeros((len(u_labels), 5))

# Matching a keyword against the paragraphs does not depend on the subset, so
# each keyword is matched once up front; a subset's prediction is then the
# element-wise OR of its keywords' masks (the empty subset predicts all False).
texts_lower = df_al.text.apply(lambda x: x.lower())
keyword_hits = {
    y: texts_lower.apply(lambda x, y=y: y.lower() in x).to_numpy(dtype=bool)
    for y in set(unexpected_labels)
}

for i, labels in enumerate(u_labels):
    mask = np.zeros(len(df_al), dtype=bool)
    for y in labels:
        mask |= keyword_hits[y]
    kw = pd.Series(mask, index=df_al.index)
    u_scores[i] = (f1_score(unexpected, kw), fbeta_score(unexpected, kw, beta=2), precision_score(unexpected, kw, zero_division=0), recall_score(unexpected, kw), roc_auc_score(unexpected, kw))

In [118]:
# Print a report for the subset that wins each metric column of `u_scores`;
# the set collapses subsets that win more than one column.
best_rows = set(u_scores.argmax(axis=0))
for index in best_rows:
    chosen = u_labels[index]
    print(", ".join(chosen))
    print(" ")

    # Re-create the keyword prediction for this subset.
    kw = df_al.text.apply(lambda text: any(word.lower() in text.lower() for word in chosen))

    print("F1:", round(f1_score(unexpected, kw), 3))
    print("F2:", round(fbeta_score(unexpected, kw, beta=2), 3))

    # confusion_matrix rows are the true class, columns the predicted class.
    conf_mat = confusion_matrix(unexpected, kw)
    for cell_name, (r, c) in (("TP", (1, 1)), ("FP", (0, 1)), ("FN", (1, 0)), ("TN", (0, 0))):
        print(f"{cell_name}:", conf_mat[r, c])

    print(classification_report(unexpected, kw))
    print("#" * 40)
    print(" ")

expected, below expectations, above expectations, exceeded expectations, anticipated
 
F1: 0.372
F2: 0.474
TP: 106
FP: 281
FN: 77
TN: 1036
              precision    recall  f1-score   support

       False       0.93      0.79      0.85      1317
        True       0.27      0.58      0.37       183

    accuracy                           0.76      1500
   macro avg       0.60      0.68      0.61      1500
weighted avg       0.85      0.76      0.79      1500

########################################
 
expected, below expectations, above expectations, exceeded expectations, anticipated, predicted
 
F1: 0.37
F2: 0.475
TP: 107
FP: 288
FN: 76
TN: 1029
              precision    recall  f1-score   support

       False       0.93      0.78      0.85      1317
        True       0.27      0.58      0.37       183

    accuracy                           0.76      1500
   macro avg       0.60      0.68      0.61      1500
weighted avg       0.85      0.76      0.79      1500

###############

### Choice

In [5]:
# Final keyword list for the unexpected baseline; the evaluation cells further
# down read `unexpected_labels` from here.
unexpected_labels = [
    "expected",
    "below expectations",
    "above expectations",
    "exceeded expectations",
    "anticipated",
    "predicted"
]

# Evaluate

In [6]:
# LLM predictions; the cells below read its cohere_loss / cohere_unexpected columns.
# NOTE(review): these columns are scored against `gt` without a join, which
# presumably relies on both frames having the same row order - confirm.
df_llm = pd.read_pickle(base_path/"data/labeling/Eval-LLMs.pkl")

In [7]:
# Ground truth: keep only rows that are labeled and whose strategy is "sequential".
gt = pd.read_pickle(base_path/"data/labeling/GT.pkl")
is_labeled = gt["labeled"] == True  # explicit comparison kept from the original filter
is_sequential = gt["strategy"] == "sequential"
gt = gt[is_labeled & is_sequential]

# Boolean target vectors for the evaluation.
gt_loss = gt["loss"].astype(bool)
gt_unexpected = gt["unexpected"].astype(bool)

In [8]:
# Keyword baseline on the ground-truth paragraphs: a paragraph is positive when
# it contains at least one keyword (case-insensitive substring match).
kw_loss = gt.text.apply(
    lambda paragraph: any(word.lower() in paragraph.lower() for word in loss_labels)
)
kw_unexpected = gt.text.apply(
    lambda paragraph: any(word.lower() in paragraph.lower() for word in unexpected_labels)
)

In [125]:
# Keyword-baseline reports, unexpected target first, then loss.
for y_true, y_keyword in ((gt_unexpected, kw_unexpected), (gt_loss, kw_loss)):
    print(classification_report(y_true, y_keyword))

              precision    recall  f1-score   support

       False       1.00      0.96      0.98      2745
        True       0.14      0.76      0.24        21

    accuracy                           0.96      2766
   macro avg       0.57      0.86      0.61      2766
weighted avg       0.99      0.96      0.98      2766

              precision    recall  f1-score   support

       False       0.97      0.84      0.90      2629
        True       0.16      0.55      0.24       137

    accuracy                           0.83      2766
   macro avg       0.56      0.70      0.57      2766
weighted avg       0.93      0.83      0.87      2766



In [10]:
# A Cohere answer counts as positive only when it equals the string "True".
# NOTE(review): scored without a join, so this presumably relies on df_llm and
# gt having the same row order - confirm.
cohere_unexpected = df_llm.cohere_unexpected.apply(lambda answer: answer == "True")
cohere_loss = df_llm.cohere_loss.apply(lambda answer: answer == "True")

print(classification_report(gt_unexpected, cohere_unexpected))
print(classification_report(gt_loss, cohere_loss))

              precision    recall  f1-score   support

       False       0.99      0.93      0.96      2745
        True       0.01      0.10      0.02        21

    accuracy                           0.92      2766
   macro avg       0.50      0.51      0.49      2766
weighted avg       0.99      0.92      0.95      2766

              precision    recall  f1-score   support

       False       0.98      0.91      0.95      2629
        True       0.28      0.64      0.39       137

    accuracy                           0.90      2766
   macro avg       0.63      0.77      0.67      2766
weighted avg       0.94      0.90      0.92      2766



# Evaluating Different Models

In [63]:
# Collect binary predictions for every evaluated model as
# model_pred[target][model], aligned to the rows of `gt`.
model_pred = {
    "loss": {},
    "unexpected": {},
    "unexpected_loss": {}
    }

data_folder_path = base_path/"data/evaluation"
for target in ["loss", "unexpected", "unexpected_loss"]:
    dataset_path = data_folder_path/target

    # The keyword baseline does not depend on the dataset folder, so it is
    # computed once per target instead of once per dataset.
    if target == "loss":
        keyword_pred = gt.text.apply(lambda x: any([y.lower() in x.lower() for y in loss_labels]))
    elif target == "unexpected":
        keyword_pred = gt.text.apply(lambda x: any([y.lower() in x.lower() for y in unexpected_labels]))
    else:
        keyword_pred = None

    for dataset in os.listdir(dataset_path):
        # The model key is everything after the first "-" of the folder name
        # (e.g. "eval-ze" -> "ze").
        model =  "-".join(dataset.split("-")[1:])
        data = datasets.load_from_disk(dataset_path/dataset)
        df = data.to_pandas()
        if model == "ze":
            # Zero-shot ensemble: average all per-model probability columns.
            df["p_ensemble"] = df[[col for col in df.columns if col[:2] == "p_"]].mean(axis=1)
            df = df[["report_id", "paragraph_nr", "p_ensemble"]]
        else:
            # Turn every logits column into a probability column. The frame is
            # narrowed only after all columns are converted; narrowing inside
            # the loop dropped the remaining logits columns and raised KeyError
            # as soon as a dataset carried more than one.
            prob_cols = []
            for col in [c for c in df.columns if c[:6] == "logits"]:
                new_col = "_".join(["p"] + col.split("_")[1:])
                # NOTE(review): index 0 of the softmax is used as the positive
                # class probability - confirm against the label order.
                df[new_col]= softmax(df[col].tolist(), axis=1)[:,0]
                prob_cols.append(new_col)
            if prob_cols:
                df = df[["report_id", "paragraph_nr"] + prob_cols]
        for col in df.columns:
            if col[:2] == "p_":
                # Left join keeps one row per `gt` row; a paragraph without a
                # prediction yields NaN, and NaN > 0.5 evaluates to False.
                y_pred = (gt.merge(df, on=["report_id", "paragraph_nr"], how='left')[col]>0.5).tolist()
                # With several probability columns the last one wins.
                model_pred[target][model] =  y_pred
        # Assigned inside the loop on purpose: this keeps the dictionary's
        # insertion order (and therefore the row order of tables built from
        # it) unchanged.
        if keyword_pred is not None:
            model_pred[target]["Keyword"] = keyword_pred
    if target != "unexpected_loss":
        model_pred[target]["cohere"] = df_llm[f"cohere_{target}"].apply(lambda x: x == "True")
    

In [64]:
def _binary_scores(y_true, y_hat):
    """Return F1, precision, recall, F2, ROC-AUC and the confusion-matrix counts
    for one vector of binary predictions ``y_hat`` against ``y_true``.

    ROC-AUC is computed on the hard predictions, not on probabilities.
    """
    conf_mat = confusion_matrix(y_true, y_hat)
    return {
        "F1": f1_score(y_true, y_hat, average="binary"),
        "Precision": precision_score(y_true, y_hat, average="binary", zero_division=0),
        "Recall": recall_score(y_true, y_hat, average="binary"),
        "F2": fbeta_score(y_true, y_hat, beta=2, average="binary"),
        "AUC-ROC": roc_auc_score(y_true, y_hat),
        # confusion_matrix rows are the true class, columns the predicted class.
        "TP": conf_mat[1,1],
        "FP": conf_mat[0,1],
        "FN": conf_mat[1,0],
        "TN": conf_mat[0,0]
    }


# scores[target][model] -> metric dict, for every target and model.
scores = {
    "loss": {},
    "unexpected": {},
    "unexpected_loss": {}
    }

for target in ["loss", "unexpected", "unexpected_loss"]:
    if target == "unexpected_loss":
        y_gt = (gt_unexpected & gt_loss).tolist()
        # Combined prediction: positive only when the model's loss AND
        # unexpected predictions are both positive.
        for key in model_pred["loss"]:
            y_pred = [x&y for x, y in zip(model_pred["loss"][key], model_pred["unexpected"][key])]
            scores[target][key] = _binary_scores(y_gt, y_pred)
        # Models with their own "unexpected_loss" predictions are scored
        # second, so a key that also exists under "loss" replaces the combined
        # entry written above. NOTE(review): confirm this overwrite is intended.
        for key in model_pred["unexpected_loss"]:
            y_pred = model_pred[target][key]
            scores[target][key] = _binary_scores(y_gt, y_pred)
    else:
        y_gt = gt[target].tolist()
        for key in model_pred[target]:
            y_pred = model_pred[target][key]
            scores[target][key] = _binary_scores(y_gt, y_pred)

In [77]:
# Internal model key -> display name used in the exported tables.
model_map = {
    "Keyword": "Keyword",
    "zediac-large": "ZeDiAc-large",
    "zediac-base": "ZeDiAc-base",
    "zedi-large": "ZeDi-large",
    "zedi-base": "ZeDi-base",
    "ze": "Zero-Shot",
    "ft-large": "DeBERTa-large",
    "ft-base": "DeBERTa-base",
    "cohere": "Cohere",
}

# Rows follow the order of `model_map`.
model_order = list(model_map)

# Column order of the tables: rate metrics first, then confusion-matrix counts.
score_order = ["F1", "F2", "AUC-ROC", "Precision", "Recall"] + ["TP", "FP", "FN", "TN"]

### Unexpected

In [78]:
# Export the model comparison for the unexpected target as a LaTeX longtable.
# The best value per column is set in bold: the maximum for the rate metrics
# and TP/TN, the minimum for FP/FN.
# The caption is a raw string: "\e" is an invalid escape sequence in a normal
# string literal (the resulting text is unchanged).
(
    pd.DataFrame(scores["unexpected"])[model_order]
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    .highlight_max(props="font-weight:bold;", axis=0, subset=["F1", "F2", "AUC-ROC", "Precision", "Recall", "TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=["F1", "F2", "AUC-ROC", "Precision", "Recall"])
    .format("{:.0f}", ["TP", "TN", "FP", "FN"])
    .to_latex(
        table_path/"model_eval_unexpected.tex",
        position="H",
        caption=r"Model performance for \emph{unexpected}.",
        label="tab:model_eval_unexpected",
        environment="longtable",
        convert_css=True,
        hrules=True
    )
)

### Loss

In [79]:
# Export the model comparison for the loss target as a LaTeX longtable.
# The best value per column is set in bold: the maximum for the rate metrics
# and TP/TN, the minimum for FP/FN.
# The caption is a raw string: "\e" is an invalid escape sequence in a normal
# string literal (the resulting text is unchanged).
(
    pd.DataFrame(scores["loss"])[model_order]
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    .highlight_max(props="font-weight:bold;", axis=0, subset=["F1", "F2", "AUC-ROC", "Precision", "Recall", "TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=["F1", "F2", "AUC-ROC", "Precision", "Recall"])
    .format("{:.0f}", ["TP", "TN", "FP", "FN"])
    .to_latex(
        table_path/"model_eval_loss.tex",
        position="H",
        caption=r"Model performance for \emph{loss}.",
        label="tab:model_eval_loss",
        environment="longtable",
        convert_css=True,
        hrules=True
    )
)

### Unexpected Loss

In [80]:
# Export the model comparison for the combined unexpected & loss target as a
# LaTeX longtable. Rows keep the insertion order of scores["unexpected_loss"]
# (no reordering by `model_order` here).
# The caption is a raw string: "\e" and "\&" are invalid escape sequences in a
# normal string literal (the resulting text is unchanged).
(
    pd.DataFrame(scores["unexpected_loss"])
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    .highlight_max(props="font-weight:bold;", axis=0, subset=["F1", "F2", "AUC-ROC", "Precision", "Recall", "TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=["F1", "F2", "AUC-ROC", "Precision", "Recall"])
    .format("{:.0f}", ["TP", "TN", "FP", "FN"])
    .to_latex(
        table_path/"model_eval_unexpected_loss.tex",
        position="H",
        caption=r"Model performance for \emph{unexpected \& loss}.",
        label="tab:model_eval_unexpected_loss",
        environment="longtable",
        convert_css=True,
        hrules=True
    )
)

# Evaluating Zero-Shot performance of different Models, Targets, and Labels

In [70]:
# Scores of the individual zero-shot models, next to the ensemble score that
# is copied over from `scores`.
scores_zs = {name: {} for name in ("loss", "unexpected", "unexpected_loss")}

for target in ("loss", "unexpected"):
    scores_zs[target]["Ensemble"] = scores[target]["ze"]

    data = datasets.load_from_disk(data_folder_path/target/"eval-ze")
    df = data.to_pandas()
    keep_cols = [c for c in df.columns if c.startswith("p_")] + ["report_id", "paragraph_nr"]
    # Left join onto the ground truth so every `gt` row is kept.
    df = gt.merge(df[keep_cols], on=["report_id", "paragraph_nr"], how="left")

    # Threshold every probability column at 0.5.
    y_pred = {c: (df[c] > 0.5).to_list() for c in df.columns if c.startswith("p_")}
    y_gt = df[target].to_list()

    for col, y in y_pred.items():
        parts = col.split("_")
        # NOTE(review): the tag "D" is mapped to "RoBERTa" and everything else
        # to "DeBERTa"; this looks inverted - confirm against the column naming.
        model = "DeBERTa" if parts[2] != "D" else "RoBERTa"
        key = "-".join([model, parts[3], parts[4]])
        conf_mat = confusion_matrix(y_gt, y)
        scores_zs[target][key] = {
            "F1": f1_score(y_gt, y, average="binary"),
            "Precision": precision_score(y_gt, y, average="binary", zero_division=0),
            "Recall": recall_score(y_gt, y, average="binary"),
            "F2": fbeta_score(y_gt, y, beta=2, average="binary"),
            "AUC-ROC": roc_auc_score(y_gt, y),
            "TP": conf_mat[1, 1],
            "FP": conf_mat[0, 1],
            "FN": conf_mat[1, 0],
            "TN": conf_mat[0, 0],
        }

In [72]:
# Export the zero-shot comparison for the unexpected target as a LaTeX
# longtable. The best value per column is set in bold: the maximum for the
# rate metrics and TP/TN, the minimum for FP/FN.
# The caption is a raw string: "\e" is an invalid escape sequence in a normal
# string literal (the resulting text is unchanged).
(
    pd.DataFrame(scores_zs["unexpected"])
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    .highlight_max(props="font-weight:bold;", axis=0, subset=["F1", "F2", "AUC-ROC", "Precision", "Recall", "TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=["F1", "F2", "AUC-ROC", "Precision", "Recall"])
    .format("{:.0f}", ["TP", "TN", "FP", "FN"])
    .to_latex(
        table_path/"model_eval_zero_shot_unexpected.tex",
        position="H",
        caption=r"Zero-shot model performance for \emph{unexpected}.",
        label="tab:model_eval_zero_shot_unexpected",
        environment="longtable",
        convert_css=True,
        hrules=True
    )
)

In [73]:
# Export the zero-shot comparison for the loss target as a LaTeX longtable.
# The best value per column is set in bold: the maximum for the rate metrics
# and TP/TN, the minimum for FP/FN.
# The caption is a raw string: "\e" is an invalid escape sequence in a normal
# string literal (the resulting text is unchanged).
(
    pd.DataFrame(scores_zs["loss"])
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    .highlight_max(props="font-weight:bold;", axis=0, subset=["F1", "F2", "AUC-ROC", "Precision", "Recall", "TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=["F1", "F2", "AUC-ROC", "Precision", "Recall"])
    .format("{:.0f}", ["TP", "TN", "FP", "FN"])
    .to_latex(
        table_path/"model_eval_zero_shot_loss.tex",
        position="H",
        caption=r"Zero-shot model performance for \emph{loss}.",
        label="tab:model_eval_zero_shot_loss",
        environment="longtable",
        convert_css=True,
        hrules=True
    )
)