In [1]:
import sys
import os
import datasets
from dotenv import dotenv_values
from pathlib import Path
from scipy.special import softmax
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report, f1_score, fbeta_score, precision_score, accuracy_score, recall_score, confusion_matrix
# Seed NumPy's legacy global RNG so NumPy-based randomness in this notebook is repeatable.
np.random.seed(19950808)

# take environment variables from .env.
# The path is relative, so it is resolved against the notebook's working directory.
config = dotenv_values("./../../config/.env")
if config.get("BASE_PATH") is None:
    # Without this check a missing/incomplete .env only shows up as a bare KeyError('BASE_PATH').
    raise KeyError(
        "BASE_PATH is not defined in ./../../config/.env "
        "(check that the file exists and the notebook runs from its own folder)"
    )
base_path = Path(config["BASE_PATH"])
writing_path = base_path/"writing"/"MSc-Thesis-Emerging-Risks"
table_path = writing_path/"tables"  # destination of the generated LaTeX tables
data_path = base_path/"data"

# Make the project's code folder importable; guard against duplicate entries when the
# cell is re-run in the same kernel.
code_dir = str(base_path/"code")
if code_dir not in sys.path:
    sys.path.append(code_dir)

# Evaluate

## Baselines

In [2]:
# LLM outputs; the cohere_* columns of this frame are evaluated further down.
llm_eval_file = base_path/"data"/"evaluation"/"Eval-LLMs.pkl"
df_llm = pd.read_pickle(llm_eval_file)

In [4]:
# Ground-truth evaluation set plus boolean views of its two label columns.
gt = pd.read_pickle(base_path/"data"/"evaluation"/"GT.pkl")
gt_loss, gt_unexpected = (gt[label].astype(bool) for label in ("loss", "unexpected"))

In [23]:
# Header names used in the LaTeX table ("\\#" is written to the file as "\#").
column_name_mapping = {
    "paragraph_nr": "\\# Paragraphs",
    "n_words": "mean \\# words",
    "loss": "\\# loss",
    "unexpected": "\\# unexpected",
    "unexpected loss": "\\# unexpected loss",
}

# Copy of the ground truth with display-ready key columns and a combined label column.
dfff = (
    gt.rename(columns={"company": "Company", "year": "Year"})
    .assign(
        Company=lambda frame: frame["Company"].astype(str),
        Year=lambda frame: frame["Year"].astype(int),
        **{"unexpected loss": lambda frame: frame["unexpected"].astype(bool) & frame["loss"].astype(bool)},
    )
)

# One row per (Company, Year): paragraph count, mean word count and label totals.
aggregations = {
    "paragraph_nr": "count",
    "n_words": "mean",
    "loss": "sum",
    "unexpected": "sum",
    "unexpected loss": "sum",
}
stats_eval = (
    dfff.groupby(["Company", "Year"])
    .agg(aggregations)
    .rename(columns=column_name_mapping)
)

# Write the table to disk; to_latex returns None when given a path, so the cell shows no output.
stats_eval.style.format(precision=2).to_latex(
    table_path/"stats_eval.tex",
    position="H",
    caption="Descriptive statistics of evaluation data.",
    label="tab:stats_eval",
    environment="longtable",
    sparse_index=False,
    sparse_columns=False,
    hrules=True,
)

### KW-Baseline

In [7]:
# Keyword vocabularies for the keyword baseline. The baseline cell flags a paragraph
# when any of these phrases occurs in its text as a case-insensitive substring.
loss_labels = ["loss", "adverse development", "charge", "rising claims expenses", "burden"]

unexpected_labels = [
    "expected", "below expectations", "above expectations",
    "exceeded expectations", "anticipated", "predicted",
]

In [8]:
def _mentions_any(text, keywords):
    """Return True when at least one keyword occurs in `text` (case-insensitive substring match)."""
    lowered = text.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


# Keyword baseline: one boolean per ground-truth paragraph.
kw_loss = gt.text.apply(_mentions_any, args=(loss_labels,))
kw_unexpected = gt.text.apply(_mentions_any, args=(unexpected_labels,))

In [9]:
# Keyword baseline vs. ground truth: "unexpected" first, then "loss".
for y_true, y_keyword in ((gt_unexpected, kw_unexpected), (gt_loss, kw_loss)):
    print(classification_report(y_true, y_keyword))

              precision    recall  f1-score   support

       False       1.00      0.96      0.98      2745
        True       0.14      0.76      0.24        21

    accuracy                           0.96      2766
   macro avg       0.57      0.86      0.61      2766
weighted avg       0.99      0.96      0.98      2766

              precision    recall  f1-score   support

       False       0.97      0.84      0.90      2629
        True       0.16      0.55      0.24       137

    accuracy                           0.83      2766
   macro avg       0.56      0.70      0.57      2766
weighted avg       0.93      0.83      0.87      2766



### TF-IDF

In [10]:
# Training data for the TF-IDF baseline: only the rows flagged as labeled.
df_al = pd.read_pickle(base_path/"data/labeling/active-learning-iteration-2.pkl").loc[
    lambda frame: frame.labeled
]

# Word-level uni-/bi-gram TF-IDF features; very frequent and very rare terms are dropped.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1,2),
    lowercase=True,
    stop_words='english',
    min_df=0.03,
    max_df=0.9,
    use_idf=True,
    smooth_idf=True,
    norm="l2",
)

# Both targets use an identically configured SVM; each gets its own estimator instance.
svc_settings = dict(kernel="sigmoid", class_weight="balanced", C=100, gamma="auto")
cls_l = svm.SVC(**svc_settings)
cls_u = svm.SVC(**svc_settings)

# The vocabulary is fitted on the labeled training texts and reused for the evaluation texts.
X_train_counts = vectorizer.fit_transform(df_al.text.tolist())
X_test_counts = vectorizer.transform(gt.text.tolist())

# Fit one classifier per target and predict on the evaluation set.
tfidf_loss = cls_l.fit(X_train_counts, df_al.loss.tolist()).predict(X_test_counts)
tfidf_unexpected = cls_u.fit(X_train_counts, df_al.unexpected.tolist()).predict(X_test_counts)

### Cohere

In [11]:
# Cohere answers are compared to the literal string "True"; anything else counts as False.
# NOTE(review): the comparison is positional, so this assumes df_llm rows are in the
# same order as gt - confirm.
for y_true, answers in ((gt_unexpected, df_llm.cohere_unexpected), (gt_loss, df_llm.cohere_loss)):
    y_cohere = answers.apply(lambda answer: answer == "True")
    print(classification_report(y_true, y_cohere))

              precision    recall  f1-score   support

       False       0.99      0.93      0.96      2745
        True       0.01      0.10      0.02        21

    accuracy                           0.92      2766
   macro avg       0.50      0.51      0.49      2766
weighted avg       0.99      0.92      0.95      2766

              precision    recall  f1-score   support

       False       0.98      0.91      0.95      2629
        True       0.28      0.64      0.39       137

    accuracy                           0.90      2766
   macro avg       0.63      0.77      0.67      2766
weighted avg       0.94      0.90      0.92      2766



# Evaluating Different Models

In [12]:
# Boolean predictions per target and model: model_pred[target][model] holds one
# prediction per row of `gt`.
model_pred = {
    "loss": {},
    "unexpected": {},
    "unexpected_loss": {}
    }

data_folder_path = base_path/"data/evaluation"
id_cols = ["report_id", "paragraph_nr"]

for target in ["loss", "unexpected", "unexpected_loss"]:
    dataset_path = data_folder_path/target
    # os.listdir returns entries in arbitrary order; sorting makes the resulting
    # dict (and anything iterating over it) reproducible across machines.
    for dataset in sorted(os.listdir(dataset_path)):
        # Skip stray files and hidden folders (e.g. .ipynb_checkpoints), which
        # cannot be loaded as a saved dataset.
        if dataset.startswith(".") or not (dataset_path/dataset).is_dir():
            continue
        # The model name is everything after the first "-" (e.g. "eval-ze" -> "ze").
        model =  "-".join(dataset.split("-")[1:])
        data = datasets.load_from_disk(dataset_path/dataset)
        df = data.to_pandas()
        if model == "ze":
            # Zero-shot ensemble: mean over all per-member probability columns.
            df["p_ensemble"] = df[[col for col in df.columns if col[:2] == "p_"]].mean(axis=1)
            df = df[id_cols + ["p_ensemble"]]
        else:
            # Convert every logits column to a probability column first and only
            # then narrow the frame; narrowing inside the loop would drop logits
            # columns that have not been processed yet.
            # NOTE(review): takes softmax index 0 as the positive class - confirm
            # against the label order used at training time.
            prob_cols = []
            for col in [col for col in df.columns if col[:6] == "logits"]:
                new_col = "_".join(["p"] + col.split("_")[1:])
                df[new_col]= softmax(df[col].tolist(), axis=1)[:,0]
                prob_cols.append(new_col)
            if prob_cols:
                df = df[id_cols + prob_cols]
        # Align with the ground truth. Paragraphs without a model output are NaN
        # after the left merge and count as negative (NaN > 0.5 is False).
        merged = gt.merge(df, on=id_cols, how='left')
        for col in df.columns:
            if col[:2] == "p_":
                y_pred = (merged[col]>0.5).tolist()
                model_pred[target][model] =  y_pred
    # The baselines do not depend on the datasets on disk, so register them once
    # per target instead of once per dataset.
    if target == "loss":
        model_pred[target]["Keyword"] = kw_loss
        model_pred[target]["tf-idf"] = tfidf_loss.astype(bool)
    elif target == "unexpected":
        model_pred[target]["Keyword"] = kw_unexpected
        model_pred[target]["tf-idf"] = tfidf_unexpected.astype(bool)
    if target != "unexpected_loss":
        model_pred[target]["cohere"] = df_llm[f"cohere_{target}"].apply(lambda x: x == "True")
    

In [13]:
def compute_scores(y_true, y_pred):
    """Return the binary-classification metrics and confusion-matrix counts for one model.

    Parameters
    ----------
    y_true, y_pred : sequences of boolean-like labels of equal length.

    Returns
    -------
    dict with the keys F1, Precision, Recall, F2, Accuracy, TP, FP, FN, TN.
    """
    # Pinning the labels keeps the matrix 2x2 even when only one class occurs in
    # y_true and y_pred; otherwise the [1,1] lookups below would raise IndexError.
    conf_mat = confusion_matrix(y_true, y_pred, labels=[False, True])
    return {
        # zero_division=0 on every ratio metric: undefined scores are reported as 0
        # without emitting a warning, matching what precision already did.
        "F1": f1_score(y_true, y_pred, average="binary", zero_division=0),
        "Precision": precision_score(y_true, y_pred, average="binary", zero_division=0),
        "Recall": recall_score(y_true, y_pred, average="binary", zero_division=0),
        "F2": fbeta_score(y_true, y_pred, beta=2, average="binary", zero_division=0),
        "Accuracy": accuracy_score(y_true, y_pred),
        "TP": conf_mat[1,1],
        "FP": conf_mat[0,1],
        "FN": conf_mat[1,0],
        "TN": conf_mat[0,0]
    }


# scores[target][model] -> metric dict produced by compute_scores.
scores = {
    "loss": {},
    "unexpected": {},
    "unexpected_loss": {}
    }

for target in ["loss", "unexpected", "unexpected_loss"]:
    if target == "unexpected_loss":
        y_gt = (gt_unexpected & gt_loss).tolist()
        # First, combine each model's separate "loss" and "unexpected" predictions
        # with a logical AND.
        for key in model_pred["loss"]:
            y_pred = [x&y for x, y in zip(model_pred["loss"][key], model_pred["unexpected"][key])]
            scores[target][key] = compute_scores(y_gt, y_pred)
        # Then score the dedicated "unexpected_loss" predictions; for models that
        # have both, this entry replaces the AND-combined one from above.
        for key in model_pred["unexpected_loss"]:
            y_pred = model_pred[target][key]
            scores[target][key] = compute_scores(y_gt, y_pred)
    else:
        y_gt = gt[target].tolist()
        for key in model_pred[target]:
            y_pred = model_pred[target][key]
            scores[target][key] = compute_scores(y_gt, y_pred)

In [14]:
# Internal model keys -> display names used in the result tables.
# The insertion order doubles as the row order of the tables (see model_order).
model_map = dict([
    ("Keyword", "Keyword"),
    ("zediac-large", "ZeDiAc-large"),
    ("zediac-base", "ZeDiAc-base"),
    ("zedi-large", "ZeDi-large"),
    ("zedi-base", "ZeDi-base"),
    ("ze", "Zero-Shot"),
    ("ft-large", "DeBERTa-large"),
    ("ft-base", "DeBERTa-base"),
    ("cohere", "Cohere"),
    ("tf-idf", "tf-idf"),
])
model_order = list(model_map)

# Column order of the metric tables: ratio metrics first, then the raw counts.
score_order = [
    "F1", "F2", "Accuracy", "Precision", "Recall",
    "TP", "FP", "FN", "TN",
]

### Unexpected

In [15]:
# Result table for the "unexpected" target: one row per model (in model_order),
# one column per metric (in score_order).
ratio_cols = ["F1", "F2", "Accuracy", "Precision", "Recall"]
count_cols = ["TP", "TN", "FP", "FN"]

df_format = (
    pd.DataFrame(scores["unexpected"])[model_order]
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    # Bold the best value per column: highest for scores/TP/TN, lowest for FP/FN.
    .highlight_max(props="font-weight:bold;", axis=0, subset=ratio_cols + ["TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=ratio_cols)
    .format("{:.0f}", subset=count_cols)
)

df_format.to_latex(
    table_path/"model_eval_unexpected.tex",
    position="H",
    # Raw string: "\e" is not a valid escape sequence in a normal string literal.
    caption=r"Model performance for \emph{unexpected}.",
    label="tab:model_eval_unexpected",
    environment="longtable",
    convert_css=True,
    hrules=True
)
df_format

Unnamed: 0,F1,F2,Accuracy,Precision,Recall,TP,FP,FN,TN
Keyword,0.24,0.4,0.96,0.14,0.76,16,98,5,2647
ZeDiAc-large,0.76,0.84,1.0,0.66,0.9,19,10,2,2735
ZeDiAc-base,0.0,0.0,0.99,0.0,0.0,0,0,21,2745
ZeDi-large,0.26,0.46,0.96,0.15,0.95,20,113,1,2632
ZeDi-base,0.0,0.0,0.99,0.0,0.0,0,0,21,2745
Zero-Shot,0.35,0.31,0.99,0.46,0.29,6,7,15,2738
DeBERTa-large,0.0,0.0,0.99,0.0,0.0,0,0,21,2745
DeBERTa-base,0.0,0.0,0.99,0.0,0.0,0,0,21,2745
Cohere,0.02,0.04,0.92,0.01,0.1,2,193,19,2552
tf-idf,0.08,0.17,0.86,0.04,0.76,16,369,5,2376


### Loss

In [16]:
# Result table for the "loss" target: one row per model (in model_order),
# one column per metric (in score_order).
ratio_cols = ["F1", "F2", "Accuracy", "Precision", "Recall"]
count_cols = ["TP", "TN", "FP", "FN"]

df_format = (
    pd.DataFrame(scores["loss"])[model_order]
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    # Bold the best value per column: highest for scores/TP/TN, lowest for FP/FN.
    .highlight_max(props="font-weight:bold;", axis=0, subset=ratio_cols + ["TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=ratio_cols)
    .format("{:.0f}", subset=count_cols)
)

df_format.to_latex(
    table_path/"model_eval_loss.tex",
    position="H",
    # Raw string: "\e" is not a valid escape sequence in a normal string literal.
    caption=r"Model performance for \emph{loss}.",
    label="tab:model_eval_loss",
    environment="longtable",
    convert_css=True,
    hrules=True
)
df_format

Unnamed: 0,F1,F2,Accuracy,Precision,Recall,TP,FP,FN,TN
Keyword,0.24,0.37,0.83,0.16,0.55,76,410,61,2219
ZeDiAc-large,0.64,0.59,0.97,0.73,0.56,77,28,60,2601
ZeDiAc-base,0.59,0.51,0.97,0.81,0.47,64,15,73,2614
ZeDi-large,0.52,0.57,0.94,0.45,0.61,83,100,54,2529
ZeDi-base,0.52,0.56,0.95,0.47,0.59,81,91,56,2538
Zero-Shot,0.47,0.41,0.96,0.61,0.38,52,33,85,2596
DeBERTa-large,0.65,0.57,0.97,0.83,0.53,73,15,64,2614
DeBERTa-base,0.0,0.0,0.95,0.0,0.0,0,0,137,2629
Cohere,0.39,0.5,0.9,0.28,0.64,87,227,50,2402
tf-idf,0.37,0.46,0.9,0.27,0.56,77,206,60,2423


### Unexpected Loss

In [17]:
# Result table for the combined "unexpected & loss" target.
ratio_cols = ["F1", "F2", "Accuracy", "Precision", "Recall"]
count_cols = ["TP", "TN", "FP", "FN"]

scores_ul = pd.DataFrame(scores["unexpected_loss"])
# Use the same row order as the other result tables instead of the dict's insertion
# order (which follows the on-disk listing order). Models missing from model_order
# are kept and appended at the end, so no row is dropped.
row_order = [m for m in model_order if m in scores_ul.columns]
row_order += [m for m in scores_ul.columns if m not in model_order]

df_format = (
    scores_ul[row_order]
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    # Bold the best value per column: highest for scores/TP/TN, lowest for FP/FN.
    .highlight_max(props="font-weight:bold;", axis=0, subset=ratio_cols + ["TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=ratio_cols)
    .format("{:.0f}", subset=count_cols)
)

df_format.to_latex(
    table_path/"model_eval_unexpected_loss.tex",
    position="H",
    # Raw string: "\e" and "\&" are not valid escape sequences in a normal string literal.
    caption=r"Model performance for \emph{unexpected \& loss}.",
    label="tab:model_eval_unexpected_loss",
    environment="longtable",
    convert_css=True,
    hrules=True
)
df_format

Unnamed: 0,F1,F2,Accuracy,Precision,Recall,TP,FP,FN,TN
ZeDiAc-large,0.14,0.13,1.0,0.17,0.12,1,5,7,2753
Keyword,0.12,0.2,0.98,0.07,0.38,3,40,5,2718
tf-idf,0.06,0.11,0.98,0.03,0.25,2,57,6,2701
DeBERTa-large,0.0,0.0,1.0,0.0,0.0,0,0,8,2758
DeBERTa-base,0.0,0.0,1.0,0.0,0.0,0,0,8,2758
ZeDi-base,0.0,0.0,1.0,0.0,0.0,0,0,8,2758
ZeDiAc-base,0.0,0.0,1.0,0.0,0.0,0,0,8,2758
ZeDi-large,0.14,0.25,0.98,0.08,0.5,4,44,4,2714
Zero-Shot,0.06,0.08,0.99,0.04,0.12,1,26,7,2732
Cohere,0.02,0.04,0.96,0.01,0.12,1,97,7,2661


# Evaluating Zero-Shot performance of different Models, Targets, and Labels

In [18]:
# Scores of the individual zero-shot ensemble members, per target.
# Row keys look like "RoBERTa-T1-U1" (model, then two parts of the column name).
scores_zs = {target_name: {} for target_name in ("loss", "unexpected", "unexpected_loss")}

for target in ("loss", "unexpected"):
    # The ensemble row reuses the zero-shot ("ze") scores computed earlier.
    scores_zs[target]["Ensemble"] = scores[target]["ze"]

    data = datasets.load_from_disk(data_folder_path/target/"eval-ze")
    df = data.to_pandas()
    member_cols = [name for name in df.columns if name.startswith("p_")]
    df = gt.merge(
        df[member_cols + ["report_id", "paragraph_nr"]],
        on=["report_id", "paragraph_nr"],
        how="left",
    )

    # Threshold every member's probability at 0.5.
    y_pred = {name: (df[name] > 0.5).to_list() for name in df.columns if name.startswith("p_")}
    y_gt = df[target].to_list()

    for col, y in y_pred.items():
        parts = col.split("_")
        # NOTE(review): "D" is mapped to "RoBERTa" and everything else to "DeBERTa";
        # this looks inverted - confirm against the column naming scheme.
        model = "RoBERTa" if parts[2] == "D" else "DeBERTa"
        key = "-".join([model, parts[3], parts[4]])
        conf_mat = confusion_matrix(y_gt, y)
        scores_zs[target][key] = dict(
            F1=f1_score(y_gt, y, average="binary"),
            Precision=precision_score(y_gt, y, average="binary", zero_division=0),
            Recall=recall_score(y_gt, y, average="binary"),
            F2=fbeta_score(y_gt, y, beta=2, average="binary"),
            Accuracy=accuracy_score(y_gt, y),
            TP=conf_mat[1,1],
            FP=conf_mat[0,1],
            FN=conf_mat[1,0],
            TN=conf_mat[0,0],
        )

In [19]:
# Zero-shot table for the "unexpected" target: one row per ensemble member plus the ensemble.
ratio_cols = ["F1", "F2", "Accuracy", "Precision", "Recall"]
count_cols = ["TP", "TN", "FP", "FN"]

df_format = (
    pd.DataFrame(scores_zs["unexpected"])
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    # Bold the best value per column: highest for scores/TP/TN, lowest for FP/FN.
    .highlight_max(props="font-weight:bold;", axis=0, subset=ratio_cols + ["TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=ratio_cols)
    .format("{:.0f}", subset=count_cols)
)

df_format.to_latex(
    table_path/"model_eval_zero_shot_unexpected.tex",
    position="H",
    # Raw string: "\e" is not a valid escape sequence in a normal string literal.
    caption=r"Zero-shot model performance for \emph{unexpected}.",
    label="tab:model_eval_zero_shot_unexpected",
    environment="longtable",
    convert_css=True,
    hrules=True
)
df_format

Unnamed: 0,F1,F2,Accuracy,Precision,Recall,TP,FP,FN,TN
Ensemble,0.35,0.31,0.99,0.46,0.29,6,7,15,2738
RoBERTa-T1-U1,0.15,0.11,0.99,0.4,0.1,2,3,19,2742
RoBERTa-T1-U2,0.09,0.06,0.99,1.0,0.05,1,0,20,2745
RoBERTa-T1-U3,0.22,0.4,0.95,0.13,0.86,18,124,3,2621
RoBERTa-T1-U4,0.19,0.19,0.99,0.19,0.19,4,17,17,2728
RoBERTa-T1-U5,0.09,0.06,0.99,0.5,0.05,1,1,20,2744
RoBERTa-T2-U1,0.19,0.32,0.96,0.12,0.57,12,91,9,2654
RoBERTa-T2-U2,0.18,0.15,0.99,0.23,0.14,3,10,18,2735
RoBERTa-T2-U3,0.05,0.11,0.75,0.03,0.86,18,701,3,2044
RoBERTa-T2-U4,0.14,0.26,0.94,0.08,0.62,13,157,8,2588


In [20]:
# Zero-shot table for the "loss" target: one row per ensemble member plus the ensemble.
ratio_cols = ["F1", "F2", "Accuracy", "Precision", "Recall"]
count_cols = ["TP", "TN", "FP", "FN"]

df_format = (
    pd.DataFrame(scores_zs["loss"])
    .rename(columns=model_map)
    .T.round(2)[score_order]
    .style
    # Bold the best value per column: highest for scores/TP/TN, lowest for FP/FN.
    .highlight_max(props="font-weight:bold;", axis=0, subset=ratio_cols + ["TP", "TN"])
    .highlight_min(props="font-weight:bold;", axis=0, subset=["FP", "FN"])
    .format("{:.2f}", subset=ratio_cols)
    .format("{:.0f}", subset=count_cols)
)

df_format.to_latex(
    table_path/"model_eval_zero_shot_loss.tex",
    position="H",
    # Raw string: "\e" is not a valid escape sequence in a normal string literal.
    caption=r"Zero-shot model performance for \emph{loss}.",
    label="tab:model_eval_zero_shot_loss",
    environment="longtable",
    convert_css=True,
    hrules=True
)
df_format

Unnamed: 0,F1,F2,Accuracy,Precision,Recall,TP,FP,FN,TN
Ensemble,0.47,0.41,0.96,0.61,0.38,52,33,85,2596
RoBERTa-T1-L1,0.25,0.19,0.95,0.52,0.17,23,21,114,2608
RoBERTa-T1-L2,0.43,0.41,0.95,0.48,0.39,54,58,83,2571
RoBERTa-T1-L3,0.25,0.22,0.94,0.32,0.2,28,59,109,2570
RoBERTa-T1-L4,0.43,0.47,0.93,0.36,0.51,70,122,67,2507
RoBERTa-T2-L1,0.28,0.23,0.95,0.46,0.2,28,33,109,2596
RoBERTa-T2-L2,0.35,0.45,0.9,0.25,0.56,77,229,60,2400
RoBERTa-T2-L3,0.35,0.52,0.86,0.23,0.76,104,354,33,2275
RoBERTa-T2-L4,0.34,0.44,0.9,0.25,0.55,75,225,62,2404
DeBERTa-T1-L1,0.17,0.12,0.95,0.54,0.1,14,12,123,2617
