In [1]:
import sys
from dotenv import dotenv_values
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, f1_score, fbeta_score, precision_score, recall_score, confusion_matrix
np.random.seed(19950808)

# take environment variables from .env.
config = dotenv_values("./../../config/.env")
base_path = Path(config["BASE_PATH"])
writing_path = base_path/"writing"/"MSc-Thesis-Emerging-Risks"
table_path = writing_path/"tables"
sys.path.append(str(base_path/"code"))

In [2]:
from itertools import chain, combinations

def powerset(iterable):
    """
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    xs = list(iterable)
    # note we return an iterator rather than a list
    return chain.from_iterable(combinations(xs,n) for n in range(len(xs)+1))

In [3]:
df_al = pd.read_pickle(base_path/"data/labeling/active-learning-iteration-2.pkl")
df_al = df_al[df_al.labeled]
loss = df_al.loss.astype(bool)
unexpected = df_al.unexpected.astype(bool)

## Inspecting the true labeled paragraphs

In [None]:
df_al[df_al.loss == 1].text.tolist()

In [None]:
df_al[df_al.unexpected == 1].text.tolist()

## Loss

In [4]:
loss_labels = [
    "loss",
    "adverse development",
    "adverse effect",
    "adverse impact",
    "adverse influence"
    "higher claims",
    "higher loss",
    "higher cost"
    "costs increased",
    "impariment",
    "charge",
    "rising claims expenses",
    "negatively impacted",
    "burden"
]

In [6]:
l_labels = list(powerset(loss_labels))
l_scores = np.zeros((len(l_labels), 4))
for i, labels in enumerate(l_labels):
    kw = df_al.text.apply(lambda x: any([y.lower() in x.lower() for y in labels]))
    l_scores[i] = (f1_score(loss, kw), fbeta_score(loss, kw, beta=2), precision_score(loss, kw, zero_division=0), recall_score(loss, kw))

In [7]:
for index in set(l_scores.argmax(axis=0)):
    print(", ".join(l_labels[index]))
    print(" ")
    kw = df_al.text.apply(lambda x: any([y.lower() in x.lower() for y in l_labels[index]]))
    print("F1:", round(f1_score(loss, kw), 3))
    print("F2:", round(fbeta_score(loss, kw, beta=2), 3))
    conf_mat = confusion_matrix(loss, kw)
    print("TP:", conf_mat[1,1])
    print("FP:", conf_mat[0,1])
    print("FN:", conf_mat[1,0])
    print("TN:", conf_mat[0,0])
    print(classification_report(loss, kw))
    print(40*"#")
    print(" ")

rising claims expenses
 
F1: 0.004
F2: 0.003
TP: 1
FP: 0
FN: 450
TN: 1049
              precision    recall  f1-score   support

       False       0.70      1.00      0.82      1049
        True       1.00      0.00      0.00       451

    accuracy                           0.70      1500
   macro avg       0.85      0.50      0.41      1500
weighted avg       0.79      0.70      0.58      1500

########################################
 
loss, adverse development, adverse effect, charge, rising claims expenses, burden
 
F1: 0.561
F2: 0.66
TP: 338
FP: 417
FN: 113
TN: 632
              precision    recall  f1-score   support

       False       0.85      0.60      0.70      1049
        True       0.45      0.75      0.56       451

    accuracy                           0.65      1500
   macro avg       0.65      0.68      0.63      1500
weighted avg       0.73      0.65      0.66      1500

########################################
 
loss, adverse development, charge, rising claims ex

### Choice

In [8]:
loss_labels = [
    "loss",
    "adverse development",
    "charge",
    "rising claims expenses",
    "burden"
]

## Unexpected

### Initial List

In [9]:
unexpected_labels = [
    "unexpected",
    "surprising",
    "surprised",
    "surpris",
    "not expected",
    "expected",
    "more than expected",
    "less than expected",
    "lower than expected"
    "higher than expected "
    "more than expected"
    "below expectations",
    "above expectations",
    "exceed expectations",
    "exceeded expectations",
    "not meet expectations",
    "not according to expectations",
    "not as expected",
    "estimated",
    "anticipated",
    "predicted"
]

### First Subset
This is too large to run all combinations thus i used different subsets, refinde iteratively

In [10]:
unexpected_labels = [
    "unexpected",
    "surprising",
    "surprised",
    "surpris",
    "not expected",
    "expected",
    "more than expected",
    "less than expected",
    "lower than expected"
    "higher than expected "
    "more than expected"
    "below expectations",
    "above expectations",
    "exceed expectations",
    "exceeded expectations"
]

### Second Subset

In [11]:
unexpected_labels = [
    "unexpect",
    "expectation"
    "surpris",
    "expected",
    "below expectations",
    "above expectations",
    "exceed expectations",
    "exceeded expectations",
    "estimated",
    "anticipated",
    "predicted"
]

### Third Subset

In [12]:
unexpected_labels = [
    "expected",
    "surprised",
    "below expectations",
    "above expectations",
    "exceeded expectations",
    "anticipated",
    "predicted",
    "not meet expectations",
    "not according to expectations",
    "not as expected"
]

In [13]:
u_labels = list(powerset(unexpected_labels))
u_scores = np.zeros((len(u_labels), 4))
for i, labels in enumerate(u_labels):
    kw = df_al.text.apply(lambda x: any([y.lower() in x.lower() for y in labels]))
    u_scores[i] = (f1_score(unexpected, kw), fbeta_score(unexpected, kw, beta=2), precision_score(unexpected, kw, zero_division=0), recall_score(unexpected, kw))

In [14]:
for index in set(u_scores.argmax(axis=0)):
    print(", ".join(u_labels[index]))
    print(" ")
    kw = df_al.text.apply(lambda x: any([y.lower() in x.lower() for y in u_labels[index]]))
    print("F1:", round(f1_score(unexpected, kw), 3))
    print("F2:", round(fbeta_score(unexpected, kw, beta=2), 3))
    conf_mat = confusion_matrix(unexpected, kw)
    print("TP:", conf_mat[1,1])
    print("FP:", conf_mat[0,1])
    print("FN:", conf_mat[1,0])
    print("TN:", conf_mat[0,0])
    print(classification_report(unexpected, kw))
    print(40*"#")
    print(" ")

expected, below expectations, above expectations, exceeded expectations, anticipated
 
F1: 0.372
F2: 0.474
TP: 106
FP: 281
FN: 77
TN: 1036
              precision    recall  f1-score   support

       False       0.93      0.79      0.85      1317
        True       0.27      0.58      0.37       183

    accuracy                           0.76      1500
   macro avg       0.60      0.68      0.61      1500
weighted avg       0.85      0.76      0.79      1500

########################################
 
expected, below expectations, above expectations, exceeded expectations, anticipated, predicted
 
F1: 0.37
F2: 0.475
TP: 107
FP: 288
FN: 76
TN: 1029
              precision    recall  f1-score   support

       False       0.93      0.78      0.85      1317
        True       0.27      0.58      0.37       183

    accuracy                           0.76      1500
   macro avg       0.60      0.68      0.61      1500
weighted avg       0.85      0.76      0.79      1500

###############

### Choice

In [15]:
unexpected_labels = [
    "expected",
    "below expectations",
    "above expectations",
    "exceeded expectations",
    "anticipated",
    "predicted"
]