In [1]:
import sys
import pandas as pd
import numpy as np
from dotenv import dotenv_values
from pathlib import Path

# Seed NumPy's legacy global RNG so np.random-based operations are repeatable.
np.random.seed(19950808)

# Load key/value pairs from the project's .env file. The path is relative to
# the notebook's working directory.
config = dotenv_values("./../../config/.env")
if "BASE_PATH" not in config:
    # Fail with an actionable message instead of a bare KeyError('BASE_PATH');
    # the usual cause is that the .env file was not found at the relative path.
    raise KeyError(
        "BASE_PATH is not defined in ./../../config/.env "
        "(check that the file exists relative to the working directory)"
    )
base_path = Path(config["BASE_PATH"])

# Make <BASE_PATH>/code importable. Guard the append so that re-running this
# cell does not keep stacking duplicate entries onto sys.path.
code_dir = str(base_path/"code")
if code_dir not in sys.path:
    sys.path.append(code_dir)

from ZeDiAc.Annotators import ReportAnnotator

In [2]:
# Mapping handed to ReportAnnotator as `labels`.
label_columns = {"loss_GT": "loss", "unexpected_GT": "unexpected"}

# Pickle file passed to the annotator as `out_file`.
ground_truth_path = base_path / "data" / "labeling" / "GT.pkl"

# NOTE(review): `data` is a single-space string here; presumably the annotator
# then works from `out_file` instead of fresh input -- confirm in ReportAnnotator.
annotator = ReportAnnotator(
    data=" ",
    labels=label_columns,
    out_file=ground_truth_path,
    text_column="text",
    batch_size=50,
)

In [3]:
# Request the next batch to label using the "sequential" strategy; the call
# renders the annotation widgets shown in this cell's output.
annotator.label_batch(strategy="sequential")

Companies:  ['FIRST COMMONWEALTH CORP'] Years:  [1999.] Paragraphs: 305 to 407


HTML(value='0 of 50 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(ToggleButton(value=False, description='loss'), ToggleButton(value=False, descrip…

Output()

### Insights into how much has already been labeled

In [14]:
# Count of rows that were already annotated when this session started; it is
# subtracted below to report this session's progress. Update per session.
N_ANNOTATED_BEFORE_SESSION = 2766


def _ratio(part, whole):
    """Return part / whole, or 0.0 when `whole` is zero (avoids ZeroDivisionError)."""
    return part / whole if whole else 0.0


# Rows with strategy == "sequential" are what this cell counts as annotated;
# rows with labeled == -1 are counted as still to annotate.
sequential_rows = annotator.data[annotator.data.strategy == "sequential"]
n_annotated = sequential_rows.shape[0]
n_to_annotate = annotator.data[annotator.data.labeled==-1].shape[0]
n_sample = n_annotated + n_to_annotate

# Column sums of the two label columns over the sequential rows.
counts = sequential_rows[["loss", "unexpected"]].sum()
n_loss, n_unexpected = counts.tolist()

print(f"Annotated this session: {n_annotated-N_ANNOTATED_BEFORE_SESSION}")
print(f"Remaining:             {n_to_annotate} of {n_sample} ({_ratio(n_to_annotate, n_sample):.1%})")
print(f"Annotated:             {n_annotated} of {n_sample} ({_ratio(n_annotated, n_sample):.1%})")
print(f"Labeled as Loss:        {n_loss} ({round(_ratio(n_loss, n_annotated)*100,2)}%)")
print(f"Labeled as Unexpected:   {n_unexpected} ({round(_ratio(n_unexpected, n_annotated)*100,2)}%)")

Annotated this session: 0
Remaining:             3306 of 6072 (54.4%)
Annotated:             2766 of 6072 (45.6%)
Labeled as Loss:        137 (4.95%)
Labeled as Unexpected:   21 (0.76%)


In [15]:
# Distribution of the `labeled` flag for one company/year slice.
is_allstate = annotator.data.company == 'ALLSTATE LIFE INSURANCE CO'
is_2009 = annotator.data.year == 2009
annotator.data[is_allstate & is_2009].labeled.value_counts()

-1.0    523
 1.0     24
Name: labeled, dtype: int64

In [16]:
# Number of rows for this company whose `labeled` value is -1.
is_unlabeled = annotator.data.labeled.isin([-1])
is_beazley = annotator.data.company.isin(['BeazleyPLC'])
annotator.data[is_unlabeled & is_beazley].shape[0]

0