In [1]:
import sys
import datasets
import pandas as pd
import numpy as np
from dotenv import dotenv_values
from pathlib import Path

# Seed NumPy's global RNG once, up front, so that later random draws are
# repeatable when the notebook is executed top to bottom.
np.random.seed(19950808)

# Settings are read from a .env file; BASE_PATH is the directory that the
# data/ and code/ paths used in this notebook are resolved against.
_env_file = "./../../config/.env"
config = dotenv_values(_env_file)
base_path = Path(config["BASE_PATH"])

# Make modules stored under <BASE_PATH>/code importable.
_code_dir = base_path / "code"
sys.path.append(str(_code_dir))

In [2]:
# Metadata table (report_id, year, filing_type, ...), normalised in one chain.
df_main = (
    pd.read_pickle(base_path / "data/preprocessed/main.pkl")
    .reset_index()
    .assign(
        # treat all SEC filings the same: any type starting with "10-K"
        # is collapsed to plain "10-K", everything else is kept as is
        filing_type=lambda frame: frame["filing_type"].apply(
            lambda kind: "10-K" if kind[:4] == "10-K" else kind
        ),
        year=lambda frame: frame["year"].astype(int),
        report_id=lambda frame: frame["report_id"].astype(str),
    )
    # Do not use historical Swiss Re Reports for training
    .loc[lambda frame: frame["year"] >= 1993]
)

# Paragraph table; report_id is cast to str so it has the same type as
# df_main.report_id.
df = pd.read_pickle(base_path / "data/preprocessed/paragraphs_clean.pkl")
df["report_id"] = df["report_id"].astype(str)

In [3]:
# Columns kept from both labeling sources before they are stacked.
_label_cols = ["report_id", "paragraph_nr", "labeled", "strategy"]

# Source 1: the zero-shot dataset stored on disk; every row is marked as labeled.
df_zs = datasets.Dataset.load_from_disk(base_path/"data/preprocessed/dataset/zero-shot").to_pandas()
df_zs["labeled"] = 1.0
df_zs["strategy"] = "zero-shot"

# Source 2: active-learning iteration 2; keep only the rows flagged as labeled.
df_al = pd.read_pickle(base_path/"data/labeling/active-learning-iteration-2.pkl")
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below is unambiguous and does not raise
# SettingWithCopyWarning.
df_al = df_al[df_al.labeled == 1.0].copy()
df_al["strategy"] = "active-learning"

# Stack both sources into one lookup of already-labeled paragraphs.
# NOTE(review): a (report_id, paragraph_nr) pair present in both sources shows
# up twice here and would duplicate rows in a key-based merge — confirm that
# the two sets are disjoint.
df_labeled = pd.concat([df_zs[_label_cols], df_al[_label_cols]])
df_labeled.report_id = df_labeled.report_id.astype(str)

In [4]:
# Attach the labeling info to every paragraph; paragraphs without a match in
# df_labeled get NaN in the merged columns (left join).
_paragraph_keys = ["report_id", "paragraph_nr"]
df = df.merge(df_labeled, on=_paragraph_keys, how="left")

# Collapse the flag to exactly 1.0 (labeled) or 0.0 (anything else, incl. NaN).
df["labeled"] = df["labeled"].eq(1.0).astype(float)

# Order paragraphs by report and by position within the report.
df = df.sort_values(by=_paragraph_keys)

Note that labeling full reports is somewhat faster than labeling single paragraphs, as there is more context (i.e. the preceding paragraphs).

In [5]:
# Draw 5 reports per filing type and keep only their ids.
# No random_state is passed, so the draw depends on NumPy's global RNG state
# (seeded in the first cell) and on the cells being run in order.
reports_to_label = (
    df_main
    .groupby("filing_type")
    .sample(n=5)["report_id"]
    .tolist()
)

Comment: 

Of these 10 sampled reports, I only managed to label the first 5 (10-K) and 2 of the last 5 (PDF; Beazley and Baloise) due to time constraints.
The PDF reports are longer and there are fewer of them in the dataset (about 1 in 10), so this 5-to-2 split is actually closer to the underlying distribution while still including at least 2 PDF reports.

In [6]:
# Show the sampled report ids (the cell's last expression is rendered as output).
reports_to_label

['3935',
 '4123',
 '5110',
 '3585',
 '5407',
 'CNPAssurancesSA-AR_2019',
 'de_allianz-AR_2009',
 'BaloiseHoldingLtd-AR_2015',
 'SwissLifeHoldingAG-AR_2012',
 'BeazleyPLC-AR_2017']

Set `labeled` to -1 for the paragraphs that are still to be labeled.

In [7]:
# Paragraphs that belong to a sampled report and carry the flag 0 are marked
# with -1; rows whose flag is already 1.0 are left untouched.
_in_sampled_report = df["report_id"].isin(reports_to_label)
_flag_is_zero = df["labeled"].isin([0])
df.loc[_in_sampled_report & _flag_is_zero, "labeled"] = -1

In [8]:
# Empty annotation columns (presumably filled in during manual labeling —
# TODO confirm against the labeling tool).
df["loss"] = None
df["unexpected"] = None

# Replace the index left over from the earlier merge/sort with a clean 0..n-1 range.
df = df.reset_index(drop=True)

# Cast the merge key to str so it has the same type as df_main.report_id.
# The result must be written back to the column; assigning it to a separate
# variable would leave df unchanged.
df["report_id"] = df["report_id"].astype(str)

# Attach the remaining df_main columns to every paragraph. filing_type is
# dropped from df_main before joining; the left join keeps paragraphs whose
# report has no row in df_main (their metadata columns become NaN).
df = df.merge(df_main.drop("filing_type", axis=1), how="left", on="report_id")

Uncomment the following line to generate the labeling dataset. Note that this overwrites any existing file.

In [None]:
# df.to_pickle(base_path/"data/labeling/GT.pkl")