In [1]:
import pandas as pd
import numpy as np
import os 

# TRAIN

In [2]:
DATA_DIR = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/v2/labelling/"
df = pd.read_csv(os.path.join(DATA_DIR, "labels_training_21052021_full_II.csv"))

CRO_LEVEL = "cro_sub_type" # cro, cro_sub_type
CATEGORY_CODES = ["ACUTE", "CHRON", "POLICY", "MARKET", "REPUT"]

In [3]:
import re
df["id"] =  df.apply(lambda row: f"{row.report_id}_{str(row.page)}_{str(row.paragraph_no)}", axis=1)
df = df.drop("Unnamed: 0", axis=1, errors='ignore')

def parse_comment(row):
    if not row.comment or pd.isna(row.comment):
        return np.NaN
    span_id = re.search('<id>(.*)</id>', row.comment)
    if span_id:
        span_id = span_id.group(1)
    else:
        span_id = np.NaN
    return span_id

df["span_id"] = df.apply(lambda row: parse_comment(row), axis=1)

for span_id in df.span_id.unique():
    if not pd.isna(span_id):
        rows = df[df.span_id == span_id]
        len_rows = len(rows)
        text = "\n".join(rows.text.tolist())
        df = df.drop(rows.index[1:])
        df.loc[df.span_id == span_id, "text"] = text
        print(f"Merged ID {span_id} with {len_rows} rows...")

Merged ID 4 with 2 rows...
Merged ID 7 with 2 rows...
Merged ID 15 with 2 rows...
Merged ID 17 with 2 rows...
Merged ID 21 with 2 rows...
Merged ID 31 with 2 rows...
Merged ID 100 with 2 rows...
Merged ID 101 with 2 rows...


In [4]:
docs = df.groupby(["id"]).first().text

df.loc[df.cro_sub_type != df.cro_sub_type, "cro_sub_type"] = "missing"
labels = pd.crosstab(df.id, df.cro_sub_type, dropna=False)

In [5]:
labels = labels[CATEGORY_CODES]
labels['text'] = docs

In [6]:
labels.sum()

cro_sub_type
ACUTE                                                   140
CHRON                                                    58
POLICY                                                   47
MARKET                                                   43
REPUT                                                    26
text      Risks related to climate change refer to the p...
dtype: object

In [113]:
labels.sum()

cro_sub_type
ACUTE                                                   141
CHRON                                                    57
POLICY                                                   47
MARKET                                                   40
REPUT                                                    26
text      Risks related to climate change refer to the p...
dtype: object

In [7]:
labels.to_csv(os.path.join(DATA_DIR, f"train_explicit_labels_V.csv"))

# VALID / TEST

## Old train

In [None]:
# File containing "cleaned" positives (and resulting good negatives!)
df1 = pd.read_excel(os.path.join(DATA_DIR, "old_train_positives_BS_v02.xls"))
df1 = df1.set_index("id")
df1 = df1.rename(columns={"REPUTATION": "REPUT"})

In [None]:
# From here, we only want to get the additional negatives
df2 = pd.read_pickle(os.path.join(DATA_DIR, "Firm_AnnualReport_Labels_Training.pkl"))
df2["id"] = df2.apply(lambda row: f"{row.report_id}_{str(row.page)}_{str(row.paragraph_no)}", axis=1)

df2_docs = df2.groupby(["id"]).first().text
df2["cro_sub_type_combined"].loc[df2["cro_sub_type_combined"] != df2["cro_sub_type_combined"]] = "missing"
df2_labels = pd.crosstab(df2.id, df2["cro_sub_type_combined"], dropna=False)
df2_labels = df2_labels.rename(columns={"REPUTATION": "REPUT"})
df2_labels = df2_labels[CATEGORY_CODES]
df2 = df2_labels.join(df2_docs)

In [None]:
missing_idx = df2.index.difference(df1.index)
df_combined = df1.append(df2.loc[missing_idx, :])

## Old test

In [None]:
# File containing "cleaned" positives (and resulting good negatives!)
df3 = pd.read_excel(os.path.join(DATA_DIR, "old_test_positives_DF.xls"))
df3 = df3.rename(columns={"REPUTATION": "REPUT", "Unnamed: 0": "id"})
df3 = df3.set_index("id")

In [None]:
# From here, we only want to get the additional negatives
df4 = pd.read_pickle(os.path.join(DATA_DIR, "Firm_AnnualReport_Labels_Test.pkl"))
df4["id"] = df4.apply(lambda row: f"{row.report_id}_{str(row.page)}_{str(row.paragraph_no)}", axis=1)

df4_docs = df4.groupby(["id"]).first().text
df4["cro_sub_type_combined"].loc[df4["cro_sub_type_combined"] != df4["cro_sub_type_combined"]] = "missing"
df4_labels = pd.crosstab(df4.id, df4["cro_sub_type_combined"], dropna=False)
df4_labels = df4_labels.rename(columns={"REPUTATION": "REPUT"})
df4_labels = df4_labels[CATEGORY_CODES]
df4 = df4_labels.join(df4_docs)

In [None]:
missing_idx2 = df4.index.difference(df3.index)
df_combined2 = df3.append(df4.loc[missing_idx2, :])

In [None]:
df_combined2

In [None]:
df_combined_final = pd.concat([df_combined, df_combined2])
df_combined_final.to_csv(os.path.join(DATA_DIR, f"test_realistic.csv"))

df_combined_final_optimistic = pd.concat([df1, df3])
df_combined_final_optimistic.to_csv(os.path.join(DATA_DIR, "test_optimistic.csv"))

In [None]:
df_combined_final_optimistic

In [None]:
df_combined_final.text

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
df_combined_final["count"] = df_combined_final.apply(lambda row: len(tokenizer.tokenize(str(row.text))), axis=1)
print(df_combined_final["count"].describe())
df_combined_final["count"].plot.kde()

In [None]:
df_combined_final_optimistic["count"] = df_combined_final_optimistic.apply(lambda row: len(tokenizer.tokenize(str(row.text))), axis=1)
print(df_combined_final_optimistic["count"].describe())
df_combined_final_optimistic["count"].plot.kde()

In [None]:
text = "Hello, world! This is an   \n    awesome test of the BERT tokenizer."
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens: ", tokens)
print("ID's: ", ids)

## Data stats

In [None]:
df_1 = pd.read_csv(os.path.join(DATA_DIR, "train_explicit_labels.csv"))
df_2 = pd.read_csv(os.path.join(DATA_DIR, "train_explicit_labels_cleaned.csv"))
df_3 = pd.read_csv(os.path.join(DATA_DIR, "train_explicit_labels_II.csv"))

In [None]:
df_1.sum()

In [None]:
df_2.sum()

In [None]:
df_3.sum()