# Test/validation stratified split

In [40]:
import os

# Folder that holds the labelling inputs (xls/pkl) and receives the generated CSV files.
# Defaults to the original local path; set the CRO_LABELLING_DIR environment variable
# to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "CRO_LABELLING_DIR",
    "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/v2/labelling/",
)
# Label granularity; allowed values per the original note: cro, cro_sub_type
CRO_LEVEL = "cro_sub_type"
# Label columns that every exported label file contains
CATEGORY_CODES = ["ACUTE", "CHRON", "POLICY", "MARKET", "REPUT"]

In [41]:
import pandas as pd
import numpy as np
import os 

## Old Train

In [42]:
# Cleaned positives of the old training set (plus the good negatives that resulted from the cleaning),
# indexed by paragraph id and with the REPUTATION column renamed to the REPUT code
df1 = (
    pd.read_excel(os.path.join(DATA_DIR, "old_train_positives_BS_v02.xls"))
    .set_index("id")
    .rename(columns={"REPUTATION": "REPUT"})
)

In [43]:
# Raw training negatives; only the additional negatives are wanted from this file.
# NOTE(review): the following cell reads the same pickle again before aggregating it,
# so this cell looks redundant - confirm and drop one of the two loads.
df2 = pd.read_pickle(os.path.join(DATA_DIR, "Firm_AnnualReport_Labels_Training_Negative_incl_adjunct.pkl"))
# Paragraph id = "<report_id>_<page>_<paragraph_no>"
df2["id"] = [
    f"{report_id}_{page}_{paragraph_no}"
    for report_id, page, paragraph_no in zip(df2["report_id"], df2["page"], df2["paragraph_no"])
]

In [44]:
# From here, we only want to get the additional negatives
df2 = pd.read_pickle(os.path.join(DATA_DIR, "Firm_AnnualReport_Labels_Training_Negative_incl_adjunct.pkl"))
# Paragraph id = "<report_id>_<page>_<paragraph_no>"; built column-wise instead of a slow row-wise apply
df2["id"] = [
    f"{report_id}_{page}_{paragraph_no}"
    for report_id, page, paragraph_no in zip(df2["report_id"], df2["page"], df2["paragraph_no"])
]

# One row per paragraph id (first text seen for that id), with every category label set to 0
df2 = pd.DataFrame({"text": df2.groupby("id")["text"].first()})
df2[CATEGORY_CODES] = 0

# TODO: Not used?
# NOTE(review): missing_idx / df_combined are not referenced by the later cells visible here,
# which keep using the unfiltered df2 - confirm whether df2 should be restricted to missing_idx.
missing_idx = df2.index.difference(df1.index)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same row-wise stack
df_combined = pd.concat([df1, df2.loc[missing_idx, :]])

## Old Test

In [45]:
# Cleaned positives of the old test set (plus the good negatives that resulted from the cleaning).
# The unnamed first spreadsheet column carries the paragraph id and becomes the index.
df3 = (
    pd.read_excel(os.path.join(DATA_DIR, "old_test_positives_DF.xls"))
    .rename(columns={"REPUTATION": "REPUT", "Unnamed: 0": "id"})
    .set_index("id")
)

In [46]:
# From here, we only want to get the additional negatives
df4 = pd.read_pickle(os.path.join(DATA_DIR, "Firm_AnnualReport_Labels_Test.pkl"))
# Paragraph id = "<report_id>_<page>_<paragraph_no>"; built column-wise instead of a slow row-wise apply
df4["id"] = [
    f"{report_id}_{page}_{paragraph_no}"
    for report_id, page, paragraph_no in zip(df4["report_id"], df4["page"], df4["paragraph_no"])
]

# First text seen for every paragraph id
df4_docs = df4.groupby(["id"]).first().text
# Give rows without a sub type an explicit "missing" bucket so that crosstab keeps their ids.
# Assigning the column directly replaces the former chained `df4[col].loc[mask] = ...`, which
# triggers SettingWithCopyWarning and does not modify df4 at all under pandas Copy-on-Write.
df4["cro_sub_type_combined"] = df4["cro_sub_type_combined"].fillna("missing")
# Per-id occurrence counts of every sub type, reduced to the known category columns
df4_labels = pd.crosstab(df4["id"], df4["cro_sub_type_combined"], dropna=False)
df4_labels = df4_labels.rename(columns={"REPUTATION": "REPUT"})
df4_labels = df4_labels[CATEGORY_CODES]

# Keep only the ids that are not already part of the cleaned test file (df3)
# NOTE(review): unlike df2, the labels of these rows are not reset to 0 - confirm they are all negatives.
missing_idx_test = df4_labels.index.difference(df3.index)
df4 = df4_labels.join(df4_docs)
df4 = df4.loc[missing_idx_test, :]

# Split company reports into validation and test groups (random 50/50 split by company)

In [47]:
def add_company(df):
    """Add a ``company`` column derived from the index labels and return ``df``.

    The company is the part of each index label before the first "-".
    The frame is modified in place and also returned.

    Reading the index directly avoids the row-wise ``apply`` (slow on large
    frames, and it returns a DataFrame instead of a Series for an empty frame,
    so the column assignment could fail).
    """
    df["company"] = [label.split("-")[0] for label in df.index]
    return df

# Tag all four frames with their company; add_company returns the frame it was given
df1, df2, df3, df4 = (add_company(frame) for frame in (df1, df2, df3, df4))

In [48]:
# Stack old train + old test: explicitly labelled rows and the additional negatives.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same row-wise stack.
all_explicit = pd.concat([df1, df3])
all_negative = pd.concat([df2, df4])

In [64]:
from sklearn.model_selection import train_test_split

# Split whole companies (not single paragraphs) 50/50 into validation and test, with a fixed seed.
# NOTE(review): the company list is taken from all_negative only, so a company that occurs
# solely in all_explicit ends up in neither list - confirm this is intended.
all_companies = all_negative["company"].unique()
valid_companies, test_companies = train_test_split(
    all_companies.tolist(),
    test_size=0.5,
    random_state=1,
)

In [65]:
# Route every row to the split that its company was assigned to
valid_all_explicit = all_explicit[all_explicit["company"].isin(valid_companies)]
test_all_explicit = all_explicit[all_explicit["company"].isin(test_companies)]

valid_all_negative = all_negative[all_negative["company"].isin(valid_companies)]
test_all_negative = all_negative[all_negative["company"].isin(test_companies)]

In [68]:
# "Realistic" sets: the explicitly labelled rows followed by the additional negatives.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same row-wise stack.
valid_realistic = pd.concat([valid_all_explicit, valid_all_negative])
test_realistic = pd.concat([test_all_explicit, test_all_negative])

In [69]:
def save_label_file(df, path):
    """Write the label columns and the text of ``df`` to a CSV file at ``path``.

    Only the CATEGORY_CODES columns followed by "text" are exported; any other
    column of ``df`` is left out. The index is written as well (``to_csv`` default).
    """
    df.loc[:, CATEGORY_CODES + ["text"]].to_csv(path)
    
# Optimistic = explicitly labelled rows only; realistic = those plus the additional negatives
for _frame, _filename in (
    (valid_all_explicit, "valid_optimistic.csv"),
    (test_all_explicit, "test_optimistic.csv"),
    (valid_realistic, "valid_realistic.csv"),
    (test_realistic, "test_realistic.csv"),
):
    save_label_file(_frame, os.path.join(DATA_DIR, _filename))

In [102]:
# "Discriminatory" sets: explicit rows that carry at least one category label
valid_pos_only = valid_all_explicit.loc[valid_all_explicit[CATEGORY_CODES].any(axis=1)]
save_label_file(valid_pos_only, os.path.join(DATA_DIR, "valid_discriminatory.csv"))

test_pos_only = test_all_explicit.loc[test_all_explicit[CATEGORY_CODES].any(axis=1)]
save_label_file(test_pos_only, os.path.join(DATA_DIR, "test_discriminatory.csv"))

# Overview

In [70]:
# Training labels for the overview below (train_explicit.csv is not written by the cells shown here)
train_df = pd.read_csv(filepath_or_buffer=os.path.join(DATA_DIR, "train_explicit.csv"))

In [92]:
# Training overview: positives per category, rows with at least one label, rows with none
train_has_label = train_df[CATEGORY_CODES].any(axis=1)
unique_pos_rows = train_has_label.sum()
print(f"Train: \n{train_df[CATEGORY_CODES].sum()}")
print(f"Unique rows: {unique_pos_rows}")
print(f"All Negatives: {train_df.shape[0] - unique_pos_rows}")
#print(f"Explicit neg: {len(df2)}")

Train: 
ACUTE     133
CHRON      54
POLICY     43
MARKET     37
REPUT      23
dtype: int64
Unique rows: 205
All Negatives: 295


In [95]:
# Validation overview on the realistic set (explicit rows plus additional negatives)
print(f"Valid: \n{valid_realistic[CATEGORY_CODES].sum()}")
unique_pos_rows = valid_realistic[CATEGORY_CODES].any(axis='columns').sum()
print(f"Unique rows: {unique_pos_rows}")
print(f"All Negatives: {len(valid_realistic) - unique_pos_rows}")
# Count the explicit negatives on the explicit frame itself: subtracting the positives of the
# realistic frame is only right as long as none of the additional negatives carries a label.
explicit_pos_rows = valid_all_explicit[CATEGORY_CODES].any(axis='columns').sum()
print(f"Explicit neg: {len(valid_all_explicit) - explicit_pos_rows}")

Valid: 
ACUTE     15
CHRON      5
POLICY    40
MARKET    17
REPUT     14
dtype: int64
Unique rows: 72
All Negatives: 39007
Explicit neg: 73


In [96]:
# Test overview on the realistic set (explicit rows plus additional negatives)
print(f"Test: \n{test_realistic[CATEGORY_CODES].sum()}")
unique_pos_rows = test_realistic[CATEGORY_CODES].any(axis='columns').sum()
print(f"Unique rows: {unique_pos_rows}")
print(f"All Negatives: {len(test_realistic) - unique_pos_rows}")
# Count the explicit negatives on the explicit frame itself: subtracting the positives of the
# realistic frame is only right as long as none of the additional negatives carries a label.
explicit_pos_rows = test_all_explicit[CATEGORY_CODES].any(axis='columns').sum()
print(f"Explicit neg: {len(test_all_explicit) - explicit_pos_rows}")

Test: 
ACUTE     28
CHRON     19
POLICY    60
MARKET    21
REPUT     14
dtype: int64
Unique rows: 97
All Negatives: 40878
Explicit neg: 55


In [None]:
# Debug

In [None]:
# Debug: reload the raw test labels (this overwrites the processed df4 built further up)
df4 = pd.read_pickle(os.path.join(DATA_DIR, "Firm_AnnualReport_Labels_Test.pkl"))
# Paragraph id = "<report_id>_<page>_<paragraph_no>"
df4["id"] = [
    f"{report_id}_{page}_{paragraph_no}"
    for report_id, page, paragraph_no in zip(df4["report_id"], df4["page"], df4["paragraph_no"])
]

# First text seen for every paragraph id
df4_docs = df4.groupby("id")["text"].first()

In [None]:
# Preview the first rows only instead of rendering the whole frame
df4.head()