# Prepares the overall header dataset

In [54]:
import os
import sys
from pathlib import Path

import pandas as pd
import numpy as np

sys.path.append('../..')

import data
import data.dataframe_preparation as preparation


############### CONFIG ###############
# Firm metadata CSV (read below and indexed by its "id" column).
# Relative path: os.path.abspath resolves it against the current working directory.
FIRM_METADATA = os.path.abspath("../../input_files/Firm_Metadata.csv")
# Directory handed to preparation.get_df as `input_path`; also resolved against the CWD.
DATA_INPUT_PATH = os.path.abspath("../../input_files/annual_reports/")
# Destination of the master CSV written at the end of this notebook.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
MASTER_DATA_PATH = os.path.abspath("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Data/stoxx_inference/Firm_AnnualReport.csv")
# Directory containing the Firm_AnnualReport_{Training,Test}_{DF,TS}.csv labelling files.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
LABELLING_DATA_PATH = os.path.abspath("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports/")
######################################

def _load_labelling_csv(file_name):
    """Read one labelling CSV from LABELLING_DATA_PATH and index it by "id"."""
    csv_path = os.path.join(LABELLING_DATA_PATH, file_name)
    return pd.read_csv(csv_path).set_index("id")


# Two exports ("DF" and "TS") exist for each of the training and test splits.
df_training_df = _load_labelling_csv("Firm_AnnualReport_Training_DF.csv")
df_training_ts = _load_labelling_csv("Firm_AnnualReport_Training_TS.csv")
df_test_df = _load_labelling_csv("Firm_AnnualReport_Test_DF.csv")
df_test_ts = _load_labelling_csv("Firm_AnnualReport_Test_TS.csv")

In [58]:
# Merge the two exports of each split into one frame per split.
# The rows of both exports are expected to be duplicates of each other, so
# grouping by the index ("id") and taking the column-wise maximum keeps one
# row per report and leaves is_labelled true when either export marked it.
training_exports = [df_training_df, df_training_ts]
test_exports = [df_test_df, df_test_ts]

df_training = pd.concat(training_exports).groupby(level=0).max()
df_test = pd.concat(test_exports).groupby(level=0).max()

# Validation: the counts below are the expected sizes of the labelling sets.
assert df_training.should_label.sum() == 100
assert df_training.is_labelled.sum() == 98
assert df_test.should_label.sum() == 69
assert df_test.is_labelled.sum() == 17

98
17


In [60]:
# Refuse to run when the master file already exists, so that the to_csv call
# further down can never overwrite it. An explicit raise is used instead of
# `assert` because assertions are removed when Python runs with -O, which
# would silently disable this safety check.
master_file = Path(MASTER_DATA_PATH)
if master_file.is_file():
    raise FileExistsError("Master file is already present. Delete manually")

# Build the report-level frame from the annual report input directory.
# Only the "AR" report type is selected, with "20F" mapped onto it; text,
# page numbers and table of contents are excluded via the include_* flags.
df = preparation.get_df(
    input_path=DATA_INPUT_PATH,
    report_type_mappings={"20F": "AR"},
    selected_report_types={"AR"},
    include_text=False,
    include_page_no=False,
    include_toc=False,
).set_index("id")

# Add additional inference columns
df['should_infer'] = True
df['is_inferred'] = False  # TODO: Change back

# Load the firm meta data, keyed by its "id" column
df_meta = pd.read_csv(FIRM_METADATA).set_index('id')

# The join key is "<country>_<company>". The report-level country column is
# dropped before merging; the merged result carries the metadata columns.
df['company_id'] = df['country'] + "_" + df['company']
df = df.drop(columns=['country']).merge(
    df_meta,
    left_on='company_id',
    right_index=True,
)
    
def set_dataset(row):
    """Return the labelling split a report row belongs to.

    The row's index label (``row.name``) is looked up in the module-level
    ``df_training`` and ``df_test`` frames. The result is ``"training"`` when
    the matching ``is_labelled`` value in ``df_training`` is truthy, ``"test"``
    when the one in ``df_test`` is, and ``np.nan`` otherwise (including when
    the label is absent from both frames).

    Raises:
        AssertionError: if the report is labelled in both splits.
    """
    report_id = row.name

    in_training = False
    if report_id in df_training.index:
        in_training = df_training.loc[report_id].is_labelled

    in_test = False
    if report_id in df_test.index:
        in_test = df_test.loc[report_id].is_labelled

    # A report must never be labelled in both splits at once.
    labelled_in_both = in_training == True and in_test == True
    assert not labelled_in_both

    if in_training:
        return "training"
    if in_test:
        return "test"
    return np.nan
    
# Tag every report with its labelling split ("training", "test" or NaN),
# computed row by row from the report's index label.
labelling_split = df.apply(set_dataset, axis=1)
df['labelling_dataset'] = labelling_split

# Persist the master file.
df.to_csv(MASTER_DATA_PATH)

# Per-split non-null counts; dropna=False keeps the NaN (unassigned) group.
df.groupby("labelling_dataset", dropna=False).count()

HBox(children=(FloatProgress(value=0.0, max=49.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




Unnamed: 0_level_0,company,orig_report_type,report_type,year,input_file,output_file,should_infer,is_inferred,company_id,firm_name,ticker,country,icb_industry,icb_supersector
labelling_dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
test,17,17,17,17,17,17,17,17,17,17,0,17,17,17
training,98,98,98,98,98,98,98,98,98,98,0,98,98,98
,677,677,677,677,677,677,677,677,677,677,0,677,677,677
