# Test Data Preparation

## Setup

In [1]:
import os
import sys
import numpy as np
import pandas as pd

sys.path.append('../..')
import data
from data.labels_postprocessing import process
from data.dataframe_preparation import get_counts_per_page, get_keywords_from_file, get_text_from_page, get_count_matrix
from data.preprocessing import DocumentPreprocessor

### Load labelling files

The 100 randomly selected reports were labelled by two labellers:

- Labeller A: Main labeller, labelled the first 78 and last 5 reports
- Labeller B: Labelled the first 15 reports for inter-coder reliability (ICR) checks (see the corresponding notebook) and additionally labelled 17 of the last reports

In [2]:
# Directory that holds all labelling exports. Defined once so the location only
# has to be changed in a single place.
LABELLING_DIR = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports"

# Number of leading rows in coder B's label file that are ICR/training labels
# and therefore must not end up in the test set.
N_ICR_LABELS_B = 241

# NOTE(review): `read_pickle` can execute arbitrary code while loading; only use
# it on files from a trusted source.
df_header_a = pd.read_csv(os.path.join(LABELLING_DIR, "Firm_AnnualReport_Test_DF.csv"))
df_labels_a = pd.read_pickle(os.path.join(LABELLING_DIR, "Firm_AnnualReport_Labels_Test_DF.pkl"))
df_header_b = pd.read_csv(os.path.join(LABELLING_DIR, "Firm_AnnualReport_Test_TS.csv"))
df_labels_b = pd.read_pickle(os.path.join(LABELLING_DIR, "Firm_AnnualReport_Labels_Test_TS.pkl"))

# Assign coder ids
df_labels_a['coder'] = "DF"
df_labels_b['coder'] = "TS"

# Remove all labels (of coder B) that were actually ICR/training labels
filtered_df_labels_b = df_labels_b.iloc[N_ICR_LABELS_B:].copy()

# Postprocess, e.g. extract the unstructured comment field
df_labels_a = process(df_labels_a)
filtered_df_labels_b = process(filtered_df_labels_b)

# Combine dataframes (coder B first, then coder A) with a fresh 0..n-1 index
df_labels_positive = pd.concat([filtered_df_labels_b, df_labels_a], ignore_index=True)

df_labels_positive

Unnamed: 0,report_id,cro,cro_sub_type,page,paragraph_no,label,comment,text,coder,indirect,vague,past,keyword,span_id,cro_sub_type_combined
0,es_bco_santander-AR_2019,TR,MARKET,75,12,True,,The initial analysis shows that against today’...,TS,False,False,False,,,MARKET
1,it_intesa_sanpaolo-AR_2019,OP,PRODUCTS,53,2,True,,\nIn addition to directly managing its energy ...,TS,False,False,False,,,PRODUCTS
2,it_intesa_sanpaolo-AR_2019,TR,MARKET,389,2,True,,– The Intesa Sanpaolo Group is aware that it h...,TS,False,False,False,,,MARKET
3,gb_unilever_plc-AR_2009,OP,,29,5,True,,Key indicators – people and sustainability\nId...,TS,False,False,False,,,
4,be_anheuser_busch_inbev-AR_2019,PR,ACUTE,47,5,True,,The 2019 crop year proved to be challenging \n...,TS,False,False,False,,,ACUTE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,fr_axa-AR_2019,TR,POLICY,174,3,True,,"Transition Benchmarks, EU Paris-aligned Benchm...",TS,False,False,False,,,POLICY
89,fr_axa-AR_2019,PR,ACUTE,204,13,True,cro_id:2,• expense payments;\n■ reserve risk resulting ...,TS,False,False,False,,2,ACUTE
90,fr_axa-AR_2019,PR,ACUTE,204,14,True,cro_id:2,• fluctuation of payments around their statist...,TS,False,False,False,,2,ACUTE
91,nl_ing_grp-AR_2019,OP,RESILI,20,2,True,,We believe climate risk is a strategic and cre...,DF,False,False,False,,,RESILIENCE


In [4]:
# Sanity check: list every positive label whose main category (`cro`) is missing or empty.
df_labels_positive.loc[df_labels_positive.cro.isnull() | (df_labels_positive.cro == '')]

Unnamed: 0,report_id,cro,cro_sub_type,page,paragraph_no,label,comment,text,coder,indirect,vague,past,keyword,span_id,cro_sub_type_combined


In [5]:
# List the labels that do not have a sub-category; this cell only displays them.
df_labels_positive.loc[df_labels_positive.cro_sub_type.isnull() | (df_labels_positive.cro_sub_type == '')]

Unnamed: 0,report_id,cro,cro_sub_type,page,paragraph_no,label,comment,text,coder,indirect,vague,past,keyword,span_id,cro_sub_type_combined
3,gb_unilever_plc-AR_2009,OP,,29,5,True,,Key indicators – people and sustainability\nId...,TS,False,False,False,,,
9,gb_vodafone_grp-AR_2019,PR,,53,15,True,keyword:physical,Climate change poses a number of potential ris...,TS,False,False,False,physical,,
54,gb_prudential-AR_2019,PR,,65,8,True,keyword:physical,These�include�the�environmental risks�associat...,TS,False,False,False,physical,,
55,gb_prudential-AR_2019,TR,,65,8,True,keyword:transition,These�include�the�environmental risks�associat...,TS,False,False,False,transition,,
56,gb_prudential-AR_2019,TR,,73,4,True,keyword:transition,The�environmental�risks�associated�with�\nclim...,TS,False,False,False,transition,,
57,gb_prudential-AR_2019,PR,,73,4,True,keyword:physical,The�environmental�risks�associated�with�\nclim...,TS,False,False,False,physical,,
65,gb_prudential-AR_2019,PR,,83,7,True,keyword:physical,"Risk management\nAs a long-term investor, the ...",TS,False,False,False,physical,,
66,gb_prudential-AR_2019,TR,,83,7,True,keyword:transition,"Risk management\nAs a long-term investor, the ...",TS,False,False,False,transition,,
70,gb_prudential-AR_2019,PR,,399,7,True,keyword:physical,The environmental risks associated with \nclim...,TS,False,False,False,physical,,


In [6]:
# Full paragraph text of label 56
print(df_labels_positive.loc[56, "text"])

The�environmental�risks�associated�with�
climate�change�is�one�ESG�area�that�poses�
significant�risks�to�Prudential�and�its�
customers.�The�global�transition�to�a�lower�
carbon�economy�could�potentially�see�
the�financial�assets�of�carbon-intensive�
companies�re-price�as�a�result�of�facing�
significantly�higher�costs�or�decreasing�
demand�for�their�products�and�services.�
The�speed�of�this�transition,�including�the�
extent�to�which�it�is�orderly�and�managed,�
will�be�influenced�by�factors�such�as�public�
policy,�technology�and�changes�in�market�
or�investor�sentiment.�This�‘transition�risk’�
may�adversely�impact�the�valuation�of�
investments�held�by�the�Group.�The�Group�
expects�the�physical�impacts�of�climate�
change,�driven�by�both�specific�short-term�
climate-related�events�such�as�natural�
disasters�and�longer-term�changes�in�
the�natural�environment,�to�increasingly�
influence�the�longevity,�mortality�and�
morbidity�risk�assessments�of�the�
Group’s�product�offerings.�Climate-drive

In [7]:
# All fields of label 56
print(df_labels_positive.loc[56, :])

report_id                                            gb_prudential-AR_2019
cro                                                                     TR
cro_sub_type                                                           NaN
page                                                                    73
paragraph_no                                                             4
label                                                                 True
comment                                                 keyword:transition
text                     The�environmental�risks�associated�with�\nclim...
coder                                                                   TS
indirect                                                             False
vague                                                                False
past                                                                 False
keyword                                                         transition
span_id                  

In [8]:
############## TODO ##############

# UPDATE IDs accordingly once we have the final datasets!

##################################

def _split_into_acute_and_chronic(labels, row_id):
    """Label row `row_id` as ACUTE and append a copy of it labelled CHRON.

    Parameters
    ----------
    labels : pd.DataFrame
        Label dataframe with a `cro_sub_type` column.
    row_id : int
        Index label of the row that covers both sub-categories.

    Returns
    -------
    pd.DataFrame
        A new dataframe with the additional row and a fresh 0..n-1 index;
        `labels` itself is left untouched.
    """
    labels = labels.copy()
    labels.loc[row_id, "cro_sub_type"] = "ACUTE"
    # Selecting with a list keeps a one-row dataframe, so column dtypes are preserved.
    chronic_copy = labels.loc[[row_id]].assign(cro_sub_type="CHRON")
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([labels, chronic_copy], ignore_index=True)


# 9 is both an acute and a chronic risk
df_labels_positive = _split_into_acute_and_chronic(df_labels_positive, 9)

# 54, 55 too generic to be associated with a sub category

# 56 can be vaguely seen as a market risk
df_labels_positive.loc[56, "cro_sub_type"] = "MARKET"

# 57 is both acute and chronic...
df_labels_positive = _split_into_acute_and_chronic(df_labels_positive, 57)

# 65, 66 too generic to be associated with a sub category
# 70 is too generic for PR, but MARKET risk is already labelled

# Drop 3 as it's not an opportunity
df_labels_positive = df_labels_positive.drop([3])

In [12]:
# All fields of label 57 after the manual fix
print(df_labels_positive.loc[57, :])

report_id                                            gb_prudential-AR_2019
cro                                                                     PR
cro_sub_type                                                         ACUTE
page                                                                    73
paragraph_no                                                             4
label                                                                 True
comment                                                   keyword:physical
text                     The�environmental�risks�associated�with�\nclim...
coder                                                                   TS
indirect                                                             False
vague                                                                False
past                                                                 False
keyword                                                           physical
span_id                  

In [13]:
# Run the post-processing again so that the manually edited labels are picked up.
# NOTE(review): presumably this refreshes derived columns such as
# `cro_sub_type_combined` - confirm against `process`.
df_labels_positive = process(df_labels_positive)

In [14]:
# Show the labels whose comment mentions "water" or "recycling".
# - `str.contains` is used because `str.find(...) > 0` misses a match at the very
#   start of the comment (position 0).
# - Both keywords share one mask because only the last expression of a cell is displayed.
# - `na=False` treats missing comments as "no match".
df_labels_positive[df_labels_positive.comment.str.contains("water|recycling", na=False)]

Unnamed: 0,report_id,cro,cro_sub_type,page,paragraph_no,label,comment,text,coder,indirect,vague,past,keyword,span_id,cro_sub_type_combined


In [15]:
# Persist the cleaned positive labels (pickle protocol 4)
positive_labels_file = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports/Firm_AnnualReport_Labels_Test_Positive.pkl"
df_labels_positive.to_pickle(positive_labels_file, protocol=4)

In [16]:
# Number of non-null entries in every column, per sub-category.
df_labels_positive.groupby("cro_sub_type").count()

Unnamed: 0_level_0,report_id,cro,page,paragraph_no,label,comment,text,coder,indirect,vague,past,keyword,span_id,cro_sub_type_combined
cro_sub_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ACUTE,21,21,21,21,21,11,21,21,21,21,21,6,4,21
CHRON,7,7,7,7,7,5,7,7,7,7,7,5,0,7
ENERGY,1,1,1,1,1,0,1,1,1,1,1,0,0,1
MARKET,9,9,9,9,9,4,9,9,9,9,9,3,0,9
MARKETS,13,13,13,13,13,0,13,13,13,13,13,0,0,13
POLICY,20,20,20,20,20,4,20,20,20,20,20,4,0,20
PRODUCTS,5,5,5,5,5,0,5,5,5,5,5,0,0,5
REPUT,7,7,7,7,7,1,7,7,7,7,7,1,0,7
RESILI,3,3,3,3,3,0,3,3,3,3,3,0,0,3
TECH,3,3,3,3,3,2,3,3,3,3,3,2,0,3


## Generation of negative samples

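Negative samples are all paragraphs on pages with keyword (bigram) hits, and optionally on the pages directly before and after them, that were NOT labelled as positive.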

In [None]:
# Temporary tracking lists; the extraction function appends to them as a side effect
hot_negative_labels, weak_negative_labels = [], []

# Will hold all negative samples
df_labels_negative = pd.DataFrame()

# Keep only the reports (of both labellers) that were meant to be labelled and actually were.
labelled_filter = "should_label & is_labelled"
labelled_reports_a = df_header_a.query(labelled_filter)
labelled_reports_b = df_header_b.query(labelled_filter)

labelled_reports = pd.concat([labelled_reports_a, labelled_reports_b], ignore_index=True)

In [None]:

# Root of the local project checkout; both inputs below live underneath it, so the
# location only has to be adjusted here.
PROJECT_ROOT = "/Users/david/Projects/fin-disclosures-nlp"

# Base folder against which each report's `input_file` is resolved
parsed_reports_folder = os.path.join(PROJECT_ROOT, "input_files", "annual_reports")
# Keyword vocabulary used to find pages and paragraphs with hits
vocabulary = get_keywords_from_file(os.path.join(PROJECT_ROOT, "data", "keyword_vocabulary.txt"))

def get_unlabelled_paragraphs_of_report(report_row, add_adjunct_pages = True):
    """Collect the negative (not positively labelled) paragraphs of one report.

    Every page with a keyword hit - optionally together with the page before and
    after it - is split into paragraphs. Paragraphs that are not positive labels
    are appended to one of two module-level lists:

    - `hot_negative_labels`: the paragraph contains at least one vocabulary term
    - `weak_negative_labels`: the paragraph contains no vocabulary term

    Parameters
    ----------
    report_row : pd.Series
        One row of the report header dataframe. Reads `id`, `input_file`,
        `orig_report_type`, `year` and `output_file`.
    add_adjunct_pages : bool, default True
        If True, the pages directly before and after each hit page are scanned too.

    Returns
    -------
    pd.Series
        `report_row` extended by `no_pos_labels`, `no_neg_hot_labels` and
        `no_neg_weak_labels`.
    """
    # Set tracking vars
    no_hot_neg_labels = 0
    no_weak_neg_labels = 0

    # Locate the parsed report: <folder of input_file>/<report type>_<year>/<output_file>
    path = os.path.join(parsed_reports_folder, report_row['input_file'])
    folder = os.path.dirname(path)
    parsed_report_file_path = os.path.join(folder, report_row['orig_report_type'] + '_' + str(int(report_row['year'])), report_row['output_file'])

    # Get pages with keyword hits
    pages = get_counts_per_page(parsed_report_file_path, vocabulary)
    page_indizes = set(pages.index)

    # Add adjunct pages if necessary
    if add_adjunct_pages:
        for p in pages.index:
            if p > 0:
                page_indizes.add(p - 1)
            # The total page count is not known here; a page past the end of the
            # report is skipped below via the IndexError handler.
            page_indizes.add(p + 1)

    # The positive labels of this report are the same for every page, so look them up once.
    report_pos_labels = df_labels_positive[df_labels_positive.report_id == report_row.id]

    # Visit the pages in ascending order so the extracted samples have a reproducible order.
    for page_no in sorted(page_indizes):
        try:
            text = get_text_from_page(parsed_report_file_path, page_no)
        except IndexError:
            continue
        processed_doc = DocumentPreprocessor(text).process()
        paragraphs = processed_doc.split('\n\n')

        # Get the positive labels for this page
        pos_labels_paragraph_no = report_pos_labels.loc[report_pos_labels.page == page_no, "paragraph_no"]

        # Get the "hot" paragraphs, i.e. those containing a term of the search vocab.
        # `ravel` (rather than `squeeze`) keeps the counts one-dimensional even when
        # the page has a single paragraph; `np.where` on a 0-d array is rejected by
        # recent NumPy versions.
        hits_per_paragraph = get_count_matrix(paragraphs, vocabulary).sum(axis=1)
        hot_paragraphs = np.where(np.asarray(hits_per_paragraph).ravel())[0]

        # The "weak" paragraphs are all those that are neither hot nor actual positive labels
        weak_paragraphs = np.setdiff1d(np.arange(len(paragraphs)), hot_paragraphs, assume_unique=True)
        weak_paragraphs = np.setdiff1d(weak_paragraphs, pos_labels_paragraph_no)
        hot_paragraphs_filtered = np.setdiff1d(hot_paragraphs, pos_labels_paragraph_no)

        for p in weak_paragraphs:
            weak_negative_labels.append([report_row.id, page_no, p, paragraphs[p], "EXTRACTED", "weak"])
        no_weak_neg_labels += len(weak_paragraphs)

        for p in hot_paragraphs_filtered:
            hot_negative_labels.append([report_row.id, page_no, p, paragraphs[p], "EXTRACTED", "hot"])
        no_hot_neg_labels += len(hot_paragraphs_filtered)

    # Add summary stats for each row, i.e. how many "positive/negative" labels
    report_row["no_pos_labels"] = len(report_pos_labels)
    report_row["no_neg_hot_labels"] = no_hot_neg_labels
    report_row["no_neg_weak_labels"] = no_weak_neg_labels
    print(f"Done with {report_row.id}. Extracted {no_hot_neg_labels} (hot) and {no_weak_neg_labels} (weak) negative labels...")
    return report_row

# Start from empty tracking lists so that re-running this cell does not duplicate
# samples. They are cleared in place because the extraction function appends to
# these module-level lists.
hot_negative_labels.clear()
weak_negative_labels.clear()

# Extract the negative samples report by report. An explicit loop is used instead
# of `DataFrame.apply` because the function has side effects (it fills the lists
# above), and the returned rows are kept since they carry the per-report summary stats.
labelled_reports_stats = pd.DataFrame(
    [get_unlabelled_paragraphs_of_report(row) for _, row in labelled_reports.iterrows()]
)

# Build the negative labels dataframe (hot samples first, then weak ones).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
negative_label_columns = ["report_id", "page", "paragraph_no", "text", "coder", "neg_type"]
df_hot_neg_labels = pd.DataFrame(hot_negative_labels, columns=negative_label_columns)
df_weak_neg_labels = pd.DataFrame(weak_negative_labels, columns=negative_label_columns)
df_labels_negative = pd.concat([df_hot_neg_labels, df_weak_neg_labels], ignore_index=True)

In [18]:
# Persist the positive and the negative samples as separate pickle files (protocol 4)
positive_labels_file = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports/Firm_AnnualReport_Labels_Test_Positive.pkl"
negative_labels_file = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports/Firm_AnnualReport_Labels_Test_Negative.pkl"

df_labels_positive.to_pickle(positive_labels_file, protocol=4)
df_labels_negative.to_pickle(negative_labels_file, protocol=4)

In [19]:
# Stack the positive and the negative samples into one dataframe with a fresh index
positive_and_negative = [df_labels_positive, df_labels_negative]
df_labels = pd.concat(positive_and_negative, ignore_index=True)

# Overview per negative type; dropna=False keeps the rows without a `neg_type` as their own group
df_labels.groupby(by="neg_type", dropna=False).count()

Unnamed: 0_level_0,report_id,cro,cro_sub_type,page,paragraph_no,label,comment,text,coder,indirect,vague,past,keyword,span_id,cro_sub_type_combined
neg_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
hot,2162,0,0,2162,2162,0,0,2162,2162,0,0,0,0,0,0
weak,25970,0,0,25970,25970,0,0,25970,25970,0,0,0,0,0,0
,94,94,89,94,94,94,32,94,94,94,94,94,26,4,89


In [20]:
# Persist the combined (positive + negative) test set (pickle protocol 4)
all_labels_file = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports/Firm_AnnualReport_Labels_Test.pkl"
df_labels.to_pickle(all_labels_file, protocol=4)