# Set-up and data loading

**Note**: Much of the code for loading and processing the reports is adapted from https://github.com/llbtl/paper_ssm01/tree/main. Copyright for that code remains with the authors of that paper.

In [1]:
import numpy as np
import pandas as pd
import fitz
import nltk
from nltk.tokenize import sent_tokenize
import os

In [3]:
# Input directory holding the report PDFs and the CSV the sentences are written to.
# Built with os.path.join so the separators are valid on every platform
# (the resulting strings are unchanged on Windows).
path_pdf = os.path.join('..', 'data', 'reports')
fname_out = os.path.join('..', 'data_structured', 'report_sentences.csv')

# Loading and Processing the CSR Reports

The following functions turn the PDF files (CSR reports) into a table in which each row holds one sentence of the report text. The documents are split into sentences at this stage because sentence embeddings will be generated later on using S-BERT.

In [4]:
def get_cnt(text):
    """Return how many whitespace-separated tokens of *text* are purely alphanumeric.

    Tokens containing punctuation (e.g. ``"word,"``) are not counted.
    """
    return sum(1 for token in text.split() if token.isalnum())

In [5]:
def get_text(block_lst, min_word_cnt=6):
    """Join the usable text blocks of one page into a single string.

    Parameters
    ----------
    block_lst : iterable of sequences
        Page blocks; index 4 is read as the block's text and index 6 as its
        block type (0 = text). Blocks of any other type are skipped.
    min_word_cnt : int, default 6
        Blocks with fewer purely alphanumeric tokens than this (as counted
        by ``get_cnt``) are discarded.

    Returns
    -------
    str
        The kept blocks, ASCII-only, joined with a newline. Empty string if
        no block qualifies.
    """
    text_lst = []
    for block in block_lst:
        if block[6] != 0:  # block_type: 0 = text
            continue

        # Coca Cola's report uses the typographic apostrophe; map it to the
        # ASCII one first so the ASCII filter below does not blank it out.
        text = block[4].replace('’', "'")
        # Every remaining non-ASCII character becomes a space.
        text = ''.join(ch if ord(ch) < 128 else ' ' for ch in text)

        # Skip blocks with fewer than min_word_cnt countable words.
        if get_cnt(text) < min_word_cnt:
            continue

        # Re-join words that were hyphenated across a line break.
        text_lst.append(text.replace('-\n', ''))

    return '\n'.join(text_lst)

In [7]:
def get_sentence(fname):
    """Extract the sentences of the PDF at *fname*, page by page.

    Each page's text blocks are filtered and joined by ``get_text``, split
    into sentences with ``sent_tokenize``, and additionally split on blank
    lines. Whitespace inside every sentence is collapsed to single spaces.

    Parameters
    ----------
    fname : str
        Path of the PDF file to read.

    Returns
    -------
    list of str
        Sentences in document order (may contain empty strings).
    """
    doc = fitz.open(fname)
    sent_lst = []
    try:
        for page in doc:
            text = get_text(page.get_text_blocks())

            for token in sent_tokenize(text):
                # Split each tokenized sentence further on blank lines.
                for sentence in token.split('\n\n'):
                    # Collapse '\n', '\t' and repeated spaces; strip both ends.
                    sent_lst.append(' '.join(sentence.split()))
    finally:
        # Release the file handle even if extraction fails part-way.
        doc.close()

    return sent_lst

In [9]:
def gen_document(fname, sent_lst):
    """Build the sentence table for one report.

    Parameters
    ----------
    fname : str
        File name (or path) of the report; the company name is its base
        name without the extension, so dots inside the name are preserved
        (``'coca.cola.pdf'`` -> ``'coca.cola'``).
    sent_lst : list of str
        Sentences extracted from the report.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with columns ``doc_type`` (always ``'report'``),
        ``company`` and ``sentence``.
    """
    # splitext removes only the final extension; basename drops any directory.
    company = os.path.splitext(os.path.basename(fname))[0]

    return pd.DataFrame(
        {
            'doc_type': 'report',
            'company': company,
            'sentence': sent_lst,
        }
    )

In [10]:
def read_filelist(path):
    """Parse every PDF in directory *path* into one sentence-level DataFrame.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive). Entries whose extension is not
        ``.pdf`` (compared case-insensitively) are skipped.

    Returns
    -------
    pandas.DataFrame
        The per-report frames from ``gen_document`` stacked in file-name
        order, each keeping its own 0-based index. An empty DataFrame if the
        directory contains no PDF.
    """
    frames = []

    # os.listdir returns entries in arbitrary order; sort so the row order
    # is reproducible between runs and machines.
    for fname in sorted(os.listdir(path)):
        p_fname = os.path.join(path, fname)
        print('path + fname >>>', p_fname)

        if os.path.splitext(fname)[1].lower() != '.pdf':
            continue
        print('fname >>>', fname)

        sent_lst = get_sentence(p_fname)
        frames.append(gen_document(fname, sent_lst))

    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(frames)

In [11]:
%%time
# Parse every PDF found under path_pdf into one sentence-per-row DataFrame.
df = read_filelist(path_pdf)
print('==== End of jobs ====')

path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\reports\abb.pdf
fname >>> abb.pdf
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\reports\adidas.pdf
fname >>> adidas.pdf
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\reports\airbus.pdf
fname >>> airbus.pdf
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\reports\apple.pdf
fname >>> apple.pdf
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\reports\bayer.pdf
fname >>> bayer.pdf
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\reports\beiersdorf.pdf
fname >>> beiersdorf.pdf
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\Documents\GitHub\Thesis\data\reports\blackrock.pdf
fname >>> blackrock.pdf
path + fname >>> C:\Users\tnguyen10\OneDrive - Deloitte (O365D)\D

In [13]:
# Keep only the first occurrence of each sentence text, compared across all reports.
df.drop_duplicates(subset = ['sentence'], inplace = True)

# Further cleaning

In [15]:
# Work on a copy so the extraction result in df stays untouched.
df_report = df.copy()

In [17]:
# (rows, columns) of the sentence table at this point.
df_report.shape

(45892, 3)

In [18]:
# Clean the sentence text:
#   1. remove most hyperlinks, bracketed markers such as "[1]:", tokens containing
#      a slash, multi-hyphen tokens and stray square brackets,
#   2. strip leading/trailing whitespace,
#   3. collapse runs of whitespace into a single space.
df_report["sentence"] = (
    df_report["sentence"]
    .replace(r'http\S+|\[.\]:?|www\S+|\w+/\S+|\w+-\w+-\S+|\[|\]', '', regex=True)
    .replace(r'^\s+|\s+$', '', regex=True)
    .replace(r'\s{2,}', ' ', regex=True)
)
# Repair company names left with a gap where a character was blanked out, and drop
# double quotes. regex=False keeps these literal replacements independent of the
# pandas version's default for str.replace.
df_report['sentence'] = df_report['sentence'].str.replace('Nestl ', 'Nestle ', regex=False)
df_report['sentence'] = df_report['sentence'].str.replace('Mondel z', 'Mondelez', regex=False)
df_report['sentence'] = df_report['sentence'].str.replace('"', '', regex=False)
# Removing quotes can expose new leading/trailing or doubled spaces, so normalise
# the whitespace once more.
df_report["sentence"] = (
    df_report["sentence"]
    .replace(r'^\s+|\s+$', '', regex=True)
    .replace(r'\s{2,}', ' ', regex=True)
)

In [19]:
# Number of whitespace-separated tokens per sentence.
df_report["word count"] = df_report["sentence"].str.split().str.len()
# Drop sentences with no tokens left. .copy() makes the result an independent
# frame, so later in-place edits do not act on a slice of the old one.
df_report = df_report[df_report["word count"] > 0].copy()

In [20]:
# Renumber the rows consecutively after filtering, discarding the old index.
df_report.reset_index(inplace = True, drop = True)

In [21]:
# The PDF extraction cannot deal with sentences that span across pages, so such a
# sentence arrives as two consecutive rows. Merge a row into the one that follows it
# when it does not end with punctuation and the following row starts with a
# lowercase letter.
import string


def ends_with_punctuation(s):
    """Return True if *s*, ignoring surrounding whitespace, ends with ASCII punctuation.

    An empty or whitespace-only string returns False.
    """
    stripped = s.strip()
    return bool(stripped) and stripped[-1] in string.punctuation


def merge_split_sentences(frame, text_col='sentence', group_col='company'):
    """Merge sentence fragments that were split over two consecutive rows.

    Row ``k-1`` is folded into row ``k`` when its text does not end with
    punctuation and row ``k`` starts with a lowercase letter. The merged text
    is stored on row ``k`` and row ``k-1`` is removed, so chains of fragments
    collapse into the last row of the chain.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with one sentence per row, in document order.
    text_col : str, default 'sentence'
        Column holding the sentence text.
    group_col : str, default 'company'
        Rows are merged only if they share the same value in this column, so
        the end of one report is never glued to the start of the next. Ignored
        if the column does not exist.

    Returns
    -------
    pandas.DataFrame
        A new frame; surviving rows keep their original index labels.
    """
    texts = frame[text_col].tolist()
    groups = frame[group_col].tolist() if group_col in frame.columns else None
    keep = [True] * len(texts)

    for pos in range(1, len(texts)):
        previous, current = texts[pos - 1], texts[pos]
        # Nothing sensible to merge when either side is empty.
        if not previous.strip() or not current:
            continue
        if groups is not None and groups[pos - 1] != groups[pos]:
            continue
        if not ends_with_punctuation(previous) and current[0].islower():
            texts[pos] = previous + ' ' + current
            keep[pos - 1] = False

    merged = frame.copy()
    merged[text_col] = texts
    # A single boolean selection replaces one in-place drop per merged row.
    return merged[keep]


df_report = merge_split_sentences(df_report)

In [22]:
# Recount the tokens per sentence, then keep sentences of 6 to 99 words.
df_report["word count"] = df_report["sentence"].str.split().map(len)
df_report = df_report.loc[df_report["word count"].between(6, 99)]

In [24]:
# Sentences made up of more than half uppercase characters are usually nonsensical,
# so they are flagged for removal.
def is_mostly_uppercase(sentence):
    """Return True if more than half of the characters of *sentence* are uppercase.

    All characters count towards the total, including spaces, digits and
    punctuation. An empty string returns False.
    """
    if not sentence:
        # Avoid dividing by zero; an empty string has no uppercase majority.
        return False
    uppercase = sum(1 for c in sentence if c.isupper())
    return uppercase / len(sentence) > 0.5

# flag the mostly-uppercase sentences and keep only the remaining rows
df_report = df_report.loc[~df_report['sentence'].map(is_mostly_uppercase)]

# show the filtered dataframe
df_report

Unnamed: 0,doc_type,company,sentence,word count
0,report,abb,One year into ABB's 2030 sustainability strate...,21
1,report,abb,"Compared with our baseline year of 2019, we ha...",27
2,report,abb,"Last year, we recorded no work-related fatalit...",13
3,report,abb,We also increased the number of women in senio...,39
4,report,abb,"Alongside these headline achievements, we made...",26
...,...,...,...,...
45878,report,walmart,Building on our experiences as a founding memb...,49
45879,report,walmart,LABS works with engineering companies to devel...,20
45880,report,walmart,Factories develop supervised corrective action...,18
45881,report,walmart,LABS has been active in India and Vietnam sinc...,14


In [28]:
# De-duplicate again on the cleaned sentence text; the first occurrence is kept.
df_report.drop_duplicates(subset = ['sentence'], inplace = True)

In [25]:
# Renumber the rows consecutively, discarding the old index.
df_report.reset_index(inplace = True, drop = True)

In [242]:
# Write the cleaned sentence table to fname_out, without the index column.
df_report.to_csv(fname_out, index = False)