### Adapted from https://github.com/dafrie/fin-disclosures-nlp; run it with the dependencies listed in that repository.

In [23]:
import os
import re
import pandas as pd
import numpy as np
from dotenv import dotenv_values
from pathlib import Path
import sys

# Read the project settings from the .env file; BASE_PATH is the project root.
config = dotenv_values("./../../config/.env")
base_path = Path(config["BASE_PATH"])

# Directories derived from the project root.
data_path = base_path.joinpath("data")
writing_path = base_path.joinpath("writing", "MSc-Thesis-Emerging-Risks")
table_path = writing_path.joinpath("tables")
figure_path = writing_path.joinpath("figures")

# Make the project-local modules importable from this notebook.
sys.path.append(str(base_path.joinpath("modules")))


### Get path information

In [4]:
def get_company_paths(path):
    """Return the non-hidden sub-directories of *path*.

    Args:
        path: Directory to scan.

    Returns:
        A list of ``os.DirEntry`` objects, one per directory whose name does
        not start with a dot, in the order ``os.scandir`` yields them.
    """
    return [
        entry
        for entry in os.scandir(path)
        if not entry.name.startswith('.') and entry.is_dir()
    ]


def get_reports_paths(path):
    """Return the non-hidden ``.pdf`` files located directly inside *path*.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of ``pathlib.Path`` objects. Only regular files whose name
        does not start with a dot and whose suffix is exactly ``.pdf``
        (case-sensitive) are included.
    """
    candidates = ((entry, Path(entry.path)) for entry in os.scandir(path))
    return [
        pdf_path
        for entry, pdf_path in candidates
        if not entry.name.startswith('.')
        and pdf_path.suffix == '.pdf'
        and entry.is_file()
    ]

def get_reports_paths_information():
    """Build an index of the raw annual-report PDFs found on disk.

    Scans every company folder below ``data_path/"stoxx"/"annual_reports_raw"``
    and creates one row per PDF whose file name contains exactly one
    four-digit number, which is taken as the report year.

    Returns:
        A ``pandas.DataFrame`` indexed by ``report_id``
        (``"<company>-AR_<year>"``) with the columns ``company``, ``year``
        (a string), ``path`` (``pathlib.Path`` of the PDF), ``extracted``
        (True when ``<year>.yml`` exists in the company's folder below
        ``annual_reports_extracted``) and ``inferred`` (always False here).

    Files whose name does not contain exactly one four-digit number are
    reported on stdout and skipped; the remaining files of that company are
    still indexed.
    """
    year_pattern = re.compile(r"\d{4}")
    column_names = ['report_id', 'company', 'year', 'path', 'extracted', 'inferred']
    raw_root = data_path/"stoxx"/"annual_reports_raw"
    extracted_root = data_path/"stoxx"/"annual_reports_extracted"

    rows = []
    for company_dir in get_company_paths(raw_root):
        company = company_dir.name

        # Years for which an extracted YAML file already exists.
        extracted_dir = extracted_root/company
        if os.path.isdir(extracted_dir):
            extracted_years = {
                x.split(".")[0] for x in os.listdir(extracted_dir) if x[-4:] == ".yml"
            }
        else:
            extracted_years = set()

        for p in get_reports_paths(company_dir.path):
            matches = year_pattern.findall(p.stem)
            if len(matches) != 1:
                # Skip only the offending file; stopping here would make the
                # result depend on the arbitrary directory listing order.
                print(f"Error - Invalid reports file found: {p}")
                continue
            year = matches[0]
            report_id = f"{company}-AR_{year}"
            rows.append([report_id, company, year, p, year in extracted_years, False])

    df_reports = pd.DataFrame(rows, columns=column_names)
    df_reports.set_index("report_id", inplace=True)
    return df_reports

In [5]:
# Load the STOXX Europe 600 Financials company list; later cells read its
# "company_name", "Insurance" and "company_folder" columns.
df = pd.read_csv(data_path/"stoxx"/"stoxx_europe_600_financials.csv")

In [28]:
def _insurance_label(flag):
    """Map the raw ``Insurance`` flag string to the label shown in the table."""
    normalized = flag.lower()
    if normalized == "true":
        return "Insurance"
    if normalized == "false":
        return "Non-Insurance"
    # Every other value is treated as a partly-insurance company.
    return "Partly-Insurance"


# Export a two-column LaTeX table (company name, insurance label) for the thesis.
latex = df.rename(columns={"company_name": "Company"})
latex["Insurance"] = df.Insurance.apply(_insurance_label)
s = latex[["Company", "Insurance"]].style.hide()
s.to_latex(
    table_path/"STOXX_Insurance_labling.tex",
    column_format="ll",
    label="tab:stoxx_insurance_labels",
    caption="Insurance labelling of STOXX Europe 600 Financials companies.",
)

In [22]:
# Build the master tables: one for (partly) insurance companies, one for the
# remaining financials, and one holding the extraction state of every report.
df_reports = get_reports_paths_information()
df = pd.read_csv(data_path/"stoxx"/"stoxx_europe_600_financials.csv")

# Lower-case the flag once instead of once per category.
insurance_flag = df["Insurance"].apply(lambda x: x.lower())

insurance_company_folders = df[insurance_flag == "true"]["company_folder"].unique()
insurance_company_folders = np.concatenate((insurance_company_folders, np.array(["SwissReCorporateSolutions"])))  #Provided by Swiss Re but not part of STOXX 600
partly_insurance_company_folders = df[insurance_flag == "partly"]["company_folder"].unique()
financials_company_folders = df[insurance_flag == "false"]["company_folder"].unique()

# Select rows and columns in a single .loc call and take an explicit copy, so
# the column assignments below never act on a chained-indexing intermediate
# (which can trigger pandas' SettingWithCopyWarning).
insurance_mask = df_reports["company"].isin(np.concatenate((insurance_company_folders, partly_insurance_company_folders)))
df_insurance = df_reports.loc[insurance_mask, ["company", "year"]].copy()
df_insurance["only_insurance"] = df_insurance["company"].isin(insurance_company_folders)
df_insurance["filing_type"] = "annual_report"
df_insurance.to_pickle(data_path/"stoxx"/"master_stoxx.pkl")

financials_mask = df_reports["company"].isin(financials_company_folders)
df_financials = df_reports.loc[financials_mask, ["company", "year"]].copy()
df_financials["only_insurance"] = False
df_financials["filing_type"] = "annual_report"
df_financials.to_pickle(data_path/"stoxx"/"financials_main.pkl")

# The extraction master keeps only path / extracted / inferred per report.
# NOTE(review): index=False means the CSV is written without report_id; the
# pickle keeps it as the index. Confirm that this is intended.
df_reports.drop(["year", "company"], axis = 1, inplace=True)
df_reports.to_csv(data_path/"stoxx"/"extraction_master.csv", index=False)
df_reports.to_pickle(data_path/"stoxx"/"extraction_master.pkl")

### PDF Extraction

In [16]:
from fin_disclosure_nlp.pdf_extractor import PdfExtractor
from fin_disclosure_nlp.preprocessing import DocumentPreprocessor
from tqdm.notebook  import tqdm
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor
import yaml
# Pool of 4 worker processes shared by the extraction and paragraph cells below.
executor = ProcessPoolExecutor(max_workers=4)

# Reload the tables written by the master-table cell and attach the
# path / extracted / inferred columns to each insurance report via report_id.
df_insurance = pd.read_pickle(data_path/"stoxx"/"master_stoxx.pkl")
df_reports = pd.read_pickle(data_path/"stoxx"/"extraction_master.pkl")
df = df_insurance.join(df_reports, on="report_id", how="left")

In [33]:
# Reload the insurance master table and the extraction state table from disk.
df_insurance = pd.read_pickle(data_path/"stoxx"/"master_stoxx.pkl")
df_reports = pd.read_pickle(data_path/"stoxx"/"extraction_master.pkl")

In [8]:
# Show the PDF paths of the reports that have not been extracted yet.
df[df["extracted"] == False]["path"]

report_id
AegonNV-AR_2003    /home/andreas/Polybox/Project-Support-Material...
Name: path, dtype: object

In [9]:
def extract_file(input_file, output_folder, **kwargs):
    """Run ``PdfExtractor`` on one PDF, creating the output folder if needed.

    Args:
        input_file: Path of the PDF to extract.
        output_folder: Directory handed to ``PdfExtractor`` for its output.
        **kwargs: Passed through unchanged to ``PdfExtractor``.
    """
    # exist_ok avoids the check-then-create race when several worker
    # processes handle reports of the same company at the same time.
    os.makedirs(output_folder, exist_ok=True)
    PdfExtractor(input_file=input_file, output_folder=output_folder, **kwargs)
    
# Queue one extraction job per report that has not been extracted yet and
# remember the matching input path so that failures can be attributed.
futures = []
paths = []

for index, row in df[df["extracted"] == False].iterrows():
    # The output folder mirrors the raw folder: "..._raw" -> "..._extracted".
    out_folder = str(Path(row.path).parent).replace("_raw", "_extracted")
    futures.append(executor.submit(extract_file, row.path, out_folder))
    paths.append(row.path)

print("All Tasks in the queue")

# Wait for every job; a failing report is logged together with its path and
# does not stop the remaining ones.
for future, path in tqdm(zip(futures, paths), total=len(futures)):
    try:
        future.result()
    except Exception as e:
        print("Error")
        print(e)
        print(path)

All Tasks in the queue


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Load the paragraphs collected so far, or start from an empty table.
if os.path.exists(data_path/"stoxx"/"paragraphs_stoxx.pkl"):
    df_stoxx = pd.read_pickle(data_path/"stoxx"/"paragraphs_stoxx.pkl")
else:
    df_stoxx = pd.DataFrame()

# Report ids that already have paragraphs. On the very first run the table is
# empty and has no "report_id" column, so nothing counts as processed.
stoxx_flat = df_stoxx.reset_index()
if "report_id" in stoxx_flat.columns:
    processed_report_ids = stoxx_flat["report_id"].unique()
else:
    processed_report_ids = []

# Keep only insurance reports without paragraphs yet and attach their
# path / extracted / inferred information.
df_reports = get_reports_paths_information().drop(["company", "year"], axis=1)
df = df_insurance[~df_insurance.index.isin(processed_report_ids)].join(df_reports, on="report_id", how="left")

In [25]:
def process_file(input_file, **kwargs):
    """Split the extracted text of one report into paragraphs.

    The extracted YAML file is located by replacing ``"_raw"`` with
    ``"_extracted"`` and ``".pdf"`` with ``".yml"`` in *input_file*. Each
    page's text is run through ``DocumentPreprocessor`` and split on blank
    lines.

    Args:
        input_file: Path of the raw PDF report.
        **kwargs: Accepted but not used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``report_id``
        (``"<company folder>-AR_<file stem>"``), ``year`` (the file stem),
        ``page_no`` and ``text``; rows with empty text are dropped.
    """
    input_path = Path(input_file)
    yaml_path = str(input_file).replace("_raw", "_extracted").replace('.pdf', '.yml')
    with open(yaml_path, 'r') as f:
        pages = yaml.safe_load(f)

    paragraphs = []
    page_numbers = []
    for page in pages["pages"]:
        page_paragraphs = DocumentPreprocessor(page["text"]).process().split("\n\n")
        paragraphs += page_paragraphs
        page_numbers += [page["page_no"]] * len(page_paragraphs)

    # Use the full folder name (not .stem) so that company folders containing
    # a dot yield the same report_id as the reports index, which is built
    # from the directory entry's name.
    report_id = f"{input_path.parent.name}-AR_{input_path.stem}"
    df_inter = pd.DataFrame({
        "report_id": [report_id] * len(paragraphs),
        "year": [input_path.stem] * len(paragraphs),
        "page_no": page_numbers,
        "text": paragraphs,
    })
    df_inter = df_inter[df_inter["text"].str.len() > 0]
    return df_inter

# Queue one paragraph-splitting job per report that has been extracted.
futures = []
paths = []
for index, row in df[df.extracted].iterrows():
    futures.append(executor.submit(process_file, row.path))
    paths.append(row.path)
print("All Tasks in the queue")

# Collect the per-report frames and concatenate once at the end; growing
# df_stoxx inside the loop copies the whole table on every iteration.
new_frames = []
for future, path in tqdm(zip(futures, paths), total=len(futures)):
    try:
        new_frames.append(future.result())
    except Exception as e:
        print("Error")
        print(e)
        print(path)
        # Remove the extracted YAML of the failing report. It may not exist
        # at all, which must not abort the loop before the results are saved.
        try:
            os.remove(str(path).replace("_raw", "_extracted").replace('.pdf', '.yml'))
        except FileNotFoundError:
            pass

if new_frames:
    df_stoxx = pd.concat([df_stoxx, *new_frames], ignore_index=True)

df_stoxx.to_pickle(data_path/"stoxx"/"paragraphs_stoxx.pkl")

All Tasks in the queue


  0%|          | 0/571 [00:00<?, ?it/s]

# Preprocessing

In [29]:
# Clean the paragraph table and derive per-paragraph features.
# NOTE(review): the table is read from data_path/"paragraphs_stoxx.pkl" but
# written to data_path/"stoxx"/"paragraphs_stoxx.pkl" - confirm that the
# differing locations are intended.
df_stoxx = pd.read_pickle(data_path/"paragraphs_stoxx.pkl")
df_stoxx = df_stoxx.drop(columns=["year"])

# Collapse every whitespace run into a single space and trim both ends.
collapsed_text = df_stoxx["text"].str.replace(r"\s+", " ", regex=True)
df_stoxx["text"] = collapsed_text.str.strip()

# Word count and running paragraph number within each (report, page).
df_stoxx["n_words"] = df_stoxx["text"].str.split(r"\s", regex=True).str.len()
df_stoxx["paragraph_nr"] = df_stoxx.groupby(["report_id", "page_no"]).cumcount()

# Case-insensitive keyword flags.
df_stoxx["loss_kw"] = df_stoxx["text"].str.contains("loss", case=False)
df_stoxx["unexpected_kw"] = df_stoxx["text"].str.contains("unexpected", case=False)

# Turn the previous index into a regular column.
df_stoxx = df_stoxx.reset_index()

# Make sure the count columns are numeric.
for numeric_column in ("n_words", "page_no", "paragraph_nr"):
    df_stoxx[numeric_column] = pd.to_numeric(df_stoxx[numeric_column])

df_stoxx.to_pickle(data_path/"stoxx"/"paragraphs_stoxx.pkl")

In [32]:
# Count paragraphs for each combination of the "loss" / "unexpected" keyword flags.
df_stoxx.groupby(["loss_kw", "unexpected_kw"]).size()

loss_kw  unexpected_kw
False    False            2267080
         True                1043
True     False             145825
         True                 769
dtype: int64