# Inference with baseline model (TF-IDF SVM classifier)


In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
from pathlib import Path
import importlib
sys.path.append('..')

import pandas as pd
import numpy as np

import data.dataframe_preparation as preparation
from data.labels_postprocessing import process
from data.dataframe_preparation import get_counts_per_page, get_keywords_from_file, get_text_from_page, get_count_matrix
from data.preprocessing import DocumentPreprocessor
from data.inference_widgets import CroInferenceViewer

In [2]:
# ----------------------------- configuration -----------------------------
# Absolute locations of the input files, the persisted tables and the models.
FIRM_METADATA = os.path.abspath("../input_files/Firm_Metadata.csv")
DATA_INPUT_PATH = os.path.abspath("../input_files/annual_reports/")
MASTER_DATA_PATH = os.path.abspath("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Data/stoxx_inference/Firm_AnnualReport.csv")
INFERENCE_PARAGRAPH_PATH = os.path.abspath("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Data/stoxx_inference/Firm_AnnualReport_Paragraphs_with_actual_back.pkl")
MODELS_PATH = os.path.abspath("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Models/stoxx_inference")
# --------------------------------------------------------------------------

import pickle

# Master table: read the CSV and use its "id" column as the index.
df = pd.read_csv(MASTER_DATA_PATH).set_index("id")

# Paragraph table: reuse the pickled frame when it exists, otherwise start empty.
df_paragraphs = (
    pd.read_pickle(INFERENCE_PARAGRAPH_PATH)
    if Path(INFERENCE_PARAGRAPH_PATH).is_file()
    else pd.DataFrame()
)

# Classifier: unpickle the stored model and keep its label list at hand.
with open(os.path.join(MODELS_PATH, 'multilabel_svm_cro.pkl'), 'rb') as f:
    clf = pickle.load(f)
label_list = clf.label_list
    

In [4]:
# Display the master table (indexed by "id") to inspect the loaded reports.
df

Unnamed: 0_level_0,company,orig_report_type,report_type,year,input_file,output_file,should_infer,is_inferred,company_id,firm_name,ticker,country,icb_industry,icb_supersector,labelling_dataset
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
dk_novo_nordisk_b-AR_2016,novo_nordisk_b,AR,AR,2016,dk_novo_nordisk_b/AR_2016.pdf,AR_2016.yml,True,True,dk_novo_nordisk_b,NOVO NORDISK B,,dk,20 Health Care,2010 Health Care,
dk_novo_nordisk_b-AR_2002,novo_nordisk_b,AR,AR,2002,dk_novo_nordisk_b/AR_2002.pdf,AR_2002.yml,True,True,dk_novo_nordisk_b,NOVO NORDISK B,,dk,20 Health Care,2010 Health Care,
dk_novo_nordisk_b-AR_2003,novo_nordisk_b,AR,AR,2003,dk_novo_nordisk_b/AR_2003.pdf,AR_2003.yml,True,True,dk_novo_nordisk_b,NOVO NORDISK B,,dk,20 Health Care,2010 Health Care,
dk_novo_nordisk_b-AR_2017,novo_nordisk_b,AR,AR,2017,dk_novo_nordisk_b/AR_2017.pdf,AR_2017.yml,True,True,dk_novo_nordisk_b,NOVO NORDISK B,,dk,20 Health Care,2010 Health Care,
dk_novo_nordisk_b-AR_2001,novo_nordisk_b,AR,AR,2001,dk_novo_nordisk_b/AR_2001.pdf,AR_2001.yml,True,True,dk_novo_nordisk_b,NOVO NORDISK B,,dk,20 Health Care,2010 Health Care,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ch_novartis-AR_2012,novartis,AR,AR,2012,ch_novartis/AR_2012.pdf,AR_2012.yml,True,True,ch_novartis,NOVARTIS,,ch,20 Health Care,2010 Health Care,
ch_novartis-AR_2008,novartis,AR,AR,2008,ch_novartis/AR_2008.pdf,AR_2008.yml,True,True,ch_novartis,NOVARTIS,,ch,20 Health Care,2010 Health Care,training
ch_novartis-AR_2009,novartis,AR,AR,2009,ch_novartis/AR_2009.pdf,AR_2009.yml,True,True,ch_novartis,NOVARTIS,,ch,20 Health Care,2010 Health Care,
ch_novartis-AR_2019,novartis,AR,AR,2019,ch_novartis/AR_2019.pdf,AR_2019.yml,True,True,ch_novartis,NOVARTIS,,ch,20 Health Care,2010 Health Care,


## Get paragraphs of all reports

In [None]:
# Keyword vocabulary; get_paragraphs_of_report() passes it to get_counts_per_page()
# to find the pages of a report that contain keyword hits.
vocabulary = get_keywords_from_file("../data/keyword_vocabulary.txt")

def get_paragraphs_of_report(report_row, add_adjunct_pages=True):
    """Collect the paragraphs of all keyword-hit pages of one report.

    The parsed report file is located below ``DATA_INPUT_PATH`` using the
    ``input_file``, ``orig_report_type``, ``year`` and ``output_file``
    entries of ``report_row``. Pages with keyword hits are determined with
    ``get_counts_per_page`` and the module-level ``vocabulary``.

    Args:
        report_row: Row of the master table (anything indexable by the four
            keys named above).
        add_adjunct_pages: If true, the page before and the page after every
            keyword-hit page are included as well.

    Returns:
        A list of dicts with the keys ``page_no``, ``paragraph_no``, ``text``
        and ``is_adjunct`` (``True`` when the page itself had no keyword
        hit). Pages are emitted in ascending page order.
    """
    # Locate the parsed report file next to the original input file
    path = os.path.join(DATA_INPUT_PATH, report_row['input_file'])
    folder = os.path.dirname(path)
    parsed_report_file_path = os.path.join(
        folder,
        report_row['orig_report_type'] + '_' + str(int(report_row['year'])),
        report_row['output_file'],
    )

    # Pages with keyword hits
    pages = get_counts_per_page(parsed_report_file_path, vocabulary)
    keyword_pages = set(pages.index)
    page_indices = set(keyword_pages)

    # Add the neighbouring pages if requested
    if add_adjunct_pages:
        for p in keyword_pages:
            if p > 0:
                page_indices.add(p - 1)
            # The total page count is not known here, so the following page
            # is always added; a page past the end of the report is skipped
            # below when the lookup raises IndexError.
            page_indices.add(p + 1)

    # For each page, get all paragraphs. Sorting makes the row order
    # deterministic instead of depending on set iteration order.
    result = []
    for page_no in sorted(page_indices):
        try:
            text = get_text_from_page(parsed_report_file_path, page_no)
            processed_doc = DocumentPreprocessor(text).process()
        except IndexError:
            continue

        is_adjunct = page_no not in keyword_pages
        result.extend(
            {"page_no": page_no, "paragraph_no": idx, "text": paragraph, "is_adjunct": is_adjunct}
            for idx, paragraph in enumerate(processed_doc.split('\n\n'))
        )
        print(f"Page no: {page_no}")
    return result


from tqdm.notebook import trange, tqdm_notebook

# Loop through all reports, extract their paragraphs and append them to
# df_paragraphs. Progress is persisted after every report so that an
# interrupted run can be resumed without redoing finished reports.
for index, row in tqdm_notebook(df.iterrows(), total=df.shape[0]):
    # Skip reports that are excluded or were already processed
    if not row['should_infer'] or row['is_inferred']:
        continue

    paragraphs = get_paragraphs_of_report(row, add_adjunct_pages=True)
    if paragraphs:
        df_report_paragraphs = pd.DataFrame(paragraphs)

        # Tag the paragraphs of this report with the report's id (the master
        # table index) before appending them to the global paragraph table.
        df_report_paragraphs["report_id"] = index
        df_paragraphs = pd.concat([df_paragraphs, df_report_paragraphs], ignore_index=True)

    # Update progress
    df.loc[index, 'is_inferred'] = True

    # Save files (checkpoint after every report)
    df.to_csv(MASTER_DATA_PATH)
    df_paragraphs.to_pickle(INFERENCE_PARAGRAPH_PATH, protocol=4)
    

In [None]:
# TODO: Add the inference step here instead of above, i.e. make it dynamic
# Store the classifier's predictions and predicted probabilities for every
# paragraph text; .tolist() puts one Python object per row into the column.
df_paragraphs["preds_svm_cro"] = clf.predict(df_paragraphs['text']).tolist()
df_paragraphs["preds_prob_svm_cro"] = clf.predict_proba(df_paragraphs['text']).tolist()


In [None]:
# Expand the per-paragraph prediction / probability lists into one column per
# label ("<label>_predicted" and "<label>_prob").
# The helper frames reuse df_paragraphs' index: assigning a DataFrame aligns on
# the index, so a fresh 0..n-1 index would silently produce NaN for every row
# whose index label does not match.
df_paragraphs[[f"{label}_predicted" for label in label_list]] = pd.DataFrame(
    df_paragraphs.preds_svm_cro.tolist(), index=df_paragraphs.index
)
df_paragraphs[[f"{label}_prob" for label in label_list]] = pd.DataFrame(
    df_paragraphs.preds_prob_svm_cro.tolist(), index=df_paragraphs.index
)

# Merge dataset: attach the master-table columns of the report each paragraph
# belongs to (left join, so paragraphs without a matching report are kept).
df_paragraphs_merged = pd.merge(df_paragraphs, df, how="left", left_on="report_id", right_index=True)

In [None]:
# Count non-null values per column for each labelling dataset; dropna=False
# keeps the group of rows without a labelling dataset.
# NOTE(review): "labelling_dataset" is a column of the master table `df`; unless
# the loaded paragraph pickle already contains it, this presumably should run
# on df_paragraphs_merged - confirm.
df_paragraphs.groupby("labelling_dataset", dropna=False).count()

# Temporary: Combine from labels

In [None]:
# Rerun once test is complete
# Attach the manually assigned ("actual") labels to the inferred paragraphs.
LABELS_PATH = "/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Labelling/annual reports"
df_labels_training = pd.read_pickle(os.path.join(LABELS_PATH, "Firm_AnnualReport_Labels_Training_Positive.pkl"))
df_labels_training_negative = pd.read_pickle(os.path.join(LABELS_PATH, "Firm_AnnualReport_Labels_Training_Negative.pkl"))
df_labels_test = pd.read_pickle(os.path.join(LABELS_PATH, "Firm_AnnualReport_Labels_Test_Positive.pkl"))


def _paragraph_ids(frame, columns):
    """Return one id per row of *frame*: the values of *columns* joined by "_"."""
    return frame.apply(lambda row: "_".join([str(row[c]) for c in columns]), axis=1)


def _actual_indicators(labels, column):
    """Return a 0/1 table (index: paragraph id, columns: "<value>_actual") for *column*."""
    counts = pd.crosstab(labels.id, labels[column], dropna=False).add_suffix('_actual')
    return (counts > 0) * 1


# Set ids (the label files name the page column "page")
id_columns = ['report_id', 'page', 'paragraph_no']
df_labels_training["id"] = _paragraph_ids(df_labels_training, id_columns)
df_labels_training_negative["id"] = _paragraph_ids(df_labels_training_negative, id_columns)
df_labels_test["id"] = _paragraph_ids(df_labels_test, id_columns)

# Quick check that we do not have overlapping labels
assert set(df_labels_training.id).isdisjoint(df_labels_training_negative.id), "Positive and negative training labels overlap!"
assert set(df_labels_training.id).isdisjoint(df_labels_test.id), "Training and test labels overlap!"

# One indicator column per label value: 1 if the paragraph carries that label
df_labels = pd.concat([df_labels_training, df_labels_test])
df_cro = _actual_indicators(df_labels, "cro")
df_cro_sub_type = _actual_indicators(df_labels, "cro_sub_type_combined")

# Build the same kind of id for the inferred paragraphs (the page column is
# called "page_no" here) after verifying that the id columns are unique.
id_columns = ['report_id', 'page_no', 'paragraph_no']
assert len(df_paragraphs_merged) == len(df_paragraphs_merged.groupby(id_columns).size()), "Should only have unique id's, something is not correct!"
df_paragraphs_merged["id"] = _paragraph_ids(df_paragraphs_merged, id_columns)

# Left joins: paragraphs without any label get NaN in the "*_actual" columns
df_paragraphs_merged = df_paragraphs_merged.merge(df_cro, how="left", left_on="id", right_index=True)
df_paragraphs_merged = df_paragraphs_merged.merge(df_cro_sub_type, how="left", left_on="id", right_index=True)

In [None]:
# Persist the merged paragraph table. Note that this file name ("..._with_actual.pkl")
# differs from INFERENCE_PARAGRAPH_PATH ("..._with_actual_back.pkl") loaded at the top.
df_paragraphs_merged.to_pickle("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Data/stoxx_inference/Firm_AnnualReport_Paragraphs_with_actual.pkl", protocol=4)

# Inference explorer

In [3]:
# CroInferenceViewer is already imported in the first cell; re-imported here so
# this cell can be run on its own.
from data.inference_widgets import CroInferenceViewer
# NOTE(review): this rebinds df_paragraphs_merged to the paragraph table loaded
# at the top and thereby discards a merged frame built by the cells above in the
# same session - presumably the loaded pickle already holds the merged columns; confirm.
df_paragraphs_merged = df_paragraphs
viewer = CroInferenceViewer(df_paragraphs_merged, label_list=label_list)

Output()

Output()

Output()

In [None]:
# List the available columns of the paragraph table.
df_paragraphs_merged.columns # cro_sub_type.unique()

In [None]:
# Ad-hoc inspection of one paragraph by position: first all of its fields,
# then its full text.
print(df_paragraphs_merged.iloc[549581])
print(df_paragraphs_merged.iloc[549581].text)

In [None]:
# Show all paragraphs whose "REPUTATION_actual" column equals 1.
df_paragraphs_merged.query("REPUTATION_actual == 1")