In [69]:
import os
import sys
import math
from datetime import date
from dateutil.relativedelta import relativedelta

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from lexicalrichness import LexicalRichness

%load_ext autoreload
%autoreload 2

sys.path.append('..')
from data import constants
from data import dataframe_preparation
from data.utils import tables

# Setup seaborn
# NOTE: 'text.usetex': True makes matplotlib render all text through LaTeX.
sns.set_theme(style="ticks", rc={'text.usetex' : True})
sns.set_context("paper")

# Read main file
# NOTE(review): absolute, machine-specific path; pickle files must come from a trusted source.
df = pd.read_pickle("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Data/stoxx_inference/Firm_AnnualReport_Paragraphs.pkl")
# Build one string key per row by joining report_id, page_no and paragraph_no with "_".
id_columns = ['report_id', 'page_no', 'paragraph_no']
# NOTE(review): row-wise apply; slow on large frames.
df["id"] = df.apply(lambda row: "_".join([str(row[c]) for c in id_columns]), axis=1)
df = df.set_index(["id"])
assert df.index.is_unique, "Index is not unique. Check the data!"

# Gate the second-stage label vector with element [1] of the first-stage labels:
# the element-wise product zeroes every second-stage label when that element is 0.
first_stage = df['1stage_preds_labels'].apply(lambda x: np.array(x[1]))
second_stage = df['2stage_preds_labels'].apply(lambda x: np.array(x))
df["labels"] = first_stage * second_stage
# Expand the label vector into one column per sub-category label.
df[constants.cro_sub_category_labels] = pd.DataFrame(df.labels.tolist(), index= df.index)
# Main-category flags: the first is set if any of the first two labels is set,
# the second if any of the remaining labels is set.
df[constants.cro_category_labels[0]] = df.labels.apply(lambda x: any(x[0:2]))
df[constants.cro_category_labels[1]] = df.labels.apply(lambda x: any(x[2:]))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
# Read master for scaling
# Report-level table indexed by report id; `icb_industry` loses its first three
# characters (presumably a code prefix - TODO confirm) and `country` is upper-cased.
df_master = (
    pd.read_csv("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Data/stoxx_inference/Firm_AnnualReport.csv")
    .set_index("id")
    .assign(
        icb_industry=lambda master: master["icb_industry"].str.slice(3),
        country=lambda master: master["country"].str.upper(),
    )
    .rename(columns={"year": "Year"})
)

# Per-year sum of the `is_inferred` column.
df_reports_count = df_master.groupby("Year")["is_inferred"].sum()

# Attach the report metadata to every paragraph row via its report_id.
df = df.merge(df_master, how="left", left_on="report_id", right_index=True)

In [78]:
# Distinct (report, page) pairs divided by the number of rows in the master table.
page_groups = df.groupby(["report_id", "page_no"]).count()
pages_per_report = len(page_groups) / len(df_master)
print("Average of relevant pages per report: ", pages_per_report)

Average of relevant pages per report:  45.46969696969697


# Config

In [7]:
# Selects which category set from `constants` drives labels and colours below.
category_level = "cro" # ["cro", "cro_sub_type"]
if category_level == "cro":
    categories = constants.cro_categories
else:
    categories = constants.cro_sub_categories

# Label and colour of each selected category, in the same order.
cro_category_labels = [category["label"] for category in categories]
colors = [category["color"] for category in categories]

# Directory that receives the exported figures.
export_dir = os.path.join("/Users/david/Nextcloud/Dokumente/Education/Uni Bern/Master Thesis/Analyzing Financial Climate Disclosures with NLP/Thesis/figures/")

# Descriptive stats

In [8]:
# Reports present in the master table but without any paragraph row in `df`.
missing_inferred_reports = set(df_master.index).difference(df.report_id)
n_total_reports = len(df_master.index)
n_processed_reports = len(df.report_id.unique())
print(f"Total number of reports: {n_total_reports}, processed reports: {n_processed_reports}. Missing reports:")
missing_inferred_reports

Total number of reports: 792, processed reports: 772. Missing reports:


{'de_bayer-AR_2001',
 'de_deutsche_telekom-AR_1999',
 'de_deutsche_telekom-AR_2000',
 'de_deutsche_telekom-AR_2001',
 'de_sap-AR_1999',
 'de_sap-AR_2000',
 'de_sap-AR_2001',
 'de_sap-AR_2002',
 'dk_novo_nordisk_b-AR_2001',
 'dk_novo_nordisk_b-AR_2002',
 'fr_airbus-AR_2007',
 'fr_airbus-AR_2008',
 'fr_airbus-AR_2009',
 'gb_bp-AR_2017',
 'gb_lloyds_banking_grp-AR_2000',
 'gb_lloyds_banking_grp-AR_2001',
 'gb_prudential-AR_1999',
 'gb_reckitt_benckiser_grp-AR_2005',
 'gb_vodafone_grp-AR_2012',
 'nl_asml_hldg-AR_2001'}

In [9]:
# Distinct (report, page) pairs and total paragraph rows.
page_count = len(df.groupby(["report_id", "page_no"]).count())
paragraph_count = len(df)
print(f'Unique pages: {page_count}, paragraphs: {paragraph_count}')

Unique pages: 36012, paragraphs: 628001


In [20]:
 # First-stage flag: True when element [1] of the first-stage label vector is positive.
 # Applied on the single column instead of row-wise over the whole frame, which
 # avoids building a Series object for every one of the paragraph rows.
 df["is_climaterisk"] = df["1stage_preds_labels"].apply(lambda labels: labels[1] > 0)

In [24]:
 # Number of paragraphs flagged by the first stage (True values sum as 1).
 print(f"First stage positive paragraphs: {df['is_climaterisk'].sum()}")

First stage positives: 3713


In [58]:
# Paragraphs carrying at least one main-category flag; copied so that the columns
# added below do not write into `df`.
positive_docs = df[df[constants.cro_category_labels].any(axis=1)].copy()
# Tokenise the text column directly rather than row-wise over the whole frame:
# same result, without materialising every row as a Series.
positive_docs["processed_docs"] = positive_docs["text"].apply(
    lambda text: dataframe_preparation.spacy_tokenizer(text)
)
print(f"Second stage positive paragraphs: {len(positive_docs)}")

Second stage positive paragraphs: 3067


In [37]:
from lexicalrichness import LexicalRichness

def get_lexical_diversity(text):
    """Compute lexical-richness measures for a single text.

    Builds a ``LexicalRichness`` object from ``text`` (with ``use_TextBlob=False``)
    and returns a ``pd.Series`` holding, in this order, its ``words`` and ``terms``
    attributes, its ``ttr`` attribute and the result of ``mtld()``.
    """
    richness = LexicalRichness(text, use_TextBlob=False)
    measures = [richness.words, richness.terms, richness.ttr, richness.mtld()]
    return pd.Series(measures)
    
# Compute the four measures for every positive paragraph; the Series returned per row
# is expanded into the four new columns, in the order given on the left-hand side.
positive_docs[["nwords", "nterms", "ttr", "mtld"]] = positive_docs.apply(lambda x: get_lexical_diversity(x['text']), axis=1)

In [59]:

# Sum over all sub-category label columns and all positive paragraphs.
print(f"Total labels : {positive_docs[constants.cro_sub_category_labels].sum().sum()}")

Total labels : 3110


In [54]:
# Column-wise sum of each sub-category label column over the positive paragraphs.
print("Labels by sub category: ")
positive_docs[constants.cro_sub_category_labels].sum()

Acute                   490
Chronic                  41
Policy                  254
Market \& Technology    144
Reputation              102
dtype: int64

In [41]:
# Descriptive statistics per sub-category, exported as a LaTeX table.
# Requires `positive_docs` to already carry the nwords / nterms / ttr / mtld columns.
df_descriptive = pd.DataFrame()
# Per-report sum of every sub-category label column.
df_tmp = df.groupby("report_id")[constants.cro_sub_category_labels].sum()
# Zeros become NaN, so count() yields the number of reports with a non-zero sum.
df_descriptive["", "Coverage"] = df_tmp.replace(0, np.nan).count()

# Mean and standard deviation skip the NaN entries, i.e. they only cover reports
# with a non-zero sum for the label; the maximum is taken over all reports.
df_descriptive["Frequency per report", "Mean"] = df_tmp.replace(0, np.nan).mean()
df_descriptive["Frequency per report", "St. Dev."] = df_tmp.replace(0, np.nan).std()
df_descriptive["Frequency per report", "Max"] = df_tmp.max()


# For each label column (x.name), average the lexical measure over the positive
# paragraphs whose value for that label is greater than zero.
df_descriptive["Avg. Lexical diversity per paragraph", "Words"] = df_tmp.apply(lambda x: positive_docs[positive_docs[x.name] > 0].nwords.mean()) 
df_descriptive["Avg. Lexical diversity per paragraph", "Terms"] = df_tmp.apply(lambda x: positive_docs[positive_docs[x.name] > 0].nterms.mean()) 
df_descriptive["Avg. Lexical diversity per paragraph", "TTR"] = df_tmp.apply(lambda x: positive_docs[positive_docs[x.name] > 0].ttr.mean()) 
df_descriptive["Avg. Lexical diversity per paragraph", "MTLD"] = df_tmp.apply(lambda x: positive_docs[positive_docs[x.name] > 0].mtld.mean())

# Temporarily add a column derived from each sub-category's "parent" entry so it can
# be moved into the index as the outer level below.
# NOTE(review): assumes constants.cro_sub_categories is ordered like the rows - confirm.
df_descriptive["Main"] = [constants.map_to_field()[c["parent"]] for c in constants.cro_sub_categories]

# Append "Main" to the index, put it first, then blank out both level names.
df_descriptive.set_index('Main', append=True, inplace=True)
df_descriptive.index.set_names(['second', 'first'], inplace=True)
df_descriptive = df_descriptive.reorder_levels(['first', 'second'])
df_descriptive.index.set_names(['', ''], inplace=True)
df_descriptive = df_descriptive.round(2)
# Export through the project helper; the meaning of its options is defined in data.utils.tables.
latex_str = tables.export_to_latex(df_descriptive, 
                       filename="pos_paragraphs_descriptive_stats_v2.tex",
                       correct_multicolumn=True,
                       make_bold_row_at=[3],
                       # add_verticalrule_at=[3, 6],
                       index=True, 
                       multirow=False, 
                       multicolumn=False, 
                       bold_rows=True, 
                       multicolumn_format="l", 
                       escape=False,
                       float_format="{:0.2f}".format
                      )
df_descriptive

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Frequency per report,Frequency per report,Frequency per report,Avg. Lexical diversity per paragraph,Avg. Lexical diversity per paragraph,Avg. Lexical diversity per paragraph,Avg. Lexical diversity per paragraph
Unnamed: 0_level_1,Unnamed: 1_level_1,Paragraphs,Reports,Mean,St. Dev.,Max,Words,Terms,TTR,MTLD
,,,,,,,,,,
Physical risk,Acute,1457.0,490.0,2.97,2.81,19.0,104.77,69.9,0.73,82.37
Physical risk,Chronic,52.0,41.0,1.27,0.59,3.0,55.71,42.56,0.82,77.03
Transition risk,Policy,977.0,254.0,3.85,6.02,39.0,98.76,65.36,0.72,75.69
Transition risk,Market \& Technology,419.0,144.0,2.91,2.65,14.0,116.94,76.86,0.71,83.74
Transition risk,Reputation,205.0,102.0,2.01,1.55,9.0,76.9,53.77,0.77,82.92


# Prediction evaluation (textual and distribution)

In [None]:
from scipy.special import logit, softmax
# Renormalise the stored first-stage probabilities via softmax(logit(p)) and keep
# entry 1 of the result as the paragraph's "Positive Probability".
positive_docs["Positive Probability"] = positive_docs["1stage_preds_prob"].apply(
    lambda probs: softmax(logit(probs))[1]
)

In [None]:
# Same renormalisation for every paragraph: softmax(logit(p)), keeping entry 1.
df["Positive Probability"] = df["1stage_preds_prob"].apply(
    lambda probs: softmax(logit(probs))[1]
)

In [None]:
# Display the per-paragraph positive probability computed above.
df["Positive Probability"]

In [None]:
import seaborn as sns
# Cumulative kernel density estimate of the positive probability, evaluated on [0, 1] only.
sns.kdeplot(positive_docs["Positive Probability"], clip=[0,1], cumulative=True)

In [None]:
# Rank the positive paragraphs once by first-stage probability (ascending):
# the first row is the least confident prediction, the last row the most confident.
ranked_docs = positive_docs.sort_values(by=['Positive Probability'])
most_uncertain = ranked_docs.iloc[0]
most_certain = ranked_docs.iloc[-1]

# NOTE(review): the second-stage class indices 3 and 0 printed below are hard-coded;
# presumably they match the class predicted for these two paragraphs - confirm.
print(f"Most uncertain: {most_uncertain.report_id}, {most_uncertain.page_no}, Prob: {most_uncertain['Positive Probability']}\n", softmax(logit(most_uncertain["2stage_preds_prob"]))[3] )
print(most_uncertain.text)
print("=====================")
# Fixed label: this line reports the most certain paragraph (it used to say "Most uncertain").
print(f"Most certain: {most_certain.report_id}, {most_certain.page_no}, Prob: {most_certain['Positive Probability']}\n", softmax(logit(most_certain["2stage_preds_prob"]))[0] )
print(most_certain.text)

In [None]:
# Inspect which columns are available on the positive paragraphs.
positive_docs.columns

In [None]:
# One row per (paragraph, sub-category) pair, keeping only pairs whose label is set.
positive_docs_long = positive_docs.melt(
    id_vars=['Positive Probability', '2stage_preds_prob'],
    value_vars=constants.cro_sub_category_labels,
    var_name='Category',
    value_name='Positive',
).query("Positive > 0")

In [None]:
def get_idx(x):
    """Return the position of ``x["Category"]`` in ``constants.cro_sub_category_labels``.

    Returns ``None`` when the category is not among the labels.
    """
    for position, label in enumerate(constants.cro_sub_category_labels):
        if label == x["Category"]:
            return position
    return None

def get_class_prob(x):
    """Return the second-stage probability of the row's own category.

    The stored ``2stage_preds_prob`` vector is renormalised with ``softmax(logit(.))``
    and indexed at the position that ``get_idx`` reports for ``x["Category"]``.
    """
    # Fix for inverse logits
    return softmax(logit(x["2stage_preds_prob"]))[get_idx(x)]
    
    
# Row-wise: probability the second stage assigns to the category named in each row.
positive_docs_long["Class probability"] = positive_docs_long.apply(get_class_prob, axis=1)

In [None]:
# One colour per sub-category, in the order of constants.cro_sub_categories.
colors = [category["color"] for category in constants.cro_sub_categories]

# Left panel: per-category density; right panel: complementary ECDF.
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2)
sns.kdeplot(
    data=positive_docs_long, ax=ax0, x="Class probability", hue="Category",
    palette=colors, clip=[0,1], common_norm=False, multiple="layer",
)
sns.ecdfplot(
    data=positive_docs_long, ax=ax1, x="Class probability", hue="Category",
    palette=colors, complementary=True,
)

# Drop the right panel's legend, leaving the left panel's legend as the only one.
ax1.get_legend().remove()
fig.tight_layout()

distribution_pdf = os.path.join(export_dir, "class_probability_distributions.pdf")
fig.savefig(distribution_pdf, format='pdf', bbox_inches='tight')

# Evolution over the years

Shows the level of the *average number of predicted climate risks (CRs) per report* (ACRR) over time.
    

In [None]:
import seaborn as sns

def convert_to_long(df, labels):
    """Aggregate label counts per report and reshape them to long format.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``Year`` and ``report_id`` columns plus one column per label.
    labels : list of str
        Names of the label columns to aggregate.

    Returns
    -------
    pd.DataFrame
        One row per (report, label) with the columns ``Year``, ``Category`` and
        ``Frequency (per report)``.
    """
    labels = list(labels)
    # Select the label columns BEFORE summing: aggregating every column first is
    # wasteful and fails on columns that cannot be summed (text, arrays, dicts).
    per_report = df.groupby(['Year', 'report_id'])[labels].sum().reset_index()
    return pd.melt(
        per_report,
        id_vars=["Year"],
        value_vars=labels,
        var_name='Category',
        value_name='Frequency (per report)',
    )

def plot_evolution(df, categories, **kwargs):
    """Draw the per-report label frequency over the years, one line per category.

    Parameters
    ----------
    df : pd.DataFrame
        Paragraph-level frame accepted by ``convert_to_long``.
    categories : list of dict
        Category descriptors; each provides a ``"label"`` and a ``"color"`` entry.
    **kwargs
        Forwarded to ``sns.lineplot`` (e.g. ``ax=...``).

    Returns
    -------
    matplotlib.axes.Axes
        The axes the lines were drawn on.
    """
    colors = [c["color"] for c in categories]
    long_df = convert_to_long(df, [c["label"] for c in categories])
    ax = sns.lineplot(data=long_df, x="Year", y="Frequency (per report)", hue="Category", palette=colors, style="Category", **kwargs)
    # Configure the axes that were actually drawn on rather than pyplot's
    # "current" axes, which is only the same object by coincidence.
    ax.legend(loc='upper left')
    if not long_df.empty:
        # Clamp the x range to the observed years; skipped for empty data, where
        # there is no minimum or maximum to compute.
        ax.set_xlim(long_df["Year"].min(), long_df["Year"].max())
    return ax


def plot_evolution_categories(df, categories, **kwargs):
    """Plot the yearly evolution for ``categories`` and return the figure.

    Delegates to ``plot_evolution`` and restricts the major x ticks to integers.
    """
    axes = plot_evolution(df, categories, **kwargs)
    axes.xaxis.set_major_locator(MaxNLocator(integer=True))
    return axes.get_figure()


def plot_grid(groups, column, categories, exclude_groups=(), ncols=4, data=None, **kwargs):
    """Draw one evolution plot per group on a grid of subplots.

    Parameters
    ----------
    groups : sequence
        Values of ``column``; one subplot is drawn per value, in this order.
    column : str
        Name of the column used to filter the data for each subplot.
    categories : list of dict
        Category descriptors passed on to ``plot_evolution``.
    exclude_groups : collection, optional
        Groups that get their own y axis (not shared with the others); their
        y tick labels are shown on the right-hand side.
    ncols : int, optional
        Number of grid columns.
    data : pd.DataFrame, optional
        Frame to plot from. Defaults to the notebook-level ``df``.
    **kwargs
        Forwarded to ``plot_evolution`` and from there to ``sns.lineplot``.

    Returns
    -------
    matplotlib.figure.Figure
    """
    if data is None:
        data = df  # backward compatible: fall back to the notebook-level frame
    nrows = math.ceil(len(groups) / ncols)
    fig = plt.figure(figsize=(12, 15 if nrows > 1 else 5))

    shared_y_ax = None
    shared_x_ax = None
    for idx, group in enumerate(groups):
        is_excluded = group in exclude_groups
        ax = fig.add_subplot(
            nrows, ncols, idx + 1,
            sharey=None if is_excluded else shared_y_ax,
            sharex=shared_x_ax,
        )
        # The x axis is shared with the axes created before the first
        # non-excluded subplot became the y-sharing reference.
        if shared_y_ax is None:
            shared_x_ax = ax
        if not is_excluded:
            shared_y_ax = ax

        filtered_df = data[data[column] == group]
        plot_evolution(filtered_df, categories, ax=ax, **kwargs).title.set_text(group)

        # Grid position computed from the index: Axes.is_last_row() and
        # Axes.is_first_col() no longer exist in current Matplotlib releases.
        on_last_row = idx // ncols == nrows - 1
        on_first_col = idx % ncols == 0

        if not on_last_row:
            plt.setp(ax.get_xticklabels(), visible=False)
            ax.set_xlabel(None)

        if not on_first_col:
            plt.setp(ax.get_yticklabels(), visible=False)
            ax.set_ylabel(None)

        if is_excluded:
            # Independent y scale: show its tick labels, on the right-hand side.
            plt.setp(ax.get_yticklabels(), visible=True)
            ax.yaxis.tick_right()

    return fig
    

# Yearly evolution of the main categories, exported as PDF.
fig = plot_evolution_categories(df, constants.cro_categories)
cro_years_pdf = os.path.join(export_dir, "levels_acror_cro_years.pdf")
fig.savefig(cro_years_pdf, format='pdf', bbox_inches='tight')

In [None]:
# Yearly evolution of the sub-categories, exported as PDF.
fig = plot_evolution_categories(df, constants.cro_sub_categories, ci=None)
sub_type_years_pdf = os.path.join(export_dir, "levels_acror_cro_sub_type_years.pdf")
fig.savefig(sub_type_years_pdf, format='pdf', bbox_inches='tight')

In [None]:
# One panel per country found in the master table, three panels per row.
all_countries = sorted(df_master["country"].unique())
all_countries_fig = plot_grid(
    all_countries, 'country', constants.cro_categories, ncols=3, ci=None
)
all_countries_fig.savefig(
    os.path.join(export_dir, "levels_acror_cro_countries.pdf"),
    format='pdf',
    bbox_inches='tight',
)

In [None]:
# Same grid restricted to four countries, drawn on a single row.
selected_countries_fig = plot_grid(
    ["DE", "CH", "FR", "GB"], 'country', constants.cro_categories, ncols=4, ci=None
)
selected_countries_fig.savefig(
    os.path.join(export_dir, "levels_acror_cro_selected_countries.pdf"),
    format='pdf',
    bbox_inches='tight',
)

In [None]:
# One panel per industry; "Energy" is passed as an excluded group, so it does not
# share its y axis with the other panels.
all_industries = sorted(df_master["icb_industry"].unique())
all_inudustries_fig = plot_grid(
    all_industries, 'icb_industry', constants.cro_categories,
    exclude_groups=["Energy"], ncols=4, ci=None,
)
all_inudustries_fig.savefig(
    os.path.join(export_dir, "levels_acror_cro_industry.pdf"),
    format='pdf',
    bbox_inches='tight',
)

In [None]:
# Four selected industries on one row; "Energy" again gets its own y axis.
selected_industries = ["Consumer Discretionary", "Financials", "Telecommunications", "Energy"]
selected_industries_fig = plot_grid(
    selected_industries, 'icb_industry', constants.cro_categories,
    exclude_groups=["Energy"], ncols=4, ci=None,
)
selected_industries_fig.savefig(
    os.path.join(export_dir, "levels_acror_cro_selected_industries.pdf"),
    format='pdf',
    bbox_inches='tight',
)

# Frequency distributions

In [None]:
# Only reports from this year onwards enter the distribution plots.
from_year = 2015
labels = [c['label'] for c in constants.cro_categories]
colors = [c["color"] for c in constants.cro_categories]

# Per-report label counts by country. The label columns are selected BEFORE summing,
# so text and array columns are never aggregated (slow, and an error for columns
# that cannot be summed).
df_long = df.query("Year >= @from_year").groupby(['country', 'report_id'])[labels].sum()
df_long = df_long.reset_index()
print(f"Number of reports (from {from_year}): {df_long.report_id.nunique()}")
df_long = pd.melt(df_long, id_vars=["country"], value_vars=labels, var_name='Category', value_name='Frequency (per report)')
df_long = df_long.rename(columns={"country": "Country"})

fig, ax = plt.subplots(figsize=(9, 6))
sns.boxplot(y="Country", x="Frequency (per report)", hue="Category", data=df_long, palette=colors, ax=ax)
# Widen the left margin instead of calling tight_layout().
fig.subplots_adjust(left=0.2)
fig.savefig(os.path.join(export_dir, "cro_country_distribution.pdf"), format='pdf')

In [None]:
# Per-report label counts by industry; reuses from_year, labels and colors from the
# country distribution cell. The label columns are selected BEFORE summing, so text
# and array columns are never aggregated.
df_long = df.query("Year >= @from_year").groupby(['icb_industry', 'report_id'])[labels].sum()
df_long = df_long.reset_index()
print(f"Number of reports (from {from_year}): {df_long.report_id.nunique()}")
df_long = pd.melt(df_long, id_vars=["icb_industry"], value_vars=labels, var_name='Category', value_name='Frequency (per report)')
df_long = df_long.rename(columns={"icb_industry": "Industry"})

fig, ax = plt.subplots(figsize=(9, 6))
sns.boxplot(y="Industry", x="Frequency (per report)", hue="Category", data=df_long, palette=colors, ax=ax)
# Widen the left margin instead of calling tight_layout().
fig.subplots_adjust(left=0.2)
fig.savefig(os.path.join(export_dir, "cro_industry_distribution.pdf"), format='pdf')

# Preparation

In [None]:
# Subsets of the positive paragraphs, one per main category and per sub-category.
main_labels = constants.cro_category_labels
sub_labels = constants.cro_sub_category_labels

# Main categories: the flag columns are used directly as row masks.
pr_doc = positive_docs.loc[positive_docs[main_labels[0]]]
tr_doc = positive_docs.loc[positive_docs[main_labels[1]]]

# Sub-categories: rows whose label value is greater than zero.
acute_doc = positive_docs.loc[positive_docs[sub_labels[0]].gt(0)]
chron_doc = positive_docs.loc[positive_docs[sub_labels[1]].gt(0)]
policy_doc = positive_docs.loc[positive_docs[sub_labels[2]].gt(0)]
market_doc = positive_docs.loc[positive_docs[sub_labels[3]].gt(0)]
reputation_doc = positive_docs.loc[positive_docs[sub_labels[4]].gt(0)]

# Most frequent ngrams

In [None]:
from nltk import ngrams, FreqDist

def get_word_counts(docs, most_common=50):
    """Tabulate the most frequent unigrams and bigrams of a token stream.

    Parameters
    ----------
    docs : iterable of str
        Flat sequence of tokens. Entries that are not strings (e.g. the NaN that
        ``Series.explode`` emits for an empty token list) are skipped.
    most_common : int, optional
        Maximum number of unigrams and of bigrams to report.

    Returns
    -------
    pd.DataFrame
        Columns ``Unigram``, ``Frequency``, ``Bigram``, ``Frequency`` (the second
        ``Frequency`` belongs to the bigrams), ordered by descending frequency.
        If one side has fewer entries than the other, its cells are NaN.

    Notes
    -----
    NOTE(review): bigrams are formed over the flat stream, so they can span the
    boundary between two documents when the caller concatenates several documents.
    """
    # Materialise once: a one-shot iterator would be exhausted after the unigram
    # pass, and non-string entries cannot be joined into a display string.
    tokens = [token for token in docs if isinstance(token, str)]

    def top_ngrams(size, label):
        # Most frequent n-grams of the given size, as a two-column frame.
        ranked = FreqDist(ngrams(tokens, size)).most_common(most_common)
        return pd.DataFrame({
            label: [" ".join(gram) for gram, _ in ranked],
            "Frequency": [count for _, count in ranked],
        })

    # Side-by-side concatenation aligns on the rank index and keeps every row of
    # both tables, so no bigram is dropped when there are fewer distinct unigrams.
    return pd.concat([top_ngrams(1, "Unigram"), top_ngrams(2, "Bigram")], axis=1)
    

# explode() flattens the per-paragraph token lists into one token stream, so the
# counts cover all positive paragraphs together.
most_frequent_df = get_word_counts(positive_docs['processed_docs'].explode(), most_common=20)
# NOTE(review): make_bold_row_at is an int here but a list in the descriptive-stats export - confirm both are accepted.
tables.export_to_latex(most_frequent_df, filename="most_frequent_terms.tex", make_bold_row_at=2, index=False)

# Topic modelling

In [None]:
import gensim


In [None]:
# Build the gensim dictionary from the tokenised positive paragraphs.
dictionary = gensim.corpora.Dictionary(positive_docs["processed_docs"])

In [None]:
# Preview: print the first eleven entries yielded by the dictionary, then stop.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

In [None]:
# Prune the vocabulary in place with gensim's document-frequency thresholds
# (no_below / no_above) and cap its size at keep_n entries.
dictionary.filter_extremes(no_below=15, no_above=0.9, keep_n= 100000)

# Word Cloud

In [None]:
from wordcloud import WordCloud

In [None]:
import matplotlib.pyplot as plt

def gen_wordcloud(docs, export_path, **params):
    """Render a word cloud for ``docs``, show it, and optionally save it as PDF.

    Parameters
    ----------
    docs : iterable of str
        Texts that are joined with single spaces into the word cloud input.
    export_path : str or None
        Target path WITHOUT extension; ``".pdf"`` is appended. A falsy value
        skips the export.
    **params
        Extra keyword arguments for ``WordCloud`` (e.g. ``scale``, ``height``,
        ``width``).

    Returns
    -------
    WordCloud
        The generated word cloud (earlier versions returned ``None``).
    """
    input_text = " ".join(docs)
    wordcloud = WordCloud(background_color="white", relative_scaling=0.6, **params).generate(input_text)

    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    if export_path:
        # Create the target directory on first use so the export does not fail
        # on a fresh checkout; a bare file name has no directory part.
        target_dir = os.path.dirname(export_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        wordcloud.to_file(export_path + ".pdf")
    return wordcloud

# All word clouds are written below <export_dir>/wordclouds.
wordclouds_path = os.path.join(export_dir, "wordclouds")

# Main categories: one 800x400 cloud each.
main_cloud_options = dict(scale=1, height=400, width=800)
gen_wordcloud(pr_doc["text"], os.path.join(wordclouds_path, "pr"), **main_cloud_options)
gen_wordcloud(tr_doc["text"], os.path.join(wordclouds_path, "tr"), **main_cloud_options)

# Sub-categories: one 600x400 cloud each, named after the lower-cased category code.
for sub_category in constants.cro_sub_categories:
    labelled_texts = positive_docs.loc[positive_docs[sub_category["label"]] > 0, "text"]
    cloud_path = os.path.join(wordclouds_path, sub_category["code"].lower())
    wordcloud = gen_wordcloud(labelled_texts, cloud_path, scale=1, height=400, width=600)
    
    