This notebook calculates a potential greenwashing score for each company by comparing the text sentiment of its CSR reports with the sentiment of the news articles about it.

# Set-up and loading the data

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util, models
import os

In [None]:
# folder holding the structured data, one level above the notebook;
# os.path.join picks the separator of the host OS instead of a hard-coded backslash
path_data = os.path.join('..', 'data_structured')

In [None]:
# load the pickled object stored as 'comb.pkl' in the data folder (used below as a DataFrame)
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
comb_pickle_path = os.path.join(path_data, 'comb.pkl')
df_comb = pd.read_pickle(comb_pickle_path)

# Sentiment Analysis (ClimateBERT)

In [None]:
# merging the sentiment label and its probability into a single signed score
def calculate_score(row):
    """Turn one row's sentiment label and probability into a signed score.

    ``row`` must provide the keys 'sentiment' (the label) and
    'sentiment_probability'. The labels 'negative' and 'risk' return the
    probability multiplied by -1, 'neutral' returns it multiplied by 0, and
    every other label returns the probability unchanged.
    """
    label = row['sentiment']
    probability = row['sentiment_probability']
    if label in ('negative', 'risk'):
        return probability * -1
    if label == 'neutral':
        return probability * 0
    return probability

In [None]:
# classify every sentence and attach the label, its probability and the signed score
def get_sentiment_scores(df, model, *, device=0, batch_size=64):
    """Run a classification pipeline over ``df['sentence']`` and store the results.

    Parameters
    ----------
    df : DataFrame with a 'sentence' column.
    model : model identifier handed to ``transformers.pipeline``.
    device : device handed to the pipeline. The default 0 is the value that
        used to be hard-coded (the first CUDA device in transformers); pass
        -1 or 'cpu' to run on a machine without a GPU.
    batch_size : batch size handed to the pipeline (default 64, as before).

    Returns
    -------
    The same ``df`` object. It is modified in place and gains the columns
    'sentiment' (predicted label), 'sentiment_probability' (score of that
    label) and 'sentiment_score' (the result of ``calculate_score`` per row).

    ``pipeline`` and ``calculate_score`` are expected to be in scope.
    """
    classifier = pipeline(model=model, device=device, batch_size=batch_size)

    # the pipeline is called with a plain list of sentences
    predictions = classifier(df['sentence'].tolist())

    df['sentiment'] = [prediction['label'] for prediction in predictions]
    df['sentiment_probability'] = [prediction['score'] for prediction in predictions]
    # row-wise so calculate_score can see both the label and its probability
    df['sentiment_score'] = df.apply(calculate_score, axis=1)
    return df

In [None]:
# work on a separate deep copy; get_sentiment_scores adds its columns in place
df_climate = df_comb.copy(deep=True)

In [None]:
# score every sentence with the ClimateBERT sentiment model
df_climate = get_sentiment_scores(
    df=df_climate,
    model='climatebert/distilroberta-base-climate-sentiment',
)

In [None]:
# how often each predicted label occurs
df_climate.loc[:, 'sentiment'].value_counts()

In [None]:
# this is to get the finbert scores for the comparison in the discussion section
# %%time
# df_comb = get_sentiment_scores(df_comb, 'ProsusAI/finbert')

In [None]:
# topic ids kept for the analysis: 0 to 8, plus 10, 11, 20 and 25
sust_topics = [*range(9), 10, 11, 20, 25]
# this was a test to see whether excluding certain topics would change results significantly (specifically greenwashing accusations)
#lim_topics = [0,1,2,3,4,5,6,7,8,10,11,20]
# df_analyze = df_comb[df_comb['topics'].isin(sust_topics)]

In [None]:
# keep only the sentences whose topic is in sust_topics and renumber the rows from 0
df_analyze = (
    df_climate
    .loc[df_climate['topics'].isin(sust_topics)]
    .reset_index(drop=True)
)
df_analyze.shape

## Firm-Level Analysis

In [None]:
# calculating the average sentiment discrepancy without clusters
gw_database_total = {}
# dict.fromkeys de-duplicates like set() but keeps first-appearance order, so the
# row order of the resulting table no longer changes with per-session string hashing
for firm in dict.fromkeys(df_analyze['company']):
    # select this firm's sentences once and reuse them for both document types
    firm_rows = df_analyze[df_analyze['company'] == firm]
    # calculate the average sentiment score for this firm for symbolic actions (reports)
    rep_score = firm_rows[firm_rows['doc_type'] == 'report']['sentiment_score'].mean(skipna=True)
    # calculate the average sentiment score for this firm for substantive actions (news)
    news_score = firm_rows[firm_rows['doc_type'] == 'news']['sentiment_score'].mean(skipna=True)
    # report minus news; NaN when the firm has no report or no news sentences
    gw_database_total[firm] = rep_score - news_score

In [None]:
# one row per firm with its overall report-minus-news sentiment gap
df_gw_total = pd.DataFrame({
    'company': list(gw_database_total.keys()),
    'clim_sentiment_overall': list(gw_database_total.values()),
})
df_gw_total

## Cluster-Level Analysis

In [None]:
def gw_score_sent_average(firm, cluster, df, mode=None):
    """Return the report-minus-news sentiment gap of one firm in one topic cluster.

    Parameters
    ----------
    firm : value matched against ``df['company']``.
    cluster : value matched against ``df['topics']``.
    df : DataFrame with the columns 'company', 'topics', 'doc_type' and
        'sentiment_score'.
    mode : pass 'weighted' to multiply the gap by the cluster importance
        described by Boelders (the firm's sentences in this cluster divided
        by all of the firm's sentences). Any other value returns the plain gap.

    Returns
    -------
    Mean 'sentiment_score' of the firm's 'report' rows in the cluster minus
    the mean of its 'news' rows, optionally weighted. The result is NaN when
    either side has no rows, including a firm that does not occur in ``df``
    (in weighted mode this used to raise ZeroDivisionError).
    """
    # filter once per level instead of rebuilding the full-frame masks for every lookup
    firm_rows = df[df['company'] == firm]
    cluster_rows = firm_rows[firm_rows['topics'] == cluster]

    # average sentiment of this firm in this cluster across its report
    rep_score = cluster_rows[cluster_rows['doc_type'] == 'report']['sentiment_score'].mean(skipna=True)
    # average sentiment of this firm in this cluster across its news coverage
    news_score = cluster_rows[cluster_rows['doc_type'] == 'news']['sentiment_score'].mean(skipna=True)
    gap = rep_score - news_score

    if mode != 'weighted':
        return gap

    total_sentences = len(firm_rows)
    if total_sentences == 0:
        # no sentences for this firm: the importance is undefined, mirror the unweighted NaN
        return float('nan')
    cl_importance = len(cluster_rows) / total_sentences
    return gap * cl_importance

In [None]:
# calculating with clusters
gw_database = {}
# de-duplicate the firms once; dict.fromkeys keeps first-appearance order, so the
# row order of the resulting table no longer changes with per-session string hashing
firms = list(dict.fromkeys(df_analyze['company']))
for cluster in set(df_analyze['topics']):
    # one {firm: score} mapping per topic cluster
    firm_cluster_score = {
        firm: gw_score_sent_average(firm, cluster, df_analyze)
        for firm in firms
    }
    gw_database[cluster] = firm_cluster_score

In [None]:
# topic-level scores: one column per topic cluster, one row per firm
df_gw = pd.DataFrame(gw_database)
df_gw

In [None]:
# turn the index into a regular column and call it 'company'
df_gw = df_gw.reset_index().rename(columns={'index': 'company'})

In [None]:
# row-wise mean over the sust_topics columns (mean skips NaN entries by default)
df_gw['clim_sentiment_cluster_average'] = df_gw.loc[:, sust_topics].mean(axis=1)
df_gw

Finally, I merge the firm-level and cluster-level scores into one table.

In [None]:
# combine the firm-level score and the cluster average in one table (inner join on 'company')
merged_df = df_gw_total.merge(
    df_gw[['company', 'clim_sentiment_cluster_average']],
    how='inner',
    on='company',
)
# merged_df.to_csv('sentiment_scores.csv', index = False)