This notebook calculates a potential greenwashing score for each company by measuring the text similarity between the company's CSR reports and the news articles written about it.

# Set-up and loading the data

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util, models
import os
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm

In [None]:
# Folder holding the structured data, relative to the notebook's working directory.
# Built with os.path.join so the separator matches the host OS; the previous
# hard-coded '..\\data_structured' only resolved correctly on Windows.
path_data = os.path.join('..', 'data_structured')

In [None]:
# Load the combined frame from 'comb.pkl' inside path_data.
# NOTE: unpickling executes arbitrary code, so only load pickles from a trusted source.
df_comb = pd.read_pickle(filepath_or_buffer=os.path.join(path_data, 'comb.pkl'))

In [None]:
# Topic ids kept for the analysis (presumably the sustainability-related topics
# of the topic model -- confirm against the notebook that produced 'topics').
sust_topics = [
    0, 1, 2, 3, 4, 5, 6, 7, 8,
    10, 11,
    20, 25,
]

In [None]:
# Keep only the rows whose topic is listed in sust_topics and renumber them from 0.
is_sust_topic = df_comb['topics'].isin(sust_topics)
df_analyze = df_comb.loc[is_sust_topic].reset_index(drop=True)
# Show (rows, columns) of the filtered frame as the cell output.
df_analyze.shape

In [None]:
# Split the filtered rows by document type, each with a fresh 0..n-1 index:
# 'report' rows go to df_report, 'news' rows go to df_article.
doc_type = df_analyze['doc_type']
df_report = df_analyze.loc[doc_type == 'report'].reset_index(drop=True)
df_article = df_analyze.loc[doc_type == 'news'].reset_index(drop=True)

# Text Similarity (S-BERT) Score Calculation

In [None]:
# Work on an independent copy so the score columns added below do not touch df_report.
df_similarity = df_report.copy(deep=True)

In [None]:
%%time
# Firm-level similarity.
# For every report sentence (the "claim"), search the news-sentence embeddings of
# the same company and store the mean score of the (up to) 10 best matches in
# df_similarity['sentence_similarity'].

# Stack each company's article embeddings into one tensor up front, so the
# corpus is not re-filtered and re-converted for every single report sentence.
corpus_by_company = {
    company: torch.Tensor(np.array(list(group['embeddings'].values)))
    for company, group in df_article.groupby('company')
}

similarity_scores = []
for _, row in df_similarity.iterrows():
    # search only the article embeddings/sentences of the specific company
    corpus_embeddings = corpus_by_company.get(row['company'])
    if corpus_embeddings is None:
        # No news sentences for this company. NaN (rather than the former
        # 'no sentences' string) keeps the column numeric; comparisons such as
        # `!= 'no sentences'` still pass and mean(skipna=True) ignores the NaN.
        similarity_scores.append(np.nan)
        continue
    # semantic_search returns one hit list per query; we pass a single query.
    hits = util.semantic_search(torch.Tensor(row['embeddings']), corpus_embeddings, top_k = 10)[0]
    similarity_scores.append(np.mean([hit['score'] for hit in hits]))

df_similarity['sentence_similarity'] = similarity_scores

In [None]:
# Firm-level scores: map every company in df_analyze to the mean
# 'sentence_similarity' of its report sentences, skipping rows flagged
# 'no sentences'. Companies without any scored report sentence map to NaN.
has_score = df_similarity['sentence_similarity'] != 'no sentences'
gw_sim_total = {
    firm: df_similarity.loc[
        has_score & (df_similarity['company'] == firm), 'sentence_similarity'
    ].mean(skipna=True)
    for firm in set(df_analyze['company'])
}
gw_sim_total

In [None]:
# Turn the firm-level dictionary into a two-column frame, rescale the similarity
# with MinMaxScaler, and add the complementary dissimilarity score (1 - similarity).
df_gw_sim_total = pd.DataFrame(list(gw_sim_total.items()), columns=['company', 'sim_overall'])
overall_as_column = df_gw_sim_total['sim_overall'].to_numpy().reshape(-1, 1)
df_gw_sim_total['sim_overall'] = MinMaxScaler().fit_transform(overall_as_column)
df_gw_sim_total['dissimilarity_overall'] = 1 - df_gw_sim_total['sim_overall']

In [None]:
%%time
# Topic-level sentence similarity.
# For every report sentence (the "claim"), search only the news-sentence
# embeddings of the same company AND the same topic, and store the mean score of
# the (up to) 10 best matches in df_similarity['sentence_similarity_topic'].

# Stack the article embeddings of each (company, topic) pair once, instead of
# re-filtering df_article for every report sentence.
corpus_by_company_topic = {
    key: torch.Tensor(np.array(list(group['embeddings'].values)))
    for key, group in df_article.groupby(['company', 'topics'])
}

similarity_scores = []
for _, row in df_similarity.iterrows():
    # search only the article embeddings/sentences of the specific company and topic
    corpus_embeddings = corpus_by_company_topic.get((row['company'], row['topics']))
    if corpus_embeddings is None:
        # No news sentences for this company/topic pair. NaN (rather than the
        # former 'no sentences' string) keeps the column numeric and is skipped
        # by mean(skipna=True).
        similarity_scores.append(np.nan)
        continue
    # semantic_search returns one hit list per query; we pass a single query.
    hits = util.semantic_search(torch.Tensor(row['embeddings']), corpus_embeddings, top_k = 10)[0]
    similarity_scores.append(np.mean([hit['score'] for hit in hits]))

df_similarity['sentence_similarity_topic'] = similarity_scores

In [None]:
# Topic-level scores: gw_database_sim[topic][company] is the mean topic-level
# similarity of that company's report sentences within that topic (NaN when the
# company has no scored report sentence for the topic).
#
# NOTE(review): this now aggregates 'sentence_similarity_topic'. The previous
# version read 'sentence_similarity' (the firm-level column), which left the
# topic-level scores computed above unused -- confirm this is the intended metric.
#
# to_numeric(errors='coerce') turns any non-numeric placeholder (e.g. the
# 'no sentences' marker) into NaN, so the means below simply skip those rows.
topic_scores = pd.to_numeric(df_similarity['sentence_similarity_topic'], errors='coerce')
firms = set(df_analyze['company'])

gw_database_sim = {}
for cluster in set(df_analyze['topics']):
    in_cluster = df_similarity['topics'] == cluster
    gw_database_sim[cluster] = {
        firm: topic_scores[in_cluster & (df_similarity['company'] == firm)].mean(skipna=True)
        for firm in firms
    }

In [None]:
# Build a company x topic frame from the nested dictionary (outer keys become
# columns, inner keys the index), then move the index into a 'company' column.
df_gw_sim = (
    pd.DataFrame.from_dict(gw_database_sim)
    .reset_index()
    .rename(columns={'index': 'company'})
)

In [None]:
# Average the topic-level scores across topics for each company.
# Only topics that actually have a column in df_gw_sim are used: a topic listed
# in sust_topics but absent from the data would otherwise raise a KeyError.
topic_columns = [topic for topic in sust_topics if topic in df_gw_sim.columns]
df_gw_sim['sim_average_cluster'] = df_gw_sim[topic_columns].mean(axis = 1, skipna = True)
# Rescale the averages with MinMaxScaler, then derive the dissimilarity as 1 - similarity.
df_gw_sim['sim_average_cluster'] = MinMaxScaler().fit_transform(np.array(df_gw_sim['sim_average_cluster']).reshape(-1,1))
df_gw_sim['dissimilarity_average_cluster'] = 1 - df_gw_sim['sim_average_cluster']

In [None]:
# Combine the firm-level scores with the topic-averaged similarity, one row per
# company present in both frames (inner join on 'company').
merged_df = df_gw_sim_total.merge(
    df_gw_sim[['company', 'sim_average_cluster']],
    on='company',
)
# merged_df.to_csv('similarity_scores.csv', index = False)