This notebook calculates a potential greenwashing score by attempting to verify the claims companies make in their CSR reports against news coverage of those same companies.

# Set-up and loading the data

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util, models
import os
from sklearn.preprocessing import MinMaxScaler
from collections import Counter

In [None]:
# Shared scikit-learn min-max scaler (default feature range is [0, 1]).
# Note: a fresh MinMaxScaler is bound to this same name again in the firm-level
# scoring cell, and the scaler is re-fitted every time fit_transform is called.
scaler = MinMaxScaler()

In [None]:
# defining the data path
# Built with os.path.join so the separator matches the host OS; the previous
# hard-coded backslash only resolved correctly on Windows.
path_data = os.path.join(os.pardir, 'data_structured')

## Data loading

In [None]:
# Load the combined sentence-level dataset from <path_data>/comb.pkl.
# Later cells read the columns 'topics', 'doc_type', 'sentence', 'company' and
# 'embeddings' from it.
# SECURITY: unpickling executes code stored in the file - only load pickles that
# were produced by a trusted step of this project.
df_comb = pd.read_pickle(os.path.join(path_data, 'comb.pkl'))

In [None]:
# Topic ids that are kept for the analysis (matched against df_comb['topics']).
# NOTE(review): presumably the sustainability-related topics of an upstream topic
# model - the meaning of each id is not documented here; TODO confirm and list them.
sust_topics = [0,1,2,3,4,5,6,7,8,10,11,20,25]

In [None]:
# keep only the sentences whose topic id is one of the selected sust_topics
df_analyze = df_comb.loc[df_comb['topics'].isin(sust_topics)]

In [None]:
# Split the filtered sentences by document type: CSR report sentences hold the
# claims, news sentences serve as the evidence corpus.
# Explicit copies with a fresh 0..n-1 index: new columns are assigned to df_report
# later on, and writing into a slice of df_analyze raises SettingWithCopyWarning.
df_report = df_analyze[df_analyze['doc_type']=='report'].copy().reset_index(drop = True)
df_article = df_analyze[df_analyze['doc_type']=='news'].copy().reset_index(drop = True)

# Claim Verification

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, in line
# with the device selection used for the NLI model further below.
# (transformers pipelines take a device index: 0 = first GPU, -1 = CPU)
claim_checker = pipeline(model = "climatebert/environmental-claims",  device = 0 if torch.cuda.is_available() else -1, batch_size = 64) # claim identification
sem_search = SentenceTransformer('all-MiniLM-L6-v2', device='cuda' if torch.cuda.is_available() else 'cpu') # evidence sentence selection
nli_model = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli" # inference analysis (checkpoint name, loaded later)

The claim verification model consists of three stages: claim identification, evidence sentence selection and, finally, inference analysis. The three models above help us achieve these three tasks. The ClimateBERT model is pre-trained to detect environmental and climate claims, semantic search identifies the 5 most relevant news sentences from the corpus for each claim, and the natural language inference (NLI) model is then used to check whether each claim/evidence pair is an entailment, neutral or a contradiction.

First, we apply the ClimateBERT model to identify environmental claims:

In [None]:
# first part of the pipeline - identifying claims
sentences = df_report['sentence'].tolist()  # the pipeline is fed a plain list of strings

# truncation=True is forwarded to the tokenizer: without it a single sentence that
# exceeds the model's maximum input length aborts the whole run with an error.
results = claim_checker(sentences, truncation = True)
# each result is a dict with the predicted 'label' and its 'score'
# NOTE(review): 'score' is presumably the probability of the *predicted* label
# (so for 'no' rows it is not the probability of being a claim) - confirm.
df_report['claim'] = [result['label'] for result in results]
df_report['claim_probability'] = [result['score'] for result in results]

In [None]:
# Keep only the report sentences classified as environmental claims.
# Explicit copy with a fresh 0..n-1 index: further columns are added to df_claims
# below, and assigning into a slice of df_report raises SettingWithCopyWarning.
df_claims = df_report[df_report['claim']=='yes'].copy().reset_index(drop = True)
df_claims.shape

Since this took a while I will also pickle these to save my progress.

In [None]:
# df_article.to_pickle('art.pkl')
# df_claims.to_pickle('claims.pkl')

Sentence transformers has a utility called semantic search, which can be used to find the top 5 most similar sentences:

In [None]:
# %%time
# for i,row in df_claims.iterrows():
#     query_embedding = row['embeddings']
#     company = row['company']
#     # search only the article embeddings/sentences of the specific company
#     corpus_embeddings = df_article[df_article['company']==company]['embeddings'].values
#     top_5 = util.semantic_search(torch.Tensor(query_embedding), torch.Tensor(np.array(list(corpus_embeddings))), top_k = 5)
#     break

Let us now create a new dataframe based on df_claims, which will store the same information as this dataframe, but will also additionally hold the top 5 most similar sentences in a separate column, as well as whether these sentences entail, contradict or are neutral towards each other. I use the MoritzLaurer NLI model for this purpose as it states that it is the best performing NLI model on the HuggingFace hub as of June 2022. The code used for the classification is mostly copied from the HuggingFace transformers website and modified for our purposes.

We repeat the same code as above but expand upon it further:

## Firm-Level Analysis

In [None]:
# run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# tokenizer and sequence-classification head of the NLI checkpoint named in nli_model
tokenizer = AutoTokenizer.from_pretrained(nli_model)
model = AutoModelForSequenceClassification.from_pretrained(nli_model)
model = model.to(device)

In [None]:
# lists that collect the values of the new df_claims columns (one entry per claim)
top_sentences_column = []
predictions = []
probabilities = []

# class names in the order the logits are indexed below
# NOTE(review): presumably matches model.config.id2label of the NLI checkpoint - confirm.
label_names = ["entailment", "neutral", "contradiction"]

# for every claim in df_claims, retrieve the most similar news sentences of the
# same company and classify each (claim, news sentence) pair with the NLI model
for i,row in df_claims.iterrows():
    # define our query (i.e. claim) and the company it's related to
    query_embedding = row['embeddings']
    company = row['company']
    # search only the article embeddings/sentences of the specific company;
    # filter once per claim and reuse the subset for the sentence lookup below
    company_articles = df_article[df_article['company']==company]
    corpus_embeddings = company_articles['embeddings'].values
    corpus_sentences = company_articles['sentence'].values
    # NOTE(review): a company without any news sentence yields an empty corpus,
    # which semantic_search presumably cannot handle - confirm this cannot occur.
    top_5 = util.semantic_search(torch.Tensor(query_embedding), torch.Tensor(np.array(list(corpus_embeddings))), top_k = 5)
    # per-claim lists holding the evidence sentences and their predictions
    hard_predictions = []
    top_sentences = []
    soft_predictions =[]
    # the premise is the claim
    # NOTE(review): fact-verification setups usually pass the evidence as premise
    # and the claim as hypothesis; the order here is the opposite - confirm intended.
    premise = row['sentence']
    for sentence in  top_5[0]:
        # the hypothesis is the news sentence; corpus_id is its position within
        # the corpus that was searched (this company's sentences)
        hypothesis = corpus_sentences[sentence['corpus_id']]
        tokens = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
        # inference only: no_grad avoids building the autograd graph for every pair
        with torch.no_grad():
            output = model(tokens["input_ids"].to(device))  # device = "cuda:0" or "cpu"
        logits = output["logits"][0]
        soft_prediction = torch.softmax(logits, -1)
        hard_prediction = label_names[torch.argmax(logits, -1).item()]
        # append the different values to the correct list
        top_sentences.append(hypothesis)
        # store only the probability of the predicted (most likely) class
        soft_predictions.append(max(soft_prediction.tolist()))
        hard_predictions.append(hard_prediction)

    top_sentences_column.append(top_sentences)
    predictions.append(hard_predictions)
    probabilities.append(soft_predictions)

df_claims['top_sentences'] = top_sentences_column
df_claims['predictions'] = predictions
df_claims['probabilities'] = probabilities

In [None]:
def most_frequent_category(categories_list):
    """Return the label that occurs most often in ``categories_list``.

    Ties are resolved in favour of the label that appears first in the list
    (the ordering guarantee of ``Counter.most_common``). An empty list raises
    ``IndexError``.
    """
    ranked = Counter(categories_list).most_common(1)
    top_category, _occurrences = ranked[0]
    return top_category

In [None]:
# def count_non_neutral(categories_list):
#     counter = Counter(categories_list)
#     for element,count in counter.items():
#         if (count >= 2) & (element!='neutral'):
#             return element
#     return 'neutral'
        

In [None]:
# majority label among the NLI predictions of each claim's evidence sentences
df_claims['consensus'] = [most_frequent_category(labels) for labels in df_claims['predictions']]
# distribution of the consensus labels over all claims
df_claims['consensus'].value_counts()

In [None]:
# df_claims['consensus'] = df_claims['predictions'].apply(count_non_neutral)
# df_claims['consensus'].value_counts()

In [None]:
# firm -> share of the firm's claims whose consensus label is NOT 'entailment'
# (i.e. claims the news coverage does not support); higher = less verified
gw_database_total = {}
# sorted unique values instead of a set: the iteration order of a set of strings
# changes between runs, which made the row order of the output non-reproducible
# (assumes company names are mutually comparable, e.g. all strings)
for firm in sorted(df_claims['company'].dropna().unique()):
    firm_claims = df_claims[df_claims['company']==firm]
    not_entailed = firm_claims[firm_claims['consensus']!='entailment']
    ver_score = len(not_entailed)/len(firm_claims)
    gw_database_total[firm] = ver_score

In [None]:
# one row per firm: company name and its share of non-entailed claims
df_gw_total = pd.DataFrame.from_records(
    list(gw_database_total.items()),
    columns=['company', 'verification_score'],
)
# min-max scale the scores across firms
scaler = MinMaxScaler()
scaler.fit(df_gw_total[['verification_score']])
df_gw_total['verification_score'] = scaler.transform(df_gw_total[['verification_score']])

## Cluster-Level Analysis

In [None]:
# lists that collect the values of the new df_claims columns (one entry per claim)
top_sentences_column = []
predictions = []
probabilities = []

# class names in the order the logits are indexed below
# NOTE(review): presumably matches model.config.id2label of the NLI checkpoint - confirm.
label_names = ["entailment", "neutral", "contradiction"]

# for every claim in df_claims, retrieve the most similar news sentences of the
# same company AND topic and classify each (claim, news sentence) pair
for i,row in df_claims.iterrows():
    # define our query (i.e. claim) and the company/topic it's related to
    query_embedding = row['embeddings']
    company = row['company']
    topic = row['topics']
    # search only the article embeddings/sentences of the specific company and topic
    topic_articles = df_article[(df_article['company']==company)&(df_article['topics']==topic)]
    corpus_embeddings = topic_articles['embeddings'].values
    # Sentences must come from the SAME subset as the embeddings: corpus_id is a
    # position within the searched corpus. Looking it up in the company-only
    # subset (as before) returned a different sentence than the one retrieved.
    corpus_sentences = topic_articles['sentence'].values
    # NOTE(review): a (company, topic) pair without any news sentence yields an
    # empty corpus, which semantic_search presumably cannot handle - confirm.
    top_5 = util.semantic_search(torch.Tensor(query_embedding), torch.Tensor(np.array(list(corpus_embeddings))), top_k = 5)
    # per-claim lists holding the evidence sentences and their predictions
    hard_predictions = []
    top_sentences = []
    soft_predictions =[]
    # the premise is the claim
    # NOTE(review): fact-verification setups usually pass the evidence as premise
    # and the claim as hypothesis; the order here is the opposite - confirm intended.
    premise = row['sentence']
    for sentence in  top_5[0]:
        # the hypothesis is the news sentence at position corpus_id of the searched corpus
        hypothesis = corpus_sentences[sentence['corpus_id']]
        tokens = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
        # inference only: no_grad avoids building the autograd graph for every pair
        with torch.no_grad():
            output = model(tokens["input_ids"].to(device))  # device = "cuda:0" or "cpu"
        logits = output["logits"][0]
        soft_prediction = torch.softmax(logits, -1)
        hard_prediction = label_names[torch.argmax(logits, -1).item()]
        # append the different values to the correct list
        top_sentences.append(hypothesis)
        # store only the probability of the predicted (most likely) class
        soft_predictions.append(max(soft_prediction.tolist()))
        hard_predictions.append(hard_prediction)

    top_sentences_column.append(top_sentences)
    predictions.append(hard_predictions)
    probabilities.append(soft_predictions)

df_claims['top_sentences_cluster'] = top_sentences_column
df_claims['predictions_cluster'] = predictions
df_claims['probabilities_cluster'] = probabilities

In [None]:
# Consensus for the cluster-level run: majority label over the topic-restricted
# predictions. This must read 'predictions_cluster'; reading 'predictions' merely
# reproduced the firm-level consensus and ignored the cluster-level results.
# Note: this overwrites the firm-level 'consensus' column, whose scores have
# already been aggregated into gw_database_total at this point.
df_claims['consensus'] = df_claims['predictions_cluster'].apply(most_frequent_category)
df_claims['consensus'].value_counts()

In [None]:
# topic -> {firm -> share of the firm's claims in that topic whose consensus is
# NOT 'entailment'}; NaN when the firm has no claim in the topic
gw_database = {}
# sorted unique values instead of sets, so the row/column order of the resulting
# table is the same on every run (assumes company names are mutually comparable)
firms = sorted(df_claims['company'].dropna().unique())
for cluster in sorted(df_claims['topics'].dropna().unique().tolist()):
    cluster_claims = df_claims[df_claims['topics']==cluster]
    firm_cluster_score = {}
    for firm in firms:
        firm_claims = cluster_claims[cluster_claims['company']==firm]
        if len(firm_claims) == 0:
            # explicit check replaces the former bare `except:`, which turned any
            # error (not only the division by zero) silently into NaN
            firm_cluster_score[firm] = np.nan
        else:
            firm_cluster_score[firm] = len(firm_claims[firm_claims['consensus']!='entailment'])/len(firm_claims)
    gw_database[cluster] = firm_cluster_score

In [None]:
# rows = firms, columns = topics (outer keys of gw_database become the columns)
df_gw = pd.DataFrame.from_dict(gw_database)
# Average the per-topic scores of each firm, ignoring topics without claims (NaN).
# reindex() instead of df_gw[sust_topics]: a topic in sust_topics that has no
# claim at all is missing from the columns and plain selection raises KeyError;
# reindex adds it as an all-NaN column, which skipna then ignores.
df_gw['verification_cluster'] = df_gw.reindex(columns = sust_topics).mean(axis = 1, skipna = True)
# min-max scale the averaged score across firms (re-fits the existing scaler)
df_gw['verification_cluster'] = scaler.fit_transform(df_gw[['verification_cluster']])

In [None]:
# move the firm names from the index into a 'company' column and keep only the
# company name and the aggregated cluster-level score
df_gw = (
    df_gw
    .reset_index()
    .rename(columns = {'index': 'company'})
    [['company', 'verification_cluster']]
)

In [None]:
# inner join of the firm-level and cluster-level scores on 'company', the only
# column the two frames have in common
df_verification = df_gw_total.merge(df_gw, how = 'inner', on = 'company')
# write the combined scores next to the notebook
df_verification.to_csv('verification_scores.csv', index = False)