# Set-up and loading the data

In [None]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import fitz
from scipy.stats import pearsonr, ttest_1samp

In [None]:
# Paths for the data sources (refinitiv, nexisuni articles and reports) and
# for the structured data. Built with os.path.join instead of hard-coded
# "\\" separators, so the notebook also runs on systems whose path separator
# is not a backslash; on Windows the resulting strings are unchanged.
path_ref = os.path.join('..', 'data', 'refinitiv')
path_nexis = os.path.join('..', 'data', 'articles')
path_reports = os.path.join('..', 'data', 'reports')
path_data = os.path.join('..', 'data_structured')

## Data Loading

In [None]:
# Refinitiv workbook: sheet_name=1 selects the second sheet (zero-based).
refinitiv_file = os.path.join(path_ref, 'refinitiv.xlsx')
df_ref = pd.read_excel(refinitiv_file, sheet_name=1)

# Score tables, read from CSV files in the current working directory.
# Author's note on the sentiment scores: "drop h&m and dhl" (no such filter
# is applied in this cell).
df_sentiment = pd.read_csv('sentiment_scores.csv')
df_similarity = pd.read_csv('similarity_scores.csv')
df_verification = pd.read_csv('verification_scores.csv')

In [None]:
# Load the combined dataset, which holds all the sentences; it is needed to
# get the news article sentence count for further analysis.
# NOTE(review): unpickling can execute arbitrary code, so comb.pkl must come
# from a trusted source.
comb_file = os.path.join(path_data, 'comb.pkl')
df_comb = pd.read_pickle(comb_file)

# Calculating the discrepancy index (Kruisheer)

In [None]:
# Green practice: row-wise mean of the three practice columns, with zeros
# treated as missing values, followed by standardization.
practice_columns = ['resource_use', 'emissions', 'environmental_innovation']
df_ref['average_practice'] = df_ref[practice_columns].replace(0, np.nan).mean(axis=1)

practice_values = np.array(df_ref['average_practice']).reshape(-1, 1)
df_ref['green_practice'] = StandardScaler().fit_transform(practice_values)

# Green communication: the standardized CSR_strategy column.
communication_values = np.array(df_ref['CSR_strategy']).reshape(-1, 1)
df_ref['green_communication'] = StandardScaler().fit_transform(communication_values)

# Discrepancy index: standardized communication minus standardized practice.
df_ref['discrepancy_index'] = df_ref['green_communication'] - df_ref['green_practice']

In [None]:
# Rescale the discrepancy index to values between 0 and 1
index_values = np.array(df_ref['discrepancy_index']).reshape(-1, 1)
df_ref['discrepancy_index'] = MinMaxScaler().fit_transform(index_values)

# Correlation Analysis

## Creating one unified dataframe to hold all scores

In [None]:
# Start the unified score table from the company name and discrepancy index.
df_scores = df_ref.loc[:, ['company', 'discrepancy_index']]

In [None]:
# Attach the developed scores. No `on=` is given, so pandas joins on every
# column the two frames have in common. The similarity scores are attached
# with a left join; sentiment and verification use the default inner join.
df_scores = (
    df_scores
    .merge(df_similarity, how='left')
    .merge(df_sentiment)
    .merge(df_verification)
)

## Generating a table with the correlation scores per developed measure

In [None]:
# Table holding one row per developed measure (filled in a later cell).
result_columns = ['discrepancy_measure', 'correlation', 'p-value']
df_correlation_scores = pd.DataFrame(columns=result_columns)

In [None]:
# Optional: min-max scale each developed measure, i.e. every column of
# df_scores from the third one onwards.
for column in df_scores.columns[2:]:
    column_values = np.array(df_scores[column]).reshape(-1, 1)
    df_scores[column] = MinMaxScaler().fit_transform(column_values)

In [None]:
# Generate the table: Pearson correlation and p-value between the discrepancy
# index and every developed measure (every column from the third one onwards).
# The rows are collected in a list and turned into a DataFrame once, instead
# of concatenating onto an empty frame inside the loop. This avoids the pandas
# FutureWarning about concatenating empty frames, avoids copying the table on
# every iteration, and keeps the cell idempotent: re-running it rebuilds the
# table rather than appending duplicate rows.
x = df_scores['discrepancy_index']
correlation_rows = []
for column in df_scores.columns[2:]:
    y = df_scores[column]
    correlation, p_value = pearsonr(x, y)
    correlation_rows.append(
        {
            'discrepancy_measure': column,
            'correlation': correlation,
            'p-value': p_value,
        }
    )

df_correlation_scores = pd.DataFrame(
    correlation_rows,
    columns=['discrepancy_measure', 'correlation', 'p-value'],
)
    

In [None]:
df_correlation_scores
# Author's note: these are the correlation scores when leaving out Beiersdorf,
# Hershey and Henkel, which are companies for which articles from 2020 were
# also included.
# NOTE(review): no step visible in this notebook removes these companies;
# presumably they were excluded upstream - confirm.

# Further exploration of the correlation analysis

## Creating additional variables for further analysis

In [None]:
# List of low article companies (all the ones for which a nexisuni search was
# necessary), for additional analysis later. The names are taken from the
# entries of the articles folder: everything before the first '.' is kept.
sample = os.listdir(path_nexis)
low_article_companies = [file_name.split('.')[0] for file_name in sample]

In [None]:
# Flag column: 'Yes' if the company is one of the low article companies,
# 'No' otherwise.
is_low_coverage = df_scores['company'].isin(low_article_companies)
df_scores['low_coverage'] = is_low_coverage.map({True: 'Yes', False: 'No'})

In [None]:
# Number of (non-null) news article sentences per company, for a
# visualization later. Result columns: 'company', 'sentence_count'.
news_sentences = df_comb.loc[df_comb['doc_type'] == 'news']
df_counts = (
    news_sentences.groupby(['company'])['sentence']
    .count()
    .rename('sentence_count')
    .reset_index()
)

In [None]:
# df_rep_sentences =  df_comb[df_comb['doc_type'] == 'report'].groupby(['company'])['sentence'].count()
# df_rep_sentences = pd.DataFrame(df_rep_sentences)

In [None]:
# Add the sentence counts (default inner join on the shared columns).
df_scores = df_scores.merge(df_counts)

In [None]:
# Create a dataframe with the page count of every PDF in the reports folder.
# The rows are collected in a list and converted once, instead of calling
# pd.concat on an initially empty frame inside the loop. Each document is
# opened in a `with` block so it is closed even if reading the page count
# raises.
page_count_rows = []
for filename in os.listdir(path_reports):
    if not filename.endswith('.pdf'):
        continue
    file_path = os.path.join(path_reports, filename)
    with fitz.open(file_path) as doc:
        page_count = doc.page_count
    company_name = filename[:-4]  # remove the ".pdf" extension
    page_count_rows.append(
        {'company': company_name, 'report_page_count': page_count}
    )

# Passing `columns` keeps both columns present even when no PDF was found.
df = pd.DataFrame(page_count_rows, columns=['company', 'report_page_count'])

In [None]:
# Add the report page counts (default inner join on the shared columns).
df_scores = df_scores.merge(df)

In [None]:
# Scatterplot showcasing the general correlation and linear relationship
# between our indices, with every point labelled by its company.

df_scores.reset_index(drop=True, inplace=True)
disc_index = df_scores['discrepancy_index']
sentiment = df_scores['clim_sentiment_overall']

# Least-squares straight line through the points
fit_line = np.poly1d(np.polyfit(disc_index, sentiment, 1))

plt.figure(figsize=(12, 9))
plt.scatter(disc_index, sentiment)
plt.plot(disc_index, fit_line(disc_index), color='red', label='Our Correlation')

# Company name centred just above each point
for x_pos, y_pos, name in zip(disc_index, sentiment, df_scores['company']):
    plt.text(x_pos, y_pos, name, ha='center', va='bottom')

plt.xlabel('Discrepancy Index')
plt.ylabel('Sentiment Discrepancy (Firm-level)')

#plt.savefig('correlationscatterplot.png', dpi = 200)

plt.show()

## Effect of News Coverage and Report Length

In [None]:
# Same scatterplot, but also including the line for perfect correlation and
# indicating whether companies belong to the low article group.

x = df_scores['discrepancy_index']
y = df_scores['clim_sentiment_overall']
# Kept under its own name: rebinding `low_article_companies` here would
# replace the list of company names built earlier in the notebook with this
# Yes/No column, which breaks a re-run of the cell creating 'low_coverage'.
low_coverage = df_scores['low_coverage']

# Fit a linear regression line
coefficients = np.polyfit(x, y, 1)
line = np.poly1d(coefficients)

# Scatter plot with color coding by coverage group.
# The groups are iterated in sorted order: a set of strings has no stable
# iteration order between interpreter runs, so the colour assigned to each
# group and the legend order would otherwise change from run to run.
plt.figure(figsize=(16, 10))
for group in sorted(set(low_coverage)):
    in_group = low_coverage == group
    plt.scatter(x[in_group], y[in_group], label=group)

# Add the linear regression line
plt.plot(x, line(x), color='red', label = 'Our Correlation')

# Add the perfect correlation line
plt.plot(x, x, color='green', linestyle='dotted', solid_capstyle='butt', label='Perfect Correlation')

plt.xlabel('Discrepancy Index')
plt.ylabel('Sentiment Discrepancy (Firm-level)')

# Add legend
plt.legend(title='Low Article Companies')

#plt.savefig('correlationscatterplot-low articles.png', dpi=500)

plt.show()

In [None]:
# Effect of the amount of news coverage on our correlation: the points are
# coloured by the news article sentence count of each company.

disc_index = df_scores['discrepancy_index']
sentiment = df_scores['clim_sentiment_overall']

# Least-squares straight line through the points
fit_line = np.poly1d(np.polyfit(disc_index, sentiment, 1))

plt.figure(figsize=(16, 10))

plt.scatter(disc_index, sentiment, c=df_scores['sentence_count'], cmap='viridis')
plt.colorbar(label='News Article Sentence Count')

# Fitted line, then the perfect correlation line (y = x)
plt.plot(disc_index, fit_line(disc_index), color='red', label='Our Correlation')
plt.plot(
    disc_index,
    disc_index,
    color='green',
    linestyle='dotted',
    solid_capstyle='butt',
    label='Perfect Correlation',
)

plt.xlabel('Discrepancy Index')
plt.ylabel('Sentiment Discrepancy (Firm-level)')

plt.legend(title='Legend')

#plt.savefig('corr_scatter_news_coverage.png', dpi=500)

plt.show()

In [None]:
# Effect of the report page count: same scatterplot, with the points
# coloured by the page count of each company's report.
disc_index = df_scores['discrepancy_index']
sentiment = df_scores['clim_sentiment_overall']

# Least-squares straight line through the points
fit_line = np.poly1d(np.polyfit(disc_index, sentiment, 1))

fig, ax = plt.subplots(figsize=(16, 10))
points = ax.scatter(disc_index, sentiment, c=df_scores['report_page_count'], cmap='viridis')
fig.colorbar(points, ax=ax, label='Report Page Count')

# Fitted line, then the perfect correlation line (y = x)
ax.plot(disc_index, fit_line(disc_index), color='red', label='Our Correlation')
ax.plot(
    disc_index,
    disc_index,
    color='green',
    linestyle='dotted',
    solid_capstyle='butt',
    label='Perfect Correlation',
)

ax.set_xlabel('Discrepancy Index')
ax.set_ylabel('Sentiment Discrepancy (Firm-level)')

ax.legend(title='Legend')

#plt.savefig('corr_scatter_page_count.png', dpi=500)

plt.show()

## Effect of Sector

In [None]:
# Add country and industry from the Refinitiv table to the scores
# (default inner join on the shared columns).
descriptive_columns = ['company', 'country', 'industry']
df_descriptives = df_ref[descriptive_columns]
df_scores = df_scores.merge(df_descriptives)

In [None]:
# Scatterplot of the two indices with the points coloured by industry.
x = df_scores['discrepancy_index']
y = df_scores['clim_sentiment_overall']
industries = df_scores['industry']

# Fit a linear regression line
coefficients = np.polyfit(x, y, 1)
line = np.poly1d(coefficients)

# Scatter plot with color coding by industry.
# The industries are iterated in sorted order: iterating a set of strings has
# no stable order between interpreter runs, so the colour assigned to each
# industry and the legend order would otherwise change from run to run.
# Missing industries are skipped; they never matched any row in the equality
# test below anyway. key=str keeps the sort from failing on mixed value types.
plt.figure(figsize=(16, 10))
for industry in sorted(industries.dropna().unique(), key=str):
    in_industry = industries == industry
    plt.scatter(x[in_industry], y[in_industry], label=industry)

# Add the linear regression line
plt.plot(x, line(x), color='red', label = 'Our Correlation')

# Add the perfect correlation line
plt.plot(x, x, color='green', linestyle='dotted', solid_capstyle='butt', label='Perfect Correlation')

plt.xlabel('Discrepancy Index')
plt.ylabel('Sentiment Discrepancy (Firm-level)')

# Add legend
plt.legend(title='Economic Sector (Refinitiv)')

#plt.savefig('corr_scatter_sector.png', dpi=500)

plt.show()

# Generating Result Tables

In [None]:
# Select four rows of the correlation table by position and print them as a
# LaTeX table.
# NOTE(review): the positions depend on the column order of df_scores;
# presumably they are the ClimateBERT/FinBERT sentiment measures - confirm.
finbert_rows = [8, 9, 12, 13]
df_finbert_results = df_correlation_scores.iloc[finbert_rows]
finbert_latex = df_finbert_results.to_latex(
    index=False,
    caption='Comparison of the Performance of a Domain-Trained (ClimateBERT) and Non-Domain Trained sentiment model (FinBERT)',
    label='tab:sentimentcomparison',
    header=True,
    position='h',
)
print(finbert_latex)

In [None]:
# Keep only the measures whose name contains none of the excluded tags,
# then renumber the remaining rows from 0.
excluded_tags = ('_lim', 'waverage', 'wsum')
filter_results = [
    not any(tag in measure for tag in excluded_tags)
    for measure in df_correlation_scores['discrepancy_measure']
]
df_results = df_correlation_scores[filter_results].reset_index(drop=True)

In [None]:
# Keep six rows of the filtered table (selected by position) and renumber
# them from 0.
selected_rows = [0, 1, 4, 5, 6, 7]
df_results = df_results.iloc[selected_rows].reset_index(drop=True)

In [None]:
# Replace the measure names with the labels used in the results table.
# The list must have exactly one label per row of df_results.
measure_labels = [
    'Dissimilarity_firm',
    'Dissimilarity_firm-topic',
    'Sentiment_firm',
    'Sentiment_firm-topic',
    'Verification_firm',
    'Verification_firm-topic',
]
df_results['discrepancy_measure'] = measure_labels

In [None]:
# Print the results table as LaTeX.
results_latex = df_results.to_latex(
    index=False,
    caption='Correlation Analysis Results for the Developed Greenwashing Scores on Document and Cluster Level',
    label='tab:correlationresults',
    header=True,
    position='h',
)
print(results_latex)

# Sentiment Score Distributions and Descriptives

In [None]:
# Box plot of the clim_sentiment_overall scores
df_sentiment['clim_sentiment_overall'].plot(kind='box')
# plt.savefig('boxplot.png')

In [None]:
# Histogram of the clim_sentiment_overall scores with 25 bins
df_sentiment['clim_sentiment_overall'].plot(kind='hist', bins=25)

# Title and axis labels
plt.title('Distribution of Clim Sentiment Overall Scores')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')

# Save the figure to the working directory before showing it
plt.savefig('sentimentdistribution.png')
plt.show()

In [None]:
# Sample standard deviation of the scores (ddof=1 is the pandas default)
df_sentiment['clim_sentiment_overall'].std(ddof=1)

In [None]:
# Companies whose clim_sentiment_overall score is above 0, with their score
is_positive = df_sentiment['clim_sentiment_overall'] > 0
df_results_2 = df_sentiment.loc[is_positive, ['company', 'clim_sentiment_overall']]

In [None]:
# print(df_results_2.to_latex(index = False, caption = 'Companies With a Sentiment Discrepancy Above 0', label = 'tab:greenwashcompanies', header = True, position = 'h'))