# Set-up and loading the data

In [None]:
from sentence_transformers import SentenceTransformer, util, models
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from umap import UMAP
from hdbscan import HDBSCAN
import os
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from bertopic.representation import MaximalMarginalRelevance
from sklearn.manifold import TSNE
import plotly.express as px
from tqdm.auto import tqdm
import seaborn as sns
import gensim
import gensim.corpora as corpora
import re
from gensim.models.coherencemodel import CoherenceModel
# from octis.evaluation_metrics.diversity_metrics import TopicDiversity

## Data loading

In [None]:
# Data locations, relative to the notebook's working directory.
# The paths are assembled with os.path.join so the separator is correct on every OS;
# on Windows this produces the same '..\\data_structured' style strings as a literal would.
path_data = os.path.join(os.pardir, 'data_structured')
path_reports = os.path.join(os.pardir, 'data', 'reports')

In [None]:
# Company names are derived from the report file names. They are used later as extra
# stop words for the vectorizer and are replaced in the text to build anonymized embeddings.
sample = os.listdir(path_reports)
# the part of each file name before the first '.', kept as a pandas Series for its string helpers
company_list = pd.Series([file_name.split('.')[0] for file_name in sample])
publisher_list = (
    company_list.str.replace('-',' ')  # hyphenated names become separate words
    .str.replace('ford motor', 'ford')
    .str.replace('p&g', 'procter')
)
# plain-list form; 'p&g' is added back so both it and 'procter' are covered
names_list = [*publisher_list.tolist(), 'p&g']

In [None]:
# Read the three sentence-level CSV files from the structured data folder.
df_report, df_pdf, df_gnews = (
    pd.read_csv(os.path.join(path_data, file_name))
    for file_name in (
        'report_sentences.csv',
        'article_sentences_pdf.csv',
        'article_sentences_gnews.csv',
    )
)
# both article sources stacked into one frame: pdf rows first, then gnews rows
df_article = pd.concat([df_pdf, df_gnews])

## Data pre-processing

Here we apply the pre-processing steps needed to generate the sentence embeddings. We also filter the data on character length, as described in the thesis. Finally, we 'anonymize' the sentences by removing the company names, so that BERTopic has an easier time forming clusters that are not simply based on companies. The resulting dataset is also used for the subsequent analysis.

In [None]:
# Sentence embedding model used for every embedding in this notebook.
# Run on the GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook also works on machines without a CUDA device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sent_embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
# keep the first occurrence of every article sentence and renumber the rows from 0
df_article = df_article.drop_duplicates(subset=['sentence']).reset_index(drop=True)

In [None]:
# work on copies so the loaded frames stay untouched
df_rep, df_art = df_report.copy(), df_article.copy()

In [None]:
# function for generating embeddings (useful for testing out later):
def gen_embeddings(df, model, column='sentence'):
    """Encode one text column of a dataframe with an embedding model.

    Args:
        df: DataFrame that holds the texts.
        model: Object exposing ``encode(list_of_str)``; in this notebook a
            SentenceTransformer.
        column: Name of the text column to encode. Defaults to ``'sentence'``,
            so existing two-argument calls behave exactly as before.

    Returns:
        Whatever ``model.encode`` returns for the list of texts in ``column``.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    return model.encode(df[column].tolist())

In [None]:
%%time
# Embed the sentences of both frames (reports first, then articles) and keep
# one vector per row in a new 'embeddings' column.
for frame in (df_rep, df_art):
    frame['embeddings'] = list(gen_embeddings(frame, sent_embedder))

In [None]:
# combining the two datasets in preparation for the clustering
df_comb = pd.concat([df_art, df_rep])

# Character length of each sentence, used to keep only text longer than 20 chars.
# `.str.len()` gives NaN for missing sentences (NaN > 20 is False, so those rows are
# dropped) instead of raising like len() would.
df_comb['char_length'] = df_comb['sentence'].str.len()

# Filter, de-duplicate and renumber in one chain. Rebinding the result (rather than
# editing a filtered slice in place) yields an independent frame, so adding the
# column below does not trigger SettingWithCopyWarning.
df_comb = (
    df_comb[df_comb['char_length'] > 20]
    .drop_duplicates(subset=['sentence'])
    .reset_index(drop=True)
)

# Escape the names for safe inclusion in the regular expression. Empty names are
# skipped: an empty alternative would match at every word boundary.
escaped_names = [re.escape(name) for name in names_list if name]

if escaped_names:
    # whole-word match of any company name, optionally followed by a possessive 's
    pattern = r"\b(?:{})(?:'s)?\b".format("|".join(escaped_names))
    # Replace company names with 'the company' (case-insensitive)
    df_comb['anon_sentence'] = df_comb['sentence'].str.replace(pattern, 'the company', case=False, regex=True)
else:
    # no names to replace: the anonymized text is the sentence itself
    df_comb['anon_sentence'] = df_comb['sentence']


In [None]:
# Stack the per-row sentence embeddings stored in the dataframe into one array
embeddings = np.array(list(df_comb["embeddings"]))

In [None]:
# creating new embeddings based on the 'anonymized' sentences (without company names) - let's see how BERTopic handles it now
# The 'anon_sentence' column is encoded directly: gen_embeddings(df_comb, ...) reads the
# 'sentence' column and would therefore just reproduce the non-anonymized embeddings.
df_comb['anon_embeddings'] = list(sent_embedder.encode(df_comb['anon_sentence'].tolist()))

In [None]:
# from here on `embeddings` refers to the vectors in the 'anon_embeddings' column
embeddings = np.array(list(df_comb["anon_embeddings"]))

In [None]:
# df_comb = pd.to_pickle(os.path.join(path_data, 'comb.pkl'))
# df_comb

# Descriptive Visualizations

In [None]:
# Histogram of the values in the 'word count' column, saved to disk and then shown.
# NOTE(review): 'word count' is not created in the cells visible here - presumably it
# comes with the loaded CSV files; confirm.
fig_hist, ax_hist = plt.subplots(figsize=(10,5))
ax_hist.hist(df_comb['word count'], bins=30, color='skyblue')
ax_hist.set_title('Word Count Distribution')
ax_hist.set_xlabel('Word Count')
ax_hist.set_ylabel('Frequency')
fig_hist.savefig('word_count_hist.png', dpi=300)

plt.show()

In [None]:
# Group the data by company, doc_type, and count the number of sentences
df_by_company_doc_type = df_comb.groupby(['company', 'doc_type'])['sentence'].count().reset_index()

# Pivot the dataframe to create separate columns for each doc_type
df_pivot = df_by_company_doc_type.pivot(index='company', columns='doc_type', values='sentence')

# Create a line chart of the number of sentences by company and doc_type.
# The size is passed to the pandas plot call itself: DataFrame.plot opens its own
# figure, so a separate plt.figure(figsize=...) would only leave an empty figure behind.
ax_lines = df_pivot.plot.line(figsize=(20,5), color=['red', 'blue'], marker='o', markersize=5)
ax_lines.set_title('Number of Sentences by Company and Doc Type')
ax_lines.set_xlabel('Company')
ax_lines.set_ylabel('Number of Sentences')
ax_lines.legend(title='Doc Type')

# Display the chart (it is not written to disk)
plt.show()

# BERTopic Set-up

## Dimensionality Reduction - default UMAP

In [None]:
# Two-dimensional UMAP projection of the embeddings with otherwise default settings;
# random initialisation and a fixed seed are used for the projection.
# (A 3-component variant, umap3d / proj_3d, can be built the same way with n_components = 3.)
umap2d = UMAP(
    n_components=2,
    init='random',
    random_state=0,
)
proj_2d = umap2d.fit_transform(embeddings)

In [None]:
# Scatter plot of the 2-D projection: column 0 on the x-axis, column 1 on the y-axis.
# (A 3-D counterpart would use px.scatter_3d on proj_3d with x = 0, y = 1, z = 2 and
# color = df_comb.doc_type, labels = {'color': 'doc_type'}.)
fig2d = px.scatter(proj_2d, x=0, y=1, width=1000, height=700)
fig2d.show()

In [None]:
# comparing it to TSNE dimensionality reduction - takes quite a while so commented out 
# tsne2d = TSNE(n_components = 2, random_state = 0)
# tsne3d = TSNE(n_components = 3, random_state = 0)

# proj_2d_tsne = tsne2d.fit_transform(embeddings)
# proj_3d_tsne = tsne3d.fit_transform(embeddings)


In [None]:
# fig2d_tsne = px.scatter(
#     proj_2d_tsne, x = 0, y = 1, 
#     color = df_comb.doc_type, labels = {'color': 'doc_type'}
# )

# # fig3d_tsne = px.scatter_3d(
# #     proj_3d_tsne, x = 0, y = 1, z = 2,
# #     color = df_comb.doc_type, labels = {'color': 'doc_type'}
# # )

# fig2d_tsne.show()
# # fig3d_tsne.show()

Based on the above visualizations, we can see that, unlike Boelders, we do not need to overlap the sentence embeddings, as there are no large semantic differences between the news articles and the reports. The jargon and language seem to be mostly the same; as such, we can simply use the original embeddings.

### Testing UMAP parameters

In [None]:
# One UMAP projection per n_neighbors value, drawn on a 2x3 grid of panels.
# (the values 2, 3, 4 are not part of the list)
nns = [5,10, 15, 30, 50, 100]
fig, ax = plt.subplots(2, 3, figsize=(21, 10))
# ax.flat walks the grid row by row: first row left to right, then the second row
for n_neighbors, panel in zip(tqdm(nns), ax.flat):
    u = UMAP(n_neighbors=n_neighbors, random_state = 0).fit_transform(embeddings)
    sns.scatterplot(x=u[:,0], y=u[:,1], ax=panel)
    panel.set_title(f'n={n_neighbors}')

In [None]:
# 2-D projection with the chosen UMAP settings (10 neighbours, cosine metric,
# min_dist of 0, random initialisation, fixed seed), followed by a scatter plot of it
fit2d = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    init='random',
    metric='cosine',
    random_state=0,
)
u2d = fit2d.fit_transform(embeddings)

x_coords, y_coords = u2d[:,0], u2d[:,1]
fig2d = px.scatter(x=x_coords, y=y_coords, width=1000, height=700)
fig2d.show()

# Running BERTopic

In [None]:
# documents handed to BERTopic: the sentences with company names replaced
docs = df_comb['anon_sentence']

In [None]:
# 2-component UMAP projection of the embeddings; the later two-dimensional
# cluster visualizations take it as `reduced_embeddings`
reducer_2d = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    init='random',
    metric='cosine',
    random_state=0,
)
reduced_embeddings = reducer_2d.fit_transform(embeddings)

In [None]:
# getting stopwords for the TF-IDF representation: NLTK's English list, a few
# hand-picked terms and the company names.
# The NLTK corpus is imported under a private alias because this cell rebinds the name
# `stopwords` to a plain list; with the alias the cell can be re-run without hitting
# "AttributeError: 'list' object has no attribute 'words'".
from nltk.corpus import stopwords as _nltk_stopwords
stopwords = list(_nltk_stopwords.words('english')) + ['company','coca','cola'] + names_list

In [None]:
# Components handed to BERTopic in place of its defaults.

# vectorizer: bag-of-words counts with the custom stop-word list removed
vectorizer_model = CountVectorizer(stop_words=stopwords)

# umap: reduction to 5 components
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    init='random',
    metric='cosine',
    random_state=0,
)

# hdbscan: a min_cluster_size of 200 has decent results
# (still open to tune: min_samples, diversity of the topic words)
hdbscan_model = HDBSCAN(
    min_cluster_size=200,
    metric='euclidean',
    prediction_data=True,
)

# representation: maximal marginal relevance, to get more distinct topic words
representation_model = MaximalMarginalRelevance(diversity=0.5)

In [None]:
# Assemble the topic model from the customized components and fit it.
topic_model = BERTopic(
    language="english",
    embedding_model=sent_embedder,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,  # diversify topic words
    calculate_probabilities=True,
    verbose=True,
)
# the precomputed embeddings are passed in alongside the documents
topics, probs = topic_model.fit_transform(docs, embeddings)

In [None]:
# optional visualizations
# topic_model.visualize_barchart(top_n_topics=10, n_words = 10, height = 400)

In [None]:
# topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, 
#                                 hide_document_hover=True, hide_annotations=True)

## Topic Reduction

In [None]:
# topic hierarchy computed from the fitted model and the documents; it feeds the
# topic tree printout and the hierarchical document plot in the following cells
hierarchical_topics = topic_model.hierarchical_topics(docs)

In [None]:
# get a hierarchical tree: text rendering of the topic hierarchy, printed for inspection
# (written without the interactive '>>>' prompts so the cell is plain, valid Python)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

# SAF stands for sustainable aviation fuel - together with topic 13 it talks about the opportunities for sustainable fuel to reduce (mostly) emissions - could be called something like sustainable transportation? with dhl

In [None]:
# documents plotted on the 2-D projection together with the topic hierarchy;
# the figure is the cell's output
topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)

In [None]:
# Now reduction of topics - we will merge them manually based on the above visualizations.
# Each inner list is one group of topic ids that is merged into a single topic.
# NOTE(review): topic 30 appears in two groups (waste management and decarbonisation);
# a topic can only end up in one merged topic, so confirm which group is intended.
topics_to_merge = [
    [20,33,3,14], #plastic packaging and recycling
    [12,4,7,51,60,24,59,30], #recycling, more focus on circular economy other than just plastic - waste management
    [58,56,35,21,13], #regenerative agriculture/sustainable agriculture
    [8,23,27], #water, biodiversity, deforestation - could be called nature preservation
    [43,16], #sustainability (tech) innovation
    [5,28], #climate change
    [1,6,17,22], #sustainable leadership/governance
    [29,11,25,19], #sustainable fuels/transportations
    [39,40,2,15,53,61,45,47,30,32], #decarbonisation,emission reductions - include 47? (air pollution), 30? (chemical substances)
    [18,34,46]#EV's
] 
topic_model.merge_topics(docs, topics_to_merge)

## Outlier Reduction

In [None]:
# I saved and loaded the topic model here to not have to re-run it every time
# topic_model.save('merged_model')
# topic_model = BERTopic.load('merged_model')

In [None]:
# current topic assignments and probabilities, read back from the model
topics, probs = topic_model.topics_, topic_model.probabilities_

In [None]:
# display the topic overview returned by the model after the manual merge
topic_model.get_topic_info()

In [None]:
%%time
# Outlier reduction: only the "distributions" strategy (threshold 0.08) is active.
# The "probabilities" and "c-tf-idf" variants are commented out, so this cell does not
# define new_topics_probs or new_topics_tfidf.
#new_topics_probs = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities")
#new_topics_tfidf = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold = 0.1)
new_topics_dist = topic_model.reduce_outliers(docs, topics, strategy = "distributions", threshold = 0.08)

### HDBSCAN Probabilities

In [None]:
# Outlier reduction based on the HDBSCAN probabilities.
# The assignments are computed in this cell so it does not fail with a NameError when
# new_topics_probs has not been created elsewhere in the session.
new_topics_probs = topic_model.reduce_outliers(docs, topics, probabilities=probs, strategy="probabilities")
# refresh the topic representations for the new assignments, then plot the documents
topic_model.update_topics(docs, topics=new_topics_probs, vectorizer_model = vectorizer_model, representation_model = representation_model)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

In [None]:
# display the topic overview after the probability-based outlier reduction
topic_model.get_topic_info()

### C-TF-IDF similarity

In [None]:
# Outlier reduction based on c-TF-IDF similarity (threshold 0.1).
# The assignments are computed in this cell so it does not fail with a NameError when
# new_topics_tfidf has not been created elsewhere in the session.
new_topics_tfidf = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold = 0.1)
# refresh the topic representations for the new assignments, then plot the documents
topic_model.update_topics(docs, topics=new_topics_tfidf, vectorizer_model = vectorizer_model, representation_model = representation_model)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

In [None]:
# display the topic overview after the c-TF-IDF based outlier reduction
topic_model.get_topic_info()

### Using Sentence and Topic Embedding Similarity

In [None]:
# The update with embedding-based assignments is commented out (new_topics_embed is not
# computed in the visible cells), so this plot shows the model in whatever state the
# previously executed cells left it. Document hover text is enabled here.
#topic_model.update_topics(docs, topics=new_topics_embed, vectorizer_model = vectorizer_model,representation_model = representation_model)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=False, hide_annotations=True)

In [None]:
# display the topic overview for the current state of the model
topic_model.get_topic_info()

### Using topic distributions

In [None]:
# updating the topics based on the reduction of outliers using topic distributions:
# the new_topics_dist assignments are written into the model, with the same vectorizer
# and representation model as at fitting time, and the documents are plotted afterwards
topic_model.update_topics(docs, topics=new_topics_dist, vectorizer_model = vectorizer_model, representation_model = representation_model) #  representation_model = representation_model
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

In [None]:
# store the model's current topic assignment of every sentence in a 'topics' column
# (and keep the same assignments in `topics`)
df_comb['topics'] = topics = topic_model.topics_

In [None]:
# second merging after taking a look at the reports
topics_to_merge = [
    [26,10], #green cars/car production
    [14,3], #pollution lawsuits included
]
topic_model.merge_topics(docs, topics_to_merge)

# Merging rewrites topic_model.topics_ (merged topics collapse into one id and the ids
# may be renumbered), so the per-sentence column is refreshed here. Otherwise
# df_comb['topics'] would still hold the pre-merge ids while the topic overview,
# word clouds and topic labels further down refer to the post-merge ids.
topics = topic_model.topics_
df_comb['topics'] = topics

In [None]:
# display the topic overview after the second merge
topic_model.get_topic_info()

# Generating Word Clouds and Other Visualizations for the Thesis

In [None]:
from wordcloud import WordCloud

def create_wordcloud(model, topic, save_path=None):
    """Draw a word cloud from the keywords of one topic.

    Args:
        model: Topic model exposing ``get_topic(topic)``, which yields
            ``(word, weight)`` pairs for the topic.
        topic: Id of the topic to draw.
        save_path: Optional output directory. When given, the image is written
            to ``<save_path>/wordcloud_<topic>.png`` (the directory is created
            if needed) and the figure is closed; otherwise the figure is shown.

    Raises:
        ValueError: If the model returns no keywords for ``topic``.
    """
    keywords = model.get_topic(topic)
    if not keywords:
        # a falsy result (nothing to draw) would otherwise surface as an opaque error
        raise ValueError(f"No keywords available for topic {topic!r}")
    # word -> weight mapping used as the word "frequencies"
    frequencies = dict(keywords)

    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(frequencies)

    # Each cloud gets its own figure, so repeated calls do not stack images
    # on top of each other in a shared current figure.
    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    if save_path:
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(os.path.join(save_path, f"wordcloud_{topic}.png"))
        # release the figure; calling this in a loop would otherwise accumulate open figures
        plt.close(fig)
    else:
        plt.show()


In [None]:
# ids of the topics kept for the analysis; one word cloud is written per topic
sust_topics = [0,1,2,3,4,5,6,7,8,10,11,20,25]
# output folder next to the notebook, built with os.path.join so the separator is
# right on every OS (on Windows this is the same '.\\visualizations' string)
save_path = os.path.join(os.curdir, "visualizations")

for topic_id in sust_topics:
    create_wordcloud(topic_model, topic_id, save_path = save_path)

## Creating Heatmaps

In [None]:
# human-readable label for each topic id used in the analysis
topic_labelling = {
    0: "ESG Governance",
    1: "Emission Reduction",
    2: "Waste Management",
    3: "Renewable Energy",
    4: "Plastics Recycling",
    5: "Electrical Vehicles",
    6: "Climate Change Risk Mitigation",
    7: "Nature Conservation",
    8: "Green Transportation",
    10: "Sustainable Agriculture",
    11: "Sustainable Innovation",
    20: "Sustainable Finance",
    25: "Greenwashing Accusations",
}

In [None]:
# Keep only the sentences assigned to one of the selected topics and renumber the rows.
# Chaining reset_index (instead of resetting a boolean-mask slice in place) produces an
# independent frame, so adding columns to df_analyze later does not raise
# SettingWithCopyWarning.
df_analyze = df_comb[df_comb['topics'].isin(sust_topics)].reset_index(drop=True)

In [None]:
# look up the label of every row's topic id (an id without a label raises KeyError)
df_analyze['topic_labels'] = [topic_labelling[topic_id] for topic_id in df_analyze['topics']]

In [None]:
# Share of each company's sentences that falls under each topic label:
# sentences per (company, label) pair divided by the company's total
sentences_per_pair = df_analyze.groupby(['company', 'topic_labels'])['sentence'].count()
sentences_per_company = df_analyze.groupby('company')['sentence'].count()
# labels become the rows and companies the columns
df_share = (sentences_per_pair / sentences_per_company).unstack().T

# Heatmap of the shares, without cell annotations
plt.figure(figsize=(19, 13))
sns.heatmap(df_share, cmap='PuRd', annot=False)
plt.title('Share of Sentences per Topic Label and Company')
plt.xlabel('Company')
plt.ylabel('Topic Label')

# Write the figure to a PNG file before displaying it
plt.savefig('heatmap.png', dpi=500)
plt.show()