## Set-up and Data Loading

In [None]:
import os

from sentence_transformers import SentenceTransformer, util, models
from transformers import pipeline, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords
import plotly.express as px
from tqdm.auto import tqdm
import seaborn as sns
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

In [None]:
# Root folder that holds the structured thesis data (CSV exports and pickles).
# Set the THESIS_DATA_DIR environment variable to point the notebook at another
# location; the original local path is kept as the fallback so existing
# set-ups keep working unchanged.
path_data = os.environ.get(
    'THESIS_DATA_DIR',
    'C:\\Users\\tnguyen10\\OneDrive - Deloitte (O365D)\\Documents\\GitHub\\Thesis\\data_structured',
)

In [None]:
# Locations of the three sentence-level CSV exports inside the data folder.
report_csv = os.path.join(path_data, 'report_sentences.csv')
pdf_csv = os.path.join(path_data, 'article_sentences_pdf.csv')
gnews_csv = os.path.join(path_data, 'article_sentences_gnews.csv')

df_report = pd.read_csv(report_csv)
df_pdf = pd.read_csv(pdf_csv)
df_gnews = pd.read_csv(gnews_csv)

# Stack both article files into one frame; row labels are not renumbered here.
df_article = pd.concat([df_pdf, df_gnews])

In [None]:
# (rows, columns) of the combined article sentences, before de-duplication.
df_article.shape

In [None]:
# (rows, columns) of the report sentences.
df_report.shape

In [None]:
# Keep the first occurrence of every distinct sentence, then renumber the rows
# from zero and discard the old index.
df_article = (
    df_article
    .drop_duplicates(subset=['sentence'])
    .reset_index(drop=True)
)

In [None]:
# Sentence-embedding model used for the baseline embeddings in this notebook.
sent_embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Work on copies so the frames loaded from CSV stay untouched.
df_rep = df_report.copy()
df_art = df_article.copy()

In [None]:
# Helper for producing embeddings (handy when trying out other models later).
def gen_embeddings(df, model):
    """Encode the ``sentence`` column of ``df`` with ``model``.

    Args:
        df: Frame-like object; ``df['sentence']`` must provide ``.tolist()``.
        model: Object exposing ``encode(sentences)``.

    Returns:
        Whatever ``model.encode`` returns for the list of sentences.
    """
    return model.encode(df['sentence'].tolist())

In [None]:
# Encode every report sentence and keep the result next to its sentence in a
# new 'embeddings' column (stored as a list, one element per sentence).
rep_sent = df_rep['sentence'].to_list()
report_embeddings = sent_embedder.encode(rep_sent)
df_rep = df_rep.assign(embeddings=list(report_embeddings))

In [None]:
# Same procedure for the article sentences: encode, then attach the result as
# an 'embeddings' column.
article_sent = df_art['sentence'].to_list()
article_embeddings = sent_embedder.encode(article_sent)
df_art = df_art.assign(embeddings=list(article_embeddings))

- Next, we test out BERTopic.
- We will try it with the overlapped sentence embeddings, the non-overlapped embeddings (?), and ClimateBERT, plus potentially FinBERT.
- Then we try varying the different parameters (tuning?).

In [None]:
# On-disk cache of the embedded frames, so the slow encoding step can be reused.
art_pickle = os.path.join(path_data, 'art.pkl')
rep_pickle = os.path.join(path_data, 'rep.pkl')

if os.path.exists(art_pickle) and os.path.exists(rep_pickle):
    # NOTE: unpickling can execute arbitrary code - only load pickles that
    # were written by this notebook.
    df_art = pd.read_pickle(art_pickle)
    df_rep = pd.read_pickle(rep_pickle)
else:
    # No cache yet: persist the frames currently held in df_art / df_rep so
    # the next run can load them instead.
    df_art.to_pickle(art_pickle)
    df_rep.to_pickle(rep_pickle)

In [None]:
# combining the two datasets in preparation for the clustering
# (row labels of both frames are kept as-is here; they are reset a few cells below)
df_comb = pd.concat([df_art, df_rep])

In [None]:
# Character count of each sentence; used below to drop very short sentences.
df_comb['char_length'] = df_comb['sentence'].apply(lambda x: len(x))

In [None]:
# (rows, columns) before the length filter.
df_comb.shape

In [None]:
# Keep only sentences longer than 20 characters, then show the new size.
df_comb = df_comb[(df_comb['char_length'] > 20)]
df_comb.shape

In [None]:
# Renumber rows from zero and discard the old index labels.
df_comb.reset_index(inplace = True, drop = True)

In [None]:
# The sentences themselves; passed to BERTopic as the documents further down.
docs = df_comb['sentence']

In [None]:
# article_centroid = np.mean(df_art['embeddings'].values, axis = 0)
# report_centroid = np.mean(df_rep['embeddings'].values, axis = 0)

# dif = report_centroid - article_centroid 
# # Convert the article vectors
# def difference(org_vec):
#     return org_vec + dif

In [None]:
# df_art['new_embeddings'] = df_art['embeddings'].apply(difference)
# df_rep['new_embeddings'] = df_rep['embeddings']

In [None]:
# Collect the per-sentence vectors from the 'embeddings' column into a single
# NumPy array.
embedding_rows = df_comb["embeddings"].tolist()
embeddings = np.array(embedding_rows)

In [None]:
# Two-dimensional UMAP projection of the embeddings (random init, fixed seed).
umap2d = UMAP(
    n_components=2,
    init='random',
    random_state=0,
)
proj_2d = umap2d.fit_transform(embeddings)

# Three-dimensional variant, currently disabled:
# umap3d = UMAP(n_components = 3, init = 'random', random_state = 0)
# proj_3d = umap3d.fit_transform(embeddings)

In [None]:
# Scatter plot of the 2-D UMAP projection, coloured by document type.
scatter_options = dict(
    x=0,
    y=1,
    color=df_comb.doc_type,
    labels={'color': 'doc_type'},
    width=800,
    height=500,
)
fig2d = px.scatter(proj_2d, **scatter_options)

# 3-D counterpart, currently disabled:
# fig3d = px.scatter_3d(
#     proj_3d, x = 0, y = 1, z = 2,
#     color = df_comb.doc_type, labels = {'color': 'doc_type'}
# )
# fig3d.show()

fig2d.show()

In [None]:
# t-SNE projections of the same embeddings (fixed seed), in 2-D ...
tsne2d = TSNE(n_components=2, random_state=0)
proj_2d_tsne = tsne2d.fit_transform(embeddings)

# ... and in 3-D.
# NOTE(review): in the visible notebook only a commented-out plot reads
# proj_3d_tsne, so this second fit may be unnecessary work - confirm.
tsne3d = TSNE(n_components=3, random_state=0)
proj_3d_tsne = tsne3d.fit_transform(embeddings)


In [None]:
# Scatter plot of the 2-D t-SNE projection, coloured by document type.
tsne_scatter_options = dict(
    x=0,
    y=1,
    color=df_comb.doc_type,
    labels={'color': 'doc_type'},
)
fig2d_tsne = px.scatter(proj_2d_tsne, **tsne_scatter_options)

# 3-D counterpart, currently disabled:
# fig3d_tsne = px.scatter_3d(
#     proj_3d_tsne, x = 0, y = 1, z = 2,
#     color = df_comb.doc_type, labels = {'color': 'doc_type'}
# )
# fig3d_tsne.show()

fig2d_tsne.show()

Based on the above visualizations, we can see that, unlike Boelders, we do not need to overlap the sentence embeddings, as there are no large semantic differences between the news articles and the reports. The jargon and language seem to be mostly the same; as such, we can simply use the original embeddings.

## BERTopic Set-up

### Testing UMAP parameters

In [None]:
# Hex colour assigned to each document type.
c_map = {
    'news': '#FAFF00',
    'report': '#1C17FF'
}
# One colour per row of df_comb; a doc_type missing from c_map raises KeyError.
colors = [c_map[doc_type] for doc_type in df_comb['doc_type']]
len(colors)

In [None]:
# 2 x 3 grid of UMAP projections, one panel per n_neighbors value,
# filled row by row.
fig, ax = plt.subplots(2, 3, figsize=(21, 10))
nns = [5,10, 15, 30, 50, 100]
#2, 3, 4
for n_neighbors, panel in zip(tqdm(nns), ax.flat):
    fit = UMAP(n_neighbors=n_neighbors, random_state = 0)
    u = fit.fit_transform(embeddings)
    sns.scatterplot(x=u[:,0], y=u[:,1], ax=panel)
    panel.set_title(f'n={n_neighbors}')

In [None]:
# Write the figure currently bound to `fig` to full_figure.png (relative path).
fig.savefig('full_figure.png')

In [None]:
# Two side-by-side panels for very small n_neighbors values, with the points
# coloured by document type through the `colors` list.
fig, ax = plt.subplots(1, 2, figsize=(14, 14))
nns = [3,4]
#2, 3, 4
for n_neighbors, panel in zip(tqdm(nns), ax):
    fit = UMAP(n_neighbors=n_neighbors, random_state = 0)
    u = fit.fit_transform(embeddings)
    sns.scatterplot(x=u[:,0], y=u[:,1], c=colors, ax=panel)
    panel.set_title(f'n={n_neighbors}')

In [None]:
# 3-D UMAP projection: cosine metric, random init, fixed seed, min_dist = 0.
fit = UMAP(
    n_neighbors=10,
    n_components=3,
    min_dist=0.0,
    init='random',
    metric='cosine',
    random_state=0,
)
u = fit.fit_transform(embeddings)

# the lowest min distance finally creates some separation - what does increasing it do?

In [None]:
# 3-D scatter of the projection held in `u`, coloured by document type.
x3d, y3d, z3d = u[:, 0], u[:, 1], u[:, 2]
fig = px.scatter_3d(
    x=x3d,
    y=y3d,
    z=z3d,
    color=df_comb.doc_type,
    labels={'color': 'doc_type'},
)

fig.show()

In [None]:
# 2D representation (n_neighbors = 15)
fit2d = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, init = 'random', metric = 'cosine', random_state = 0)
# Project with `fit2d`, the reducer configured on the line above. Calling
# `fit.fit_transform` here would silently reuse a different UMAP object and
# ignore the n_neighbors / n_components chosen for this plot.
u2d = fit2d.fit_transform(embeddings)

fig2d = px.scatter(
    x=u2d[:,0], y=u2d[:,1],
    color=df_comb.doc_type,
    labels = {'color': 'doc_type'},
    title = 'UMAP 2-D projection (n_neighbors=15)',
    width = 800, height = 500
)

fig2d.show()

In [None]:
# 2D representation (n_neighbors = 10)
fit2d = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, init = 'random', metric = 'cosine', random_state = 0)
# Project with `fit2d`, the reducer configured on the line above. Calling
# `fit.fit_transform` here would silently reuse a different UMAP object and
# ignore the n_neighbors / n_components chosen for this plot.
u2d = fit2d.fit_transform(embeddings)

fig2d = px.scatter(
    x=u2d[:,0], y=u2d[:,1],
    color=df_comb.doc_type,
    labels = {'color': 'doc_type'},
    title = 'UMAP 2-D projection (n_neighbors=10)',
    width = 800, height = 500
)

fig2d.show()

### Clustering with HDBSCAN

In [None]:
# Cluster the projection held in `u` with HDBSCAN.
# NOTE(review): when the notebook runs top to bottom, `u` is the 3-D UMAP
# output assigned in the min_dist=0.0 cell above - confirm that is intended.
clusterer = HDBSCAN(min_cluster_size = 400)
clusterer.fit(u)

In [None]:
# Condensed cluster tree, with the selected clusters highlighted.
clusterer.condensed_tree_.plot(select_clusters=True)

In [None]:
# Same clustering with min_samples and the metric set explicitly, followed by its tree.
clusterer = HDBSCAN(min_cluster_size = 400, min_samples = 50, metric = 'euclidean')
clusterer.fit(u)
clusterer.condensed_tree_.plot(select_clusters=True)

## BERTopic

In [None]:
# getting stopwords
# Read the corpus through the `nltk_stopwords` alias: this cell rebinds the
# name `stopwords` to a plain list, so reading from `stopwords` itself would
# raise AttributeError the second time the cell is run.
stopwords = list(nltk_stopwords.words('english'))

In [None]:
# Custom building blocks for the BERTopic pipeline.

# Vectorizer: bag-of-words counts with the English stop words removed.
vectorizer_model = CountVectorizer(stop_words=stopwords)

# Dimensionality reduction: UMAP down to 5 components.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    init='random',
    metric='cosine',
    random_state=0,
)

# Clustering: HDBSCAN (200 has decent results; min_samples left at its default).
hdbscan_model = HDBSCAN(
    min_cluster_size=200,
    metric='euclidean',
    prediction_data=True,
)

# Topic representation: MMR to diversify the topic words.
representation_model = MaximalMarginalRelevance(diversity=0.5)

In [None]:
# Assemble BERTopic from the custom components and fit it on the sentences
# using the pre-computed embeddings.
bertopic_options = dict(
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    language="english",
    representation_model=representation_model,  # diversify topic words
    calculate_probabilities=False,
    verbose=True,
    # options tried earlier, left disabled:
    # nr_topics = 'auto'
    # min_topic_size = 50,
    # n_gram_range = (1,2)
)
topic_model = BERTopic(**bertopic_options)

# Later cells read the per-document assignments as topics[0].
topics = topic_model.fit_transform(docs, embeddings)

In [None]:
# Overview table of the topics in the fitted model.
topic_model.get_topic_info()

In [None]:
# Same overview call; the line below presumably records the HDBSCAN setting
# this cell's saved output was produced with - TODO confirm.
topic_model.get_topic_info()
#hdbscan_model = HDBSCAN(min_cluster_size = 200, min_samples = 50, metric='euclidean', prediction_data=True)

In [None]:
# Same overview call; presumably for the HDBSCAN setting noted below - TODO confirm.
topic_model.get_topic_info()
# hdbscan_model = HDBSCAN(min_cluster_size = 250, metric='euclidean', prediction_data=True)

In [None]:
# Representation of topic 25; the id is hard-coded and depends on the fit.
topic_model.get_topic(25)

In [None]:
# Interactive topic overview plot.
topic_model.visualize_topics()

In [None]:
# Repeated call of the same plot (kept as a separate cell).
topic_model.visualize_topics()

In [None]:
# Topic hierarchy computed from the documents; used by the tree and
# hierarchical-document views below.
hierarchical_topics = topic_model.hierarchical_topics(docs)

In [None]:
# Text rendering of the topic hierarchy. The pasted interpreter prompts were
# removed so the cell is plain Python and also runs outside IPython.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

In [None]:
# Pre-compute a 2-D projection for plotting; optional, but it makes repeated
# calls to the visualisation much faster.
plot_reducer = UMAP(
    n_neighbors=10,
    n_components=2,
    min_dist=0.0,
    init='random',
    metric='cosine',
    random_state=0,
)
reduced_embeddings = plot_reducer.fit_transform(embeddings)
topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    reduced_embeddings=reduced_embeddings,
)

In [None]:
# Word bar charts with top_n_topics=10, showing 20 words per topic.
topic_model.visualize_barchart(top_n_topics=10, n_words = 20, height = 400)

In [None]:
# Same chart with 10 words per topic.
topic_model.visualize_barchart(top_n_topics=10, n_words = 10, height = 400)

In [None]:
# Same chart with the library's default word count and height.
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
# Heatmap view of the fitted topics.
topic_model.visualize_heatmap()

In [None]:
# storing coherence?
# Length of element 0 of the value returned by fit_transform.
len(topics[0])

In [None]:
# Plain Python list of the sentences; used by the coherence computation below.
filtered_text = docs.to_list()

## Fitting different embedding models

In [None]:
# mpnet - don't have high hopes for this one
model_mpnet = SentenceTransformer('all-mpnet-base-v2')
# gen_embeddings reads df['sentence'], so it must receive the dataframe itself;
# passing len(df_comb) (an int) raises TypeError.
mpnet_embeddings = gen_embeddings(df_comb, model_mpnet)

# Persist the vectors so this encoding does not have to be repeated.
np.save('mpnet_embeddings', mpnet_embeddings)

In [None]:
# NOTE(review): `clim_embeddings` is not assigned anywhere in the visible
# notebook, so this cell and the ones below fail on a fresh kernel. It
# presumably came from a since-removed cell - TODO restore it.
len(clim_embeddings)

In [None]:
# Re-fits the SAME topic_model object on the other embeddings, replacing the
# previously fitted state; `topics` from the earlier fit is left unchanged.
topics_clim = topic_model.fit_transform(docs, clim_embeddings)

In [None]:
# Topic overview for the model as fitted in the previous cell.
topic_model.get_topic_info()

In [None]:
# 2-D projection of these embeddings, used only for plotting the documents.
reduced_clim_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0,init = 'random', metric = 'cosine', random_state = 0).fit_transform(clim_embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_clim_embeddings)

In [None]:
# c-TF-IDF matrix and vocabulary of the fitted model.
topic_term_matrix = topic_model.c_tf_idf_
words = topic_model.vectorizer_model.get_feature_names_out()

In [None]:
# NOTE(review): `topic_words` is first assigned in the coherence cell further
# down, so this display fails when the notebook is run top to bottom.
topic_words

## Calculating Coherence

In [None]:
# Topic coherence (c_v) for the model that is currently fitted.

# Take the assignments from the model itself rather than from an earlier
# `topics` variable: topic_model may have been re-fitted since, and the
# assignments must match the topic words returned by get_topic() below.
doc_topics = list(topic_model.topics_)

# One concatenated pseudo-document per topic.
documents = pd.DataFrame({"Document": filtered_text,
                          "ID": range(len(filtered_text)),
                          "Topic": doc_topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Evaluate every real topic id. The outlier class (-1) is skipped explicitly,
# so nothing is dropped when the clusterer produces no outlier class.
topic_ids = sorted(topic_id for topic_id in set(doc_topics) if topic_id != -1)
# Empty strings are dropped because they cannot be looked up in the dictionary.
topic_words = [[word for word, _ in topic_model.get_topic(topic_id) if word]
               for topic_id in topic_ids]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

In [None]:
# Show the coherence score computed in the previous cell.
coherence

In [None]:
# k-means alternative for the clustering step.
# `k` is assigned here so the cell runs on a fresh kernel. The default is the
# number of non-outlier topics in the currently fitted topic model (at least
# 2), which keeps the two clusterers comparable - change it freely when tuning.
k = max(2, len(set(topic_model.topics_) - {-1}))
kmeans_model = KMeans(n_clusters = k, init = 'k-means++',random_state = 0, n_init = 10)

In [None]:
# Same BERTopic set-up as before, except that the k-means model is passed in
# the clustering slot (the `hdbscan_model` argument).
kmeans_bertopic_options = dict(
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=kmeans_model,
    language="english",
    representation_model=representation_model,  # diversify topic words
    calculate_probabilities=False,
    verbose=True,
    # options tried earlier, left disabled:
    # nr_topics = 'auto'
    # min_topic_size = 50,
    # n_gram_range = (1,2)
)
topic_model = BERTopic(**kmeans_bertopic_options)

# Replaces both the previous model and the previous `topics` value.
topics = topic_model.fit_transform(docs, embeddings)