# ANTMN with BERTopic
The following is an implementation of Walter & Ophir's (2019) __Analysis of Topic Model Networks__ method using the BERTopic topic modelling architecture on a Hungarian news corpus. Unfortunately none of the textual data used for this project can be legally published, therefore the following is merely a methodological demonstration. 
<br>

For the original method see: Walter, D. and Ophir, Y., 2019. News frame analysis: An inductive mixed-method computational approach. Communication Methods and Measures, 13(4), pp.248-266.
<br>
For the BERTopic implementation below see: Nagy, M., 2024. Testing an Inductive Mixed-method Computational Approach to News Frame Analysis: An analysis of Hungarian online reporting of the 2014 Russia-Ukraine conflict. https://repository.ifla.org/handle/123456789/3402

In [1]:
import pickle
import numpy as np
import pandas as pd
from functools import partial
from datetime import datetime
from ast import literal_eval
from collections import defaultdict
from itertools import combinations

import spacy

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

from sentence_transformers import SentenceTransformer

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import calinski_harabasz_score

import igraph as ig
import networkx as nx

import matplotlib
matplotlib.use('nbagg')
from matplotlib import pyplot as plt 
from matplotlib import dates as mdates
%matplotlib inline
from adjustText import adjust_text

In [2]:
# Default size ([width, height]) applied to every matplotlib figure created in this notebook
plt.rcParams['figure.figsize'] = [20, 16]

# Functions

In [71]:
# np.mean with axis=0 pre-bound, i.e. the mean is taken along the first axis of whatever is passed in
np_mean = partial(np.mean, axis=0)  # Matrix mean calculation

def _np_max(stuff):
    """Return the element-wise maximum taken along the first axis of ``stuff``.

    ``stuff`` may be any iterable of equally shaped rows; it is materialised
    into a list before being handed to NumPy.
    """
    stacked = np.asarray(list(stuff))
    return stacked.max(axis=0)

def normalize_node_size(data):
    """Network node size normalizer (min-max scaling to the range [0, 1]).

    :params: data: array-like supporting element-wise arithmetic (e.g. numpy array or pandas Series)
    :returns: object of the same kind as ``data`` with the smallest value mapped to 0 and the largest to 1.
        If all values are equal there is no range to scale by; every element is then mapped to 1.0
        so that the nodes keep a visible, uniform size instead of becoming NaN.
    """
    lowest = np.min(data)
    value_range = np.max(data) - lowest
    shifted = data - lowest
    if value_range == 0:
        # Degenerate case: avoid 0/0 -> NaN node sizes
        return shifted + 1.0
    return shifted / value_range

def sil_and_calinski(topic_model, topics, probs, embeddings, name=None):
    """Calculate Silhouette (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html) 
      and Calinski and Harabasz (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.calinski_harabasz_score.html) 
      scores for checking cluster quality
      
      Calinski-Harabasz Index is only for information, not as useful in density based clustering. 

      :params: topic_model: fitted BERTopic model; its umap_model is used to reduce the embeddings
      :params: topics: topic assignment per document (-1 marks outliers)
      :params: probs: unused, kept for call compatibility
      :params: embeddings: document embeddings the model was fitted on
      :params: name: unused, kept for call compatibility
      :returns: (silhouette, calinski_harabasz) tuple; (None, None) when fewer than two
          non-outlier topics exist and the scores are therefore undefined
      """
    
    # Remove outlier topics
    # https://github.com/MaartenGr/BERTopic/issues/428#issuecomment-1027647827
    topic_array = np.asarray(topics)
    is_clustered = topic_array != -1
    clustered_labels = topic_array[is_clustered]

    # Both scores need at least two clusters; bail out instead of raising mid-way through a parameter sweep
    if np.unique(clustered_labels).size < 2:
        print('--> silhouette score: n/a / calinski_harabasz score: n/a (fewer than two non-outlier topics)')
        return None, None

    umap_embeddings = topic_model.umap_model.transform(embeddings)
    s = silhouette_score(umap_embeddings[is_clustered], clustered_labels)
    
    # NOTE(review): unlike the silhouette score, this is computed on the unreduced embeddings
    # and with the outlier label (-1) treated as a cluster of its own — confirm this is intended
    c = calinski_harabasz_score(embeddings, topics)

    print(f'--> silhouette score: {s} / calinski_harabasz score: {c}')
    return s, c

def test_sil_cal(embeddings, texts, name=None, ranges=(10, 30), step=5):
    """Iteratively test topic clustering with Silhouette and Calinski-Harabasz Index scores 
        with range of minimum cluster size 
        (https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#min_cluster_size)

        :params: embeddings: iterable of precomputed document embeddings (one vector per text)
        :params: texts: documents, in the same order as embeddings
        :params: name: optional label printed before the test run
        :params: ranges: (start, stop) of min_cluster_size values to test, stop is exclusive
        :params: step: increment between tested min_cluster_size values
        """

    embeddings = np.array(list(embeddings))
    if name is not None:
        print(f'Testing {name}')
        
    for min_cluster_size in range(*ranges, step):
        print(f'min_cluster_size {min_cluster_size}', end=' ')
        # A fresh model is fitted for every candidate cluster size
        topic_model = get_model(min_cluster_size)
        topics, probs = topic_model.fit_transform(texts, embeddings=embeddings)
        sil_and_calinski(topic_model, topics, probs, embeddings=embeddings, name=name)
        

def get_model(n, auto_reduce_topics=False, spacy_rep=False, mmr=False):
    """Main function for creating BERTopic model.
    :params: n: HDBSCAN min_cluster_size
    :params: auto_reduce_topics: Bool, if True the model is created with nr_topics='auto'
    :params: spacy_rep: Bool, if True a spaCy Part-of-Speech representation ('POS') is added
    :params: mmr: Bool, if True a Maximal Marginal Relevance representation ('MMR') is added
    :returns: configured BERTopic instance (fitting is left to the caller)"""
    # Embeddings (the sentence transformer is constructed on every call)
    embeddings_model = SentenceTransformer('NYTK/sentence-transformers-experimental-hubert-hungarian')
    # Preventing Stochastic Behaviour
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    # Controlling Number of Topics
    hdbscan_model = HDBSCAN(min_cluster_size=n, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    # Improving Default Representation
    # Explicit encoding so accented Hungarian stop words are read identically on every platform
    # (assumes the file is UTF-8 — TODO confirm); blank lines are skipped so '' is never a stop word
    with open('hu-stopwords.txt', encoding='utf-8') as fh:
        hu_stopwords = [word for word in (line.strip() for line in fh) if word]
    vectorizer_model = CountVectorizer(stop_words=hu_stopwords, min_df=2, ngram_range=(1, 3))
    # Frequent words remover
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    # Additional Representations
    representation_model = {'KeyBERT': KeyBERTInspired()}
    
    if spacy_rep:
        # Part-of-Speech
        spacy_model = spacy.load('hu_core_news_lg')
        representation_model['POS'] = PartOfSpeech(spacy_model)
    if mmr:
        # MMR
        representation_model['MMR'] = MaximalMarginalRelevance(diversity=0.3)

    bertopic_args = {
        # Pipeline models
        'embedding_model': embeddings_model,
        'umap_model': umap_model,
        'hdbscan_model': hdbscan_model,
        'vectorizer_model': vectorizer_model,
        'ctfidf_model': ctfidf_model,
        'representation_model': representation_model,
        # Hyperparameters
        'top_n_words': 10,
        'verbose': True,
        # Probs
        'calculate_probabilities': True,
    }

    if auto_reduce_topics:
        bertopic_args['nr_topics'] = 'auto'
        
    # Model creation
    tm = BERTopic(**bertopic_args)
    print('Delivering model')
    return tm


def draw_network(Graph, cluster_name, scale=1, seed=None, k=None, iterations=50,
                 save=None, alpha=0.25, font_size=5, with_labels=True):
    """Main function for drawing network from networkX Graph object.
    :params: Graph: NetworkX Graph object with calculated attributes that include cluster_name.
    :params: cluster_name: Chosen attribute for coloring nodes
    :params: scale: scale argument for spring layout
    :params: seed: Seed number for spring layout
    :params: k: Spring layout optimal distance between nodes
    :params: iterations: Spring layout maximum number of iterations taken
    :params: save: str or None, if str, graph png is saved with str as name
    :params: alpha: Opacity of network edges
    :params: font_size: Font size of node labels
    :params: with_labels: Bool, if False node labels are omitted"""
    
    # Rename nodes to their topic labels so that the label becomes the node identity
    Graph = nx.relabel_nodes(Graph, nx.get_node_attributes(Graph, "topic_label"))
    
    # Spring layout positions for all nodes - seed for reproducibility
    pos = nx.spring_layout(Graph, seed=seed, scale=scale, k=k, iterations=iterations)
        
    node_sizes = list(nx.get_node_attributes(Graph, 'size').values())
    node_colors = list(nx.get_node_attributes(Graph, cluster_name).values())
    nx.draw_networkx_nodes(Graph, pos, node_color=node_colors, node_size=node_sizes)
    
    nx.draw_networkx_edges(
        Graph, pos,
        edgelist=Graph.edges(),
        width=[weight for _, _, weight in Graph.edges(data='weight')],
        alpha=alpha,
        edge_color="b",
        style="dashed")
    
    ax = plt.gca()
    ax.margins(0.1)
    plt.axis("off")
    plt.tight_layout()

    # Node labels are drawn exactly once and only when requested. Previously they were drawn
    # twice (a fixed copy plus an adjusted copy) and the adjusted copy ignored with_labels.
    if with_labels:
        texts = [
            plt.text(x, y, node, fontsize=font_size, ha='center', va='center')
            for node, (x, y) in pos.items()
        ]
        # Adjust the text labels to prevent overlap
        adjust_text(texts, only_move={'points': 'xy', 'texts': 'xy'}, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

    if save is not None:
        plt.savefig(f'output_graphs/{save}', dpi=200)
    else:
        plt.show()

# The following are community detection functions. Each returns a tuple of (a node ID:color dict,
#  a list of communities, where every community is a collection of the node IDs it contains)

def get_louvain(nxGraph):
    """Louvain community detection on the weighted network.
    https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.louvain.louvain_communities.html

    :params: nxGraph: NetworkX Graph with a 'weight' edge attribute
    :returns: tuple of (node ID:color dict, list of sets of node IDs, one set per community)"""
    louvain = nx.community.louvain_communities(nxGraph, weight='weight', resolution=1)

    # Distinct colors per community; the palette wraps around instead of raising IndexError
    # when more communities than colors are found
    colors = ['lightblue', 'red', 'orange', 'gray', 'purple', 'pink', 'yellow', 'lightgreen']
    louvain_colors = {
        top: colors[c_num % len(colors)]
        for c_num, com in enumerate(louvain)
        for top in com
    }
    return louvain_colors, louvain

def get_walktrap(nxGraph):
    """Walktrap community detection on the weighted network.
    https://igraph.org/python/doc/api/igraph._igraph.GraphBase.html#community_walktrap

    :params: nxGraph: NetworkX Graph with a 'weight' edge attribute
    :returns: tuple of (node ID:color dict, list of lists of node IDs, one list per community)"""
    # convert to igraph
    iG = ig.Graph.from_networkx(nxGraph)
    
    walktrap = iG.community_walktrap(weights='weight')
    communities = walktrap.as_clustering()

    # The clustering yields igraph vertex indices; translate them back to the original
    # NetworkX node IDs, which differ from the indices once nodes have been removed
    nx_names = [vertex['_nx_name'] for vertex in iG.vs]
    named_communities = [[nx_names[index] for index in com] for com in communities]
    
    # Distinct colors per community; the palette wraps around instead of raising IndexError
    colors = ['lightblue', 'red', 'orange', 'gray', 'purple', 'pink', 'yellow', 'lightgreen']
    walktrap_colors = {
        top: colors[c_num % len(colors)]
        for c_num, com in enumerate(named_communities)
        for top in com
    }

    return walktrap_colors, named_communities
    
def get_spinglass(nxGraph):
    """Spinglass community detection on the weighted network.
    https://igraph.org/python/doc/api/igraph._igraph.GraphBase.html#community_spinglass

    :params: nxGraph: NetworkX Graph with a 'weight' edge attribute
    :returns: tuple of (node ID:color dict, list of lists of node IDs, one list per community)"""
    # convert to igraph
    iG = ig.Graph.from_networkx(nxGraph)
    spinglass = iG.community_spinglass(weights='weight')
    # Map the original NetworkX node IDs to their community number
    spinglass_coms = dict(zip([i['_nx_name'] for i in iG.vs], spinglass.membership))
    
    # Distinct colors per community; the palette wraps around instead of raising IndexError
    colors = ['lightblue', 'red', 'orange', 'gray', 'purple', 'pink', 'yellow', 'lightgreen',
              'darkgreen', 'brown', 'darkblue']
    spinglass_colors = {
        top: colors[community % len(colors)]
        for top, community in spinglass_coms.items()
    }
        
    return spinglass_colors, [[i['_nx_name'] for i in g.vs] for g in spinglass.subgraphs()]
    
def get_fast_greedy(nxGraph):
    """Fast greedy community detection on the weighted network.
    https://igraph.org/python/doc/api/igraph._igraph.GraphBase.html#community_fastgreedy

    :params: nxGraph: NetworkX Graph with a 'weight' edge attribute
    :returns: tuple of (node ID:color dict, list of lists of node IDs, one list per community)"""
    iG = ig.Graph.from_networkx(nxGraph)

    fast_greedy = iG.community_fastgreedy(weights='weight')
    fast_greedy_communities = fast_greedy.as_clustering()

    # The clustering yields igraph vertex indices; translate them back to the original
    # NetworkX node IDs, which differ from the indices once nodes have been removed
    nx_names = [vertex['_nx_name'] for vertex in iG.vs]
    named_communities = [[nx_names[index] for index in com] for com in fast_greedy_communities]
    
    # Distinct colors per community; the palette wraps around instead of raising IndexError
    colors = ['lightblue', 'red', 'orange', 'gray', 'purple', 'pink', 'yellow', 'lightgreen']
    fast_greedy_colors = {
        top: colors[c_num % len(colors)]
        for c_num, com in enumerate(named_communities)
        for top in com
    }

    return fast_greedy_colors, named_communities

def get_eigenvector(nxGraph):
    """Leading eigenvector community detection on the weighted network.
    https://igraph.org/python/doc/api/igraph._igraph.GraphBase.html#community_leading_eigenvector

    :params: nxGraph: NetworkX Graph with a 'weight' edge attribute
    :returns: tuple of (node ID:color dict, list of lists of node IDs, one list per community)"""
    iG = ig.Graph.from_networkx(nxGraph)
    leading_eigenvector = iG.community_leading_eigenvector(weights='weight')
    # Map the original NetworkX node IDs to their community number
    leading_eigenvector_communities = dict(zip([i['_nx_name'] for i in iG.vs], leading_eigenvector.membership))
    
    # Distinct colors per community; the palette wraps around instead of raising IndexError
    colors = ['lightblue', 'red', 'orange', 'gray', 'purple', 'pink', 'yellow', 'lightgreen']
    leading_eigenvector_colors = {
        top: colors[community % len(colors)]
        for top, community in leading_eigenvector_communities.items()
    }
    return leading_eigenvector_colors, [[i['_nx_name'] for i in g.vs] for g in leading_eigenvector.subgraphs()]

# Import data
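Minimum data requirements: the news data must contain a column for identifying the article (e.g. unique identifier ID) and a column for the text itself. The code below expects these columns to be named `id` and `text`; adjust the column names in the cells if yours differ.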

In [145]:
# Load the news corpus (replace the placeholder path); later cells read its 'id' and 'text' columns
df = pd.read_csv('/path/to/your/data')

# Calculate embeddings
In the following section two methods are introduced for calculating embeddings: 
1) Whole text embedding (where either the first n tokens of the text are considered according to the sentence transformer model, or shorter texts can be used such as lead paragraphs)
2) Sentence based embeddings (where later averaging of embeddings is required)  

## Embedding model
The embeddings are calculated with the Hungarian Experimental Sentence-BERT model created by the Hungarian Research Centre for Linguistics:
https://huggingface.co/NYTK/sentence-transformers-experimental-hubert-hungarian

In [57]:
# Load the sentence transformer that encodes both the whole texts and the single sentences below
sentence_transformer_model = SentenceTransformer('NYTK/sentence-transformers-experimental-hubert-hungarian')

## 1. Whole text embedding
As the SentenceTransformers embedding library was created for short texts, only lead paragraphs are advised as whole texts, or even just titles.

In [146]:
# Work on a copy so that the embedding column added below does not end up in the original df
whole_df = df.copy()

In [None]:
# Encode every text as one embedding vector
whole_text_embeddings = sentence_transformer_model.encode(whole_df['text'].to_list(), show_progress_bar=True)  # Use whatever the text column name is for 'text'
# Stored as a list so that each dataframe cell holds the full vector of its row
whole_df['embedding'] = list(whole_text_embeddings)  # Append embeddings column to dataframe

## 2. Sentence based embeddings 

In [None]:
# Download spacy model for sentence splitting
# Any spaCy model can be used here; this notebook uses hu_core_news_lg https://huggingface.co/huspacy/hu_core_news_lg
!pip install https://huggingface.co/huspacy/hu_core_news_lg/resolve/main/hu_core_news_lg-any-py3-none-any.whl

In [139]:
# Initialize spacy model for sentence splitting
nlp = spacy.load('hu_core_news_lg')  
# Add the "sentencizer" component to the loaded pipeline
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x71042e376dd0>

In [None]:
# Split every article into sentences and remember which article each sentence belongs to
sentence_ids = []
sentence_texts = []

# nlp.pipe processes the texts as a stream instead of calling nlp() once per dataframe row
for identifier, text_proc in zip(df['id'], nlp.pipe(df['text'])):
    # Sentences are numbered from 1 within each article
    for sentence_number, sentence in enumerate(text_proc.sents, start=1):
        sentence_ids.append((identifier, sentence_number))
        sentence_texts.append(sentence.text)

# Calculate sentence embeddings
sentence_embeddings = list(sentence_transformer_model.encode(sentence_texts, show_progress_bar=True))
# Create multi-index of article id and sentence number (the second level is named 'par_num')
sentence_embeddings_index = pd.MultiIndex.from_tuples(sentence_ids, names=['id', 'par_num'])
# Create dataframe where multi-index is article id and sentence number, and columns are sentences and sentence embeddings
sent_df = pd.DataFrame({'text': sentence_texts, 'embedding': sentence_embeddings}, index=sentence_embeddings_index)

# Get topics
repeat steps for 1) whole text and 2) sentence based embeddings
- Run Silhouette and Calinski-Harabasz tests
- Run get_model to create Topic model + use BERTopic visualizations to choose right HDBSCAN min_cluster_size 

## 1) Whole text

In [None]:
# Run min_cluster_size tests
# --> A high silhouette score is good; also consider a higher Calinski-Harabasz Index, but rely more on the silhouette score
test_sil_cal(whole_df['embedding'], whole_df['text'], ranges=(10, 25))  

In [None]:
# Get the model for the best min_cluster_size and inspect BERTopic's topic distribution visualization.
# The author usually chose the min_cluster_size for which the visualization seemed most convincing
min_cluster_size = 15 # change this around to see, 15 usually works best
model = get_model(min_cluster_size)
# Fit on the precomputed whole-text embeddings; whole_probs is used later to build the topic network
whole_topics, whole_probs = model.fit_transform(whole_df['text'], embeddings=np.array(list(whole_df['embedding'])))
model.visualize_topics()

## 2) Sentence based embeddings

In [None]:
# Run min_cluster_size tests
# --> A high silhouette score is good; also consider a higher Calinski-Harabasz Index, but rely more on the silhouette score
test_sil_cal(sent_df['embedding'], sent_df['text'], ranges=(10, 25))  

In [None]:
# Get the best scoring model and inspect BERTopic's topic distribution visualization.
# The author usually chose the min_cluster_size for which the visualization seemed most convincing
min_cluster_size = 15 # change this around to see, 15 usually works best
# NOTE: this rebinds `model`, replacing the whole-text model created in the earlier cell
model = get_model(min_cluster_size)
# Fit on the precomputed sentence embeddings; sent_probs is used later to build the topic network
sent_topics, sent_probs = model.fit_transform(sent_df['text'], embeddings=np.array(list(sent_df['embedding'])))
model.visualize_topics()

# Annotate topics with topic names
In this step you must create a topic_number:topic_title dictionary for the topics you generated. (BERTopic now has OpenAI integration, so you can also generate topic labels with GPT.) Here you can filter out or even join together topics if they seem either redundant or unintelligible. 

In [None]:
# topic number -> topic title, filled in by hand for each of the two models.
# The dictionaries are attached to the network nodes as the 'topic_label' attribute below;
# in the sentence based network, nodes whose label is False are removed.
whole_lables = {}  # 1: 'marriage', 2: 'children', 3: 'housing', etc...
sent_labels = {}

# Create networks based on pairwise probability scores
Repeat for 1) whole text and 2) sentence based  

## 1) Whole text

In [160]:
# Create theta matrix from the probabilities returned by fit_transform
# (presumably one row per document and one column per topic — verify against BERTopic's output)
whole_probs_df = pd.DataFrame(whole_probs)

In [167]:
# Create network: one node per theta matrix column, edges weighted by the cosine similarity between columns
wG = nx.from_numpy_array(cosine_similarity(pd.DataFrame(whole_probs_df).transpose()), edge_attr='weight')
# Node size: min-max normalized column sum of the theta matrix, scaled by 600
node_sizes = {k: v for k, v in zip(whole_probs_df.columns, [i*600 for i in normalize_node_size(whole_probs_df.sum())])}
nx.set_node_attributes(wG, node_sizes, name='size')
# Drop the self-similarity edges (a node compared with itself)
wG.remove_edges_from(nx.selfloop_edges(wG))
nx.set_node_attributes(wG, whole_lables, name='topic_label')

In [168]:
# Calculate clusters (second from tuple only for analysis)
w_louvain_colors, w_louvain = get_louvain(wG)
w_walktrap_colors, w_walktrap = get_walktrap(wG)
w_eigenvector_colors, w_eigenvector = get_eigenvector(wG)
w_fast_greedy_colors, w_fast_greedy = get_fast_greedy(wG)
w_spinglass_colors, w_spinglass = get_spinglass(wG)

# Store each algorithm's node colors under its own attribute name;
# draw_network picks one of them through its cluster_name argument
nx.set_node_attributes(wG, w_louvain_colors, name='louvain')
nx.set_node_attributes(wG, w_walktrap_colors, name='walktrap')
nx.set_node_attributes(wG, w_eigenvector_colors, name='eigenvector')
nx.set_node_attributes(wG, w_fast_greedy_colors, name='fast_greedy')
nx.set_node_attributes(wG, w_spinglass_colors, name='spinglass')

In [None]:
# draw_network(wG, 'louvain')  # Change 'louvain' for other names of clusters

## 2) Sentence based

In [80]:
# Create theta matrix
sent_topic_df = sent_df.copy()
sent_topic_df['probs'] = list(sent_probs)
# Aggregate the sentence rows back to one row per article (first index level).
# Here exchange _np_max for np_mean if you want to mean the sentence probabilities instead of taking the max probability
sent_topic_df_gp = sent_topic_df.groupby(sent_topic_df.index.get_level_values(0)).agg({'probs': _np_max, 'text': ' '.join})
sent_probs_df = pd.DataFrame(sent_topic_df_gp['probs'].to_list())

In [169]:
# Create network: one node per theta matrix column, edges weighted by the cosine similarity between columns
sG = nx.from_numpy_array(cosine_similarity(pd.DataFrame(sent_probs_df).transpose()), edge_attr='weight')
size_cons = 300
# Node size: min-max normalized column sum of the theta matrix, scaled by size_cons
node_sizes = {k: v for k, v in zip(sent_probs_df.columns, [i*size_cons for i in normalize_node_size(sent_probs_df.sum())])}
nx.set_node_attributes(sG, node_sizes, name='size')
# Drop the self-similarity edges (a node compared with itself)
sG.remove_edges_from(nx.selfloop_edges(sG))
nx.set_node_attributes(sG, sent_labels, name='topic_label')
# Remove topics explicitly labelled False. Nodes without any label yield None here and are kept,
# instead of raising KeyError as a direct ['topic_label'] lookup would
sG.remove_nodes_from([node for node, label in sG.nodes(data='topic_label') if label is False])

In [170]:
# Calculate clusters (second from tuple only for analysis)
s_louvain_colors, s_louvain = get_louvain(sG)
# Fixed: this previously called get_walktrap(G) with an undefined name and bound the result to `walktrap`
s_walktrap_colors, s_walktrap = get_walktrap(sG)
s_eigenvector_colors, s_eigenvector = get_eigenvector(sG)
s_fast_greedy_colors, s_fast_greedy = get_fast_greedy(sG)
s_spinglass_colors, s_spinglass = get_spinglass(sG)

# Store each algorithm's node colors under its own attribute name;
# draw_network picks one of them through its cluster_name argument
nx.set_node_attributes(sG, s_louvain_colors, name='louvain')
nx.set_node_attributes(sG, s_walktrap_colors, name='walktrap')
nx.set_node_attributes(sG, s_eigenvector_colors, name='eigenvector')
nx.set_node_attributes(sG, s_fast_greedy_colors, name='fast_greedy')
nx.set_node_attributes(sG, s_spinglass_colors, name='spinglass')

In [174]:
# draw_network(sG, 'louvain')  # Change 'louvain' for other names of clusters