# Clustering co-authors

When looking at a specific author, we want to understand that author's research streams. We can do that by looking at their co-authors and clustering them. Our recommender system learned author embeddings that incorporate both each author's publication context and the authors they collaborated with, so we can use those embeddings for clustering. We do not know how many research streams an author has, but we assume that authors from the same research stream lie close together in the embedding space. Hence we use HDBSCAN, a density-based clustering algorithm that builds a hierarchy of clusters and does not require the number of clusters to be specified in advance.

**Key takeaways:** 


### Imports & Global Variables

In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath(".."))

import hdbscan
import numpy as np
import pandas as pd
import plotly.express as px
import ipywidgets as widgets

from box import Box
from IPython.display import display
from sklearn.manifold import TSNE

from util.postgres import create_sqlalchemy_connection, query

In [2]:
# -------------------- GLOBAL VARIABLES --------------------
# Relative path to the YAML configuration file
PATH_TO_CONFIG_FILE = '../config.yaml'

# -------------------- LOAD CONFIGURATION --------------------
# Load the configuration file
config = Box.from_yaml(filename=PATH_TO_CONFIG_FILE)
# Create the PostgreSQL connection (via the project's SQLAlchemy helper);
# all credentials and the target schema come from the POSTGRES config section
pg_connection = create_sqlalchemy_connection(
    username=config.POSTGRES.USERNAME,
    password=config.POSTGRES.PASSWORD,
    host=config.POSTGRES.HOST,
    port=config.POSTGRES.PORT,
    database=config.POSTGRES.DATABASE,
    schema=config.POSTGRES.SCHEMA
)
# Seed NumPy's global random number generator with the configured seed
np.random.seed(config.RANDOM_SEED)

## Data preparation

In [3]:
# Get the authors
# Selects authors with more than 10 distinct articles among the collaboration
# rows of institution 'UNI_LJ', ordered from most to fewest articles.
# A plain string (no f-prefix) is used because nothing is interpolated.
author_query = """
SELECT a.author_name AS author,
       COUNT(DISTINCT article_id)                    AS article_count,
       a.author_id
FROM fct_collaboration c
         INNER JOIN dim_author a
                    ON c.author_id = a.author_id
WHERE c.institution_id = 'UNI_LJ'
GROUP BY author, a.author_id
HAVING COUNT(DISTINCT article_id) > 10
ORDER BY article_count DESC
"""

author_df = query(conn=pg_connection, query_str=author_query)

In [4]:
# Prepare an ipywidget filter
# Construct a list of (author_label, author_id_value) tuples.
# Zipping the two columns avoids materialising a Series per row, which is
# what iterrows() does.
author_options = list(zip(author_df['author'], author_df['author_id']))

# The dropdown needs a default selection and every later cell depends on it,
# so fail here with a clear message instead of an IndexError below.
if not author_options:
    raise ValueError(
        'The author query returned no rows; cannot populate the author dropdown.'
    )

author_dropdown = widgets.Dropdown(
    options=author_options,             # Display = 'author', Value = 'author_id'
    value=author_options[0][1],         # Set default selection to the first item's author_id
    description='Author:',
    style={'description_width': 'initial'}
)

display(author_dropdown)

Dropdown(description='Author:', options=(('Sandi Klavzar', '7004427576'), ('Tadej Battelino', '8726399700'), (…

In [5]:
# Get co-authors
# A co-author is any other author who shares at least one article with the
# author currently selected in the dropdown. Because of the inner joins, only
# co-authors that have a row in author_embedding are returned.
# The dropdown value is read when this cell runs, so the cell (and everything
# below it) must be re-run after picking a different author.
# NOTE(review): the selected author_id is interpolated directly into the SQL
# text. The value originates from dim_author via the dropdown options, but a
# bound parameter would be more robust if `query` supports one — TODO confirm.
co_author_embedding_query = f"""
WITH co_authors AS (SELECT DISTINCT c2.author_id
                    FROM fct_collaboration c1
                             INNER JOIN fct_collaboration c2
                                        ON c1.article_id = c2.article_id
                                            AND c1.author_id <> c2.author_id
                    WHERE c1.author_id = '{author_dropdown.value}')
SELECT c.author_id,
       a.author_name,
       e.embedding_tensor_data::float8[] AS embedding_tensor_data
FROM co_authors c
         INNER JOIN dim_author a
                    ON a.author_id = c.author_id
         INNER JOIN author_embedding e
                    ON a.author_id = e.author_id
"""

co_author_embedding_df = query(conn=pg_connection, query_str=co_author_embedding_query)
# Preview the first rows as the cell output
co_author_embedding_df.head(5)

Unnamed: 0,author_id,author_name,embedding_tensor_data
0,6603385749,Bresar B.,"[-0.39506229758262634, -0.23991012573242188, -..."
1,21934391300,Hafner-Bratkovic I.,"[0.31950807571411133, -0.33037662506103516, 0...."
2,35616671500,Tuza Z.,"[-0.39623090624809265, -0.13684344291687012, 0..."
3,24081926600,Yero I.G.,"[-0.26588693261146545, -0.2626689374446869, 0...."
4,57204123766,Gledel V.,"[-0.40502962470054626, -0.30505064129829407, 0..."


## Clustering collaborations 

In this section we use the co-author embeddings to cluster the co-authors using the HDBSCAN algorithm.

In [6]:
# Convert the embedding column into a single NumPy array.
# Row order follows co_author_embedding_df, which lets results computed from X
# be assigned back to the DataFrame positionally.
# Assumes every embedding has the same length, so X is a 2-D
# (n_co_authors, embedding_dim) matrix — TODO confirm.
X = np.array(co_author_embedding_df['embedding_tensor_data'].tolist())

In [7]:
# Clustering
def cluster(min_cluster_size_value: int, min_samples_value: int) -> None:
    """
    Cluster co-authors using HDBSCAN and compute a 2D t-SNE projection.

    Reads the global embedding matrix ``X`` and updates the global
    ``co_author_embedding_df`` in place, adding (or overwriting) the columns
    ``cluster``, ``t-SNE x`` and ``t-SNE y``. Nothing is returned.

    Args:
        min_cluster_size_value: Passed to HDBSCAN as ``min_cluster_size``.
        min_samples_value: Passed to HDBSCAN as ``min_samples``.
    """
    # HDBSCAN clustering
    hdb = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size_value,
        min_samples=min_samples_value,
        gen_min_span_tree=True
    )
    labels = hdb.fit_predict(X)

    # Assign cluster labels back to the DataFrame
    # (HDBSCAN uses the label -1 for points it treats as noise)
    co_author_embedding_df['cluster'] = labels

    # scikit-learn requires perplexity < n_samples, so cap the default of 30;
    # otherwise t-SNE raises for authors with 30 or fewer co-authors.
    n_samples = len(X)
    perplexity = min(30, n_samples - 1)

    # t-SNE dimensionality reduction (2D)
    # NOTE: the projection does not depend on the HDBSCAN parameters, and the
    # fixed random_state keeps the layout stable between slider changes.
    tsne = TSNE(
        n_components=2,
        random_state=42,
        perplexity=perplexity,
        max_iter=1000,
        learning_rate='auto'
    )
    tsne_result = tsne.fit_transform(X)

    # Store TSNE components in the DataFrame
    co_author_embedding_df['t-SNE x'] = tsne_result[:, 0]
    co_author_embedding_df['t-SNE y'] = tsne_result[:, 1]

In [15]:
# Slider controlling the min_cluster_size value passed to the clustering
# (range 2..20, initial value 3)
min_cluster_size_slider = widgets.IntSlider(
    description='min_cluster_size',
    min=2,
    max=20,
    step=1,
    value=3
)

# Slider controlling the min_samples value passed to the clustering
# (range 1..20, initial value 2)
min_samples_slider = widgets.IntSlider(
    description='min_samples',
    min=1,
    max=20,
    step=1,
    value=2
)

# Output area that captures the rendered figure
out = widgets.Output()

def update_plot(_):
    """
    Callback function to re-run the clustering and update the plot when slider values change.

    The single argument (the widget's change notification, or None for the
    initial call) is ignored; the current slider values are read directly.
    """
    # Read each slider once so the clustering and the title use the same values
    min_cluster_size = min_cluster_size_slider.value
    min_samples = min_samples_slider.value

    with out:
        out.clear_output()

        # Perform HDBSCAN clustering
        cluster(
            min_cluster_size_value=min_cluster_size,
            min_samples_value=min_samples
        )

        # Cluster labels are categories, not magnitudes. Plotting them as
        # strings makes Plotly assign one discrete colour per cluster (with a
        # legend) instead of mapping the integers onto a continuous colour bar.
        plot_df = co_author_embedding_df.assign(
            cluster=co_author_embedding_df['cluster'].astype(str)
        )
        # Keep the legend in numeric label order rather than order of appearance
        cluster_order = [
            str(label)
            for label in sorted(co_author_embedding_df['cluster'].unique())
        ]

        # Create an interactive Plotly scatter plot
        fig = px.scatter(
            plot_df,
            x='t-SNE x',
            y='t-SNE y',
            color='cluster',
            hover_data=['author_id', 'author_name'],
            title=f'HDBSCAN (min_cluster_size={min_cluster_size}, '
                  f'min_samples={min_samples})',
            color_discrete_sequence=px.colors.qualitative.Prism,
            category_orders={'cluster': cluster_order},
            height=800
        )

        fig.show()

# Observe changes in the sliders
# names='value' restricts the callback to changes of the slider value itself
min_cluster_size_slider.observe(update_plot, names='value')
min_samples_slider.observe(update_plot, names='value')

# Display the sliders and the output
display(min_cluster_size_slider, min_samples_slider, out)

# Generate the initial plot
# update_plot ignores its argument, so None stands in for the change event
update_plot(None)


IntSlider(value=3, description='min_cluster_size', max=20, min=2)

IntSlider(value=2, description='min_samples', max=20, min=1)

Output()