### Extracting keywords for each paper

In [1]:
import pandas as pd

# Load the training table; the 'url' column is used as the row index.
df = pd.read_csv('../outputs/hyperbook_training.csv', index_col='url')

In [2]:
"""
Implementation using ChunkeyBERT

from transformers import AutoTokenizer
from langchain_text_splitters import TokenTextSplitter
from chunkey_bert.model import ChunkeyBert

tokenizer = AutoTokenizer.from_pretrained(model_name)
text_splitter = TokenTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=1024,chunk_overlap=128)
chunker = lambda text: text_splitter.split_text(text)

chunkey_bert = ChunkeyBert(keybert=keybert)
"""

'\nImplementation using ChunkeyBERT\n\nfrom transformers import AutoTokenizer\nfrom langchain_text_splitters import TokenTextSplitter\nfrom chunkey_bert.model import ChunkeyBert\n\ntokenizer = AutoTokenizer.from_pretrained(model_name)\ntext_splitter = TokenTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=1024,chunk_overlap=128)\nchunker = lambda text: text_splitter.split_text(text)\n\nchunkey_bert = ChunkeyBert(keybert=keybert)\n'

In [3]:
from tqdm import tqdm 
from keybert import KeyBERT

model_name = 'Qwen/Qwen3-Embedding-0.6B'

# The extraction is slow, so it is skipped whenever the loaded CSV already
# carries a 'keywords' column.
if 'keywords' not in df.columns:
    keybert = KeyBERT(model=model_name)

    # One comma-joined keyword string per document; scores are discarded.
    extracted_keywords = [
        ','.join(
            pair[0]
            for pair in keybert.extract_keywords(
                content,
                top_n=20,
                stop_words='english',
                nr_candidates=40,
            )
        )
        for content in tqdm(df['content'])
    ]

    # Write the new column back to the CSV right away, then reload the frame
    # from that file.
    df['keywords'] = extracted_keywords
    df.to_csv('../outputs/hyperbook_training.csv')
    df = pd.read_csv('../outputs/hyperbook_training.csv', index_col='url')

### Clustering similar keywords to model topics

In [None]:
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.preprocessing import normalize
from collections import defaultdict

# Sentence-embedding model, loaded from the checkpoint named by `model_name`.
embedding_model = SentenceTransformer(model_name)
# Shared clusterer instance; min_cluster_size=2 allows clusters of only two keywords.
clusterer = HDBSCAN(min_cluster_size=2, metric='euclidean', algorithm='best')

def cluster_keywords(keywords: list[str]) -> dict:
    """
    Groups keywords by semantic similarity.

    Every keyword is embedded with `embedding_model`, the embeddings are
    L2-normalised, and the module-level `clusterer` assigns a label to each.
    Returns a mapping from integer cluster id to the keywords carrying that
    label; keywords labelled -1 are left out.
    """
    vectors = normalize(
        embedding_model.encode(keywords, show_progress_bar=True),
        norm='l2',
    )
    labels = clusterer.fit_predict(vectors)

    cluster_map = defaultdict(list)
    for label, keyword in zip(labels, keywords):
        if label == -1:
            # -1 is the noise label: the keyword belongs to no cluster
            continue
        cluster_map[int(label)].append(keyword)

    print(f'Identified {len(cluster_map)} clusters')
    return cluster_map


# TODO: test different minimum numbers of members per cluster

In [5]:
from langchain_openai import AzureChatOpenAI

# Prompt template for naming a single cluster. It is filled in with
# str.format() using the `context` and `existing_keywords` fields.
NAMING_PROMPT = '\n\n'.join([
    ' '.join([
        'Suggest one scientific, technical keyword that best describes the context.',
        'The suggested keyword must not be in the set of existing keywords.',
        'Avoid all formatting and output strictly in plaintext and lowercase.',
    ]),
    'Context: {context}',
    'Existing keywords: {existing_keywords}',
    'Suggested keyword: ',
])

# Chat model used to propose a name for each cluster.
# NOTE(review): no endpoint or API key is passed here, so they presumably come
# from the environment — confirm.
naming_model = AzureChatOpenAI(azure_deployment='gpt-4o', api_version='2024-10-21')

def name_clusters(cluster_map: dict) -> dict:
    """
    Maps each cluster identifier to a semantically relevant, unique name.

    For every cluster, the member keywords are sent to `naming_model`
    together with the names assigned so far, and the stripped reply becomes
    the cluster name. When the call fails, or the reply is empty or repeats
    a name that is already taken, the user is asked to type a name instead
    (the cluster's keywords are shown as the input prompt).

    Args:
        cluster_map: cluster id -> list of keywords in that cluster.

    Returns:
        dict mapping every cluster id of `cluster_map` to its name.
    """
    cluster_names = {}
    # Mirrors cluster_names.values() so the duplicate check does not rebuild
    # a set on every iteration.
    used_names = set()

    for cluster_id, keywords in tqdm(cluster_map.items()):
        context = ', '.join(keywords)
        prompt = NAMING_PROMPT.format(
            context=context,
            existing_keywords=', '.join(cluster_names.values()),
        )

        try:
            # Strip surrounding whitespace/newlines so that the duplicate
            # check compares clean names.
            name = naming_model.invoke(prompt).content.strip()
        except Exception:
            # `Exception` instead of a bare except: a KeyboardInterrupt must
            # still be able to stop this long-running loop.
            name = ''

        if not name or name in used_names:
            # Manual fallback: the model failed, answered with nothing, or
            # proposed a name that is already in use.
            name = input(context)

        cluster_names[cluster_id] = name
        used_names.add(name)

    return cluster_names

In [6]:
def flatten_cluster_map(cluster_map: dict) -> tuple:
    """
    Unrolls a cluster map into two parallel lists.

    Returns (keywords, ids): all keywords of all clusters, in the map's
    iteration order, and at the same position the id of the cluster each
    keyword belongs to.
    """
    flat_keywords, flat_ids = [], []
    for cluster_id, group in cluster_map.items():
        flat_keywords.extend(group)
        # Repeat the id once per member so both lists stay aligned
        flat_ids.extend([cluster_id] * len(group))
    return flat_keywords, flat_ids

In [7]:
import umap.umap_ as umap
import plotly.express as px

# Shared UMAP reducer for the plots; random_state is pinned to 42.
reducer = umap.UMAP(n_neighbors=10, min_dist=0.1, metric='cosine', random_state=42)

def display_clusters(cluster_map: dict, cluster_names: dict):
    """
    Displays clusters and their member keywords as a 2-D scatter plot.

    The keywords are embedded with `embedding_model`, projected with
    `reducer`, and coloured by the name of the cluster they belong to.

    Args:
        cluster_map: cluster id -> list of keywords.
        cluster_names: cluster id -> display name; must have an entry for
            every id in `cluster_map` (a missing id raises KeyError).

    Returns:
        The figure returned by `px.scatter`.
    """
    flat_keywords, flat_ids = flatten_cluster_map(cluster_map)
    # Resolve names first so a missing id fails before the costly embedding step
    flat_names = [cluster_names[cluster_id] for cluster_id in flat_ids]

    embeddings = embedding_model.encode(flat_keywords)
    embeddings_2d = reducer.fit_transform(embeddings)

    coords = pd.DataFrame({
        'keyword': flat_keywords,
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1],
        'cluster': flat_names
    })

    return px.scatter(
        coords,
        x='x',
        y='y',
        # Refer to the column by name (not a raw list) so the legend and the
        # hover box are labelled 'cluster' instead of the generic 'color'.
        color='cluster',
        hover_data=['keyword'],
        title='Semantically clustered keywords'
    )

In [8]:
# Split every article's comma-separated keyword string back into a list.
# A missing value (pandas reads an empty CSV field back as NaN, a float)
# becomes an empty list instead of crashing on `.split`.
extracted_keywords = [
    group.split(',') if isinstance(group, str) else []
    for group in df['keywords']
]
# Flatten and remove duplicates. Sorted so that the keyword order, and with it
# the input to the clustering step, is identical on every run; the iteration
# order of a plain set of strings changes between interpreter sessions.
keywords = sorted({kw for group in extracted_keywords for kw in group})
titles = list(df['title'])

In [9]:
import json

import_data = True  # Change this manually

# Either recompute the clusters and their names, or reuse the cached JSON.
if not import_data:
    print('Mapping clusters')
    cluster_map = cluster_keywords(keywords)
    print('Naming clusters')
    cluster_names = name_clusters(cluster_map)
else:
    print('Importing cluster map data')
    with open('../outputs/cluster_data.json', 'r') as json_file:
        cluster_data = json.load(json_file)
    cluster_map, cluster_names = (
        cluster_data['cluster_map'],
        cluster_data['cluster_names'],
    )

Importing cluster map data


In [11]:
# NOTE(review): this names every cluster again unconditionally, so names loaded
# from cluster_data.json by the import cell are overwritten — confirm this is intended.
cluster_names = name_clusters(cluster_map)

100%|██████████| 274/274 [16:51<00:00,  3.69s/it]


In [13]:
# Number of clusters minus number of distinct names: zero means every
# cluster has its own name.
difference = len(cluster_map) - len(set(cluster_names.values()))

if difference != 0:
    print('There are multiplicates in your topics. The difference is', difference)
else:
    print(f'You have {len(cluster_map)} uniquely named topics')

You have 274 uniquely named topics


In [14]:
import json

# Save the data to avoid computing the whole thing again!
# NOTE(review): JSON object keys are always strings, so integer cluster ids
# written here come back as str when the file is imported — confirm that
# downstream code accepts both.
cluster_data = {'cluster_map': cluster_map, 'cluster_names': cluster_names}

with open('../outputs/cluster_data.json', 'w') as json_file:
    json.dump(cluster_data, json_file)

In [16]:
print('Generating display')
# Scatter plot of every clustered keyword, built from the current map and names
figure = display_clusters(cluster_map, cluster_names)
figure.update_layout(height=1000)
figure.show()

Generating display


In [41]:
import networkx as nx

def create_graph(cluster_map, cluster_names, titles, extracted_keywords):
    """
    Builds an undirected article-topic graph.

    Nodes:
        'cluster_<id>' for every entry of `cluster_names` (type='cluster',
        label '<name> (topic)').
        'article_<i>' for every title, numbered by position (type='article',
        label '<title> (article)').
    Edges:
        One edge between an article and a cluster when at least one of the
        article's keywords belongs to that cluster.

    Args:
        cluster_map: cluster id -> list of keywords.
        cluster_names: cluster id -> topic name.
        titles: article titles.
        extracted_keywords: one keyword list per article, paired with
            `titles` by position.

    Returns:
        The populated `nx.Graph`.
    """
    # Map each keyword to its cluster
    keyword_to_cluster = {
        keyword: cluster_id
        for cluster_id, keywords in cluster_map.items()
        for keyword in keywords
    }

    G = nx.Graph()

    # Add cluster nodes
    for cluster_id, cluster_name in cluster_names.items():
        G.add_node(f'cluster_{cluster_id}', label=cluster_name + ' (topic)', type='cluster')

    # Add article nodes
    for i, (title, keywords) in enumerate(zip(titles, extracted_keywords)):
        article_node = f'article_{i}'
        G.add_node(article_node, label=title + ' (article)', type='article')

        # Clusters already linked to this article, so each edge is added once
        added_clusters = set()

        for kw in keywords:
            if kw not in keyword_to_cluster:
                continue
            cluster_id = keyword_to_cluster[kw]
            # Membership is the only test here: a truthiness check on the id
            # would silently drop the cluster whose id is the integer 0.
            if cluster_id not in added_clusters:
                added_clusters.add(cluster_id)
                G.add_edge(article_node, f'cluster_{cluster_id}')

    return G

In [42]:
# Build the article-topic graph from the clusters, their names, and the per-article keyword lists
G = create_graph(cluster_map, cluster_names, titles, extracted_keywords)

In [43]:
import json

# Export the Hyperbook graph as plain node and edge records in one JSON file
nodes = []
for n in G.nodes():
    node_attrs = G.nodes[n]
    nodes.append({"id": n, "label": node_attrs['label'], "type": node_attrs['type']})

edges = []
for u, v in G.edges():
    edges.append({"from": u, "to": v})

with open('../outputs/hyperbook_graph.json', 'w') as f:
    json.dump({'nodes': nodes, 'edges': edges}, f)

In [44]:
from jinja2 import Environment, FileSystemLoader
import json

# Templates are loaded from the ../outputs directory
env = Environment(loader=FileSystemLoader('../outputs'))
template = env.get_template('/template_graph.html')

# Read the exported graph back from disk
with open('../outputs/hyperbook_graph.json') as f:
    graph_data = json.load(f)

# The template receives the whole graph under the name `data`
html_output = template.render(data=graph_data)

# Write the rendered page one directory above the notebook
with open('../index.html', 'w') as f:
    f.write(html_output)

In [36]:
# Print every distinct topic label once
for topic in {node['label'] for node in graph_data['nodes'] if node['type'] == 'cluster'}:
    print(topic)

osteogenesis
ferromagnetic
frustule
lipidomics
biolocomotion
pyrolysis
paraxonic
stratum
photoreception
topography
thermal conductivity
echolocation
osteoregeneration
tactile sensory
quorum sensing
oscillations
vasoreactivity
nanomechanics
cephalopod
cardiodynamics
autophagy
geometric morphometry
metabolomics
phycology
glycoprotein
otology
odontoblasts
micropaleontology
amyloid
xerophyte
gliogenesis
pulmonary
phagocytosis
psittacology
locomotion
phototrophy
magnetoreception
biorobotics
entomophily
hydrodynamics
metamorphism
galactopoiesis
ecdysteroid
angiogenesis
vocalization
speciation
paleoclimate
camouflage
photospectrometry
mycology
tessellation
biopolymer
pigmentation
ethology
crypsis
biomimetics
monotrematology
microscopy
fertilization
scleromechanics
neuroimaging
iridescence
wetting
olfactometry
biosonar
extrusome
ivory
invertebrate
chemoregulatory
tardigradology
optometry
toxicodynamics
biofabrication
gustation
keratinization
mixotrophy
proprioception
cladistics
ungulate
autoto

In [45]:

def analyze_cluster_names(keywords):
    """
    Runs the clustering pipeline on the cluster names themselves.

    The given names are grouped with `cluster_keywords`, each group gets
    the placeholder name 'cluster_<id>', and the figure produced by
    `display_clusters` for those groups is returned.
    """
    print('Mapping clusters')
    groups = cluster_keywords(keywords)

    print('Naming clusters')
    placeholder_names = {}
    for group_id in groups:
        placeholder_names[group_id] = f'cluster_{group_id}'

    print('Generating display')
    return display_clusters(groups, placeholder_names)

# Second-level view: cluster the topic names themselves and plot the result
figure = analyze_cluster_names(list(cluster_names.values()))
figure.update_layout(height=1000)
figure.show()

Mapping clusters


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Identified 25 clusters
Naming clusters
Generating display
