In [3]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations
from collections import Counter
from networkx.algorithms import community
import matplotlib.pyplot as plt  # For color mapping

# Define similar topics
similar_topics = { "large-language-models",
    "large-language-model",
    "llm",
    "llms",
    "foundation-model",
    "foundation-models",
    "generative-ai",
    "language-model",
    "llm-inference",
    "multimodal-large-language-models",
    'gpt',
    'llama',
    'llama2',
    'openai',
    'chatgpt',
    'generative-ai',
    'multimodal',
    'gpt-4',
    'retrieval-augmented-generation',
    'chain-of-thought',
    'prompt-engineering',
    'langchain',
    'llm-agent'}

# Load repository data
df_repos = pd.read_csv("temp/llm_kb.csv")

# Create an undirected graph
G = nx.Graph()

# Add nodes with topics, URL, and stars as attributes
for idx, row in df_repos.iterrows():
    repo = row['full_name']
    topics = row['topics']
    url = f"https://github.com/{repo}"  # Repository URL
    stars = row['stargazers_count']  # Star count

    if isinstance(topics, str) and topics.startswith("[") and topics.endswith("]"):
        topics = ast.literal_eval(topics)  # Convert string list to actual list
    elif isinstance(topics, str):
        topics = [topics]  # Convert single topic string to list

    topics_cleaned = {t.strip().lower() for t in topics}  # Clean and deduplicate topics
    G.add_node(repo, topics=",".join(sorted(topics_cleaned)), url=url, stars=stars)  # Store attributes

# Add edges based on shared topics
repos = list(G.nodes)
for repo1, repo2 in combinations(repos, 2):
    topics1 = set(G.nodes[repo1]['topics'].split(","))
    topics2 = set(G.nodes[repo2]['topics'].split(","))

    shared_topics = topics1 & topics2
    relevant_shared_topics = shared_topics & similar_topics
    non_relevant_shared_topics = shared_topics - similar_topics

    if (len(relevant_shared_topics) >= 1) and (len(non_relevant_shared_topics) >= 1):
        G.add_edge(repo1, repo2, shared_topics=",".join(sorted(non_relevant_shared_topics)))

# Perform Louvain community detection (clusters repos by non-similar topics)
louvain_communities = community.louvain_communities(G, weight=None, resolution=1.0)
top_clusters = sorted(louvain_communities, key=len, reverse=True)[:15]  # Select top 15 largest clusters

# Extract dominant topics and assign clusters
cluster_topics = {}

for i, cluster in enumerate(top_clusters):
    topic_counter = Counter()
    
    for repo in cluster:
        non_similar_topics = set(G.nodes[repo]['topics'].split(",")) - similar_topics
        topic_counter.update(non_similar_topics)

    # Get the most common topic in this cluster (dominant topic)
    dominant_topic = topic_counter.most_common(1)[0][0] if topic_counter else "Unknown"
    cluster_topics[i] = dominant_topic  

    # Assign dominant topic and cluster ID to each node
    for repo in cluster:
        G.nodes[repo]["category"] = dominant_topic
        G.nodes[repo]["cluster_id"] = i  

# Generate distinct colors for clusters
num_clusters = len(top_clusters)
colors = plt.cm.get_cmap("tab20", num_clusters)  # Use "tab15" colormap for 15 distinct colors

# Assign colors to clusters
cluster_colors = {
    i: f"#{int(colors(i)[0]*255):02x}{int(colors(i)[1]*255):02x}{int(colors(i)[2]*255):02x}" 
    for i in range(num_clusters)
}

# Assign color attribute to each node
for i, cluster in enumerate(top_clusters):
    for repo in cluster:
        G.nodes[repo]["color"] = cluster_colors[i]  # Assign cluster color

# Print Cluster Topics
print("\n=== Dominant Topic for Each of the Top 15 Clusters ===")
for cluster_id, topic in cluster_topics.items():
    print(f"Cluster {cluster_id}: {topic}")

# Write the graph to a GEXF file
output_file = "temp/llm_clusters.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file} with cluster, category, color, URL, and star count attributes.")

  colors = plt.cm.get_cmap("tab20", num_clusters)  # Use "tab15" colormap for 15 distinct colors



=== Dominant Topic for Each of the Top 15 Clusters ===
Cluster 0: nlp
Cluster 1: chatbot
Cluster 2: ai
Cluster 3: ai
Cluster 4: python
Cluster 5: mbr
Cluster 6: 3d-object-detection
Cluster 7: structured-generation
Cluster 8: obsidian
Cluster 9: nature-medicine
Cluster 10: commercial
Cluster 11: reactjs
Cluster 12: grasp-dataset
Cluster 13: model-diagnostics
Cluster 14: lattice-gauge-theory
GEXF file created: temp/llm_clusters.gexf with cluster, category, color, URL, and star count attributes.
