In [12]:
import pandas as pd
import networkx as nx
import ast
import numpy as np
from itertools import combinations
from collections import Counter
from networkx.algorithms import community
import matplotlib.pyplot as plt

# Define similar topics
similar_topics = { "large-language-models",
    "large-language-model",
    "llm",
    "llms",
    "foundation-model",
    "foundation-models",
    "generative-ai",
    "language-model",
    "llm-inference",
    "multimodal-large-language-models",
    'gpt',
    'llama',
    'llama2',
    'openai',
    'chatgpt',
    'generative-ai',
    'multimodal',
    'gpt-4',
    'retrieval-augmented-generation',
    'chain-of-thought',
    'prompt-engineering',
    'langchain',
    'llm-agent',
    'ai'}

# Load repository data
df_repos = pd.read_csv("temp/llm_kb_github.csv")[0:10]

# Create an undirected graph
G = nx.Graph()

# Add nodes with topics, URL, stars, and additional attributes
for idx, row in df_repos.iterrows():
    repo = row['nameWithOwner']
    topics = row['topics'].replace("[","").replace("]","").split(" ")
    url = f"https://github.com/{repo}"  # Repository URL
    stars = row['stars']  # Star count
    
    # Additional attributes
    additional_attrs = {}
    for attr in ['watchers', 'forks', 'primaryLanguage', 'isFork', 'isArchived', 'license']:
        if attr in row:
            additional_attrs[attr] = row[attr]
    
    # Handle different topic formats
    if isinstance(topics, str) and topics.startswith("[") and topics.endswith("]"):
        topics = ast.literal_eval(topics)  # Convert string list to actual list
        print(topics)
    elif isinstance(topics, str):
        topics = [topics]  # Convert single topic string to list
    elif isinstance(topics, np.ndarray):
        topics = topics.tolist()  # Convert numpy array to list
    else:
        # Handle any other array-like format
        topics = list(topics)

    topics_cleaned = {t.strip().lower() for t in topics}  # Clean and deduplicate topics

    print(topics_cleaned)
    # Add node with all attributes
    node_attrs = {
        'topics': ",".join(sorted(topics_cleaned)), 
        'url': url, 
        'stars': stars
    }
    # Add the additional attributes
    node_attrs.update(additional_attrs)
    G.add_node(repo, **node_attrs)

# Rest of the code remains the same
# Add edges based on shared topics
repos = list(G.nodes)
for repo1, repo2 in combinations(repos, 2):
    topics1 = set(G.nodes[repo1]['topics'].split(","))
    topics2 = set(G.nodes[repo2]['topics'].split(","))

    shared_topics = topics1 & topics2
    relevant_shared_topics = shared_topics & similar_topics
    non_relevant_shared_topics = shared_topics - similar_topics

    if (len(relevant_shared_topics) >= 2) and (len(non_relevant_shared_topics) >= 2):
        G.add_edge(repo1, repo2, shared_topics=",".join(sorted(non_relevant_shared_topics)))

# Perform Louvain community detection (clusters repos by non-similar topics)
louvain_communities = community.louvain_communities(G, weight=None, resolution=1.0)
top_clusters = sorted(louvain_communities, key=len, reverse=True)[:15]  # Select top 15 largest clusters

# Extract dominant topics and assign clusters
cluster_topics = {}

for i, cluster in enumerate(top_clusters):
    topic_counter = Counter()
    
    for repo in cluster:
        non_similar_topics = set(G.nodes[repo]['topics'].split(",")) - similar_topics
        topic_counter.update(non_similar_topics)

    # Get the most common topic in this cluster (dominant topic)
    dominant_topic = topic_counter.most_common(1)[0][0] if topic_counter else "Unknown"
    cluster_topics[i] = dominant_topic  

    # Assign dominant topic and cluster ID to each node
    for repo in cluster:
        G.nodes[repo]["category"] = dominant_topic
        G.nodes[repo]["cluster_id"] = i  

# Generate distinct colors for clusters
num_clusters = len(top_clusters)
colors = plt.cm.get_cmap("tab20", num_clusters)

# Assign colors to clusters
cluster_colors = {
    i: f"#{int(colors(i)[0]*255):02x}{int(colors(i)[1]*255):02x}{int(colors(i)[2]*255):02x}" 
    for i in range(num_clusters)
}

# Assign color attribute to each node
for i, cluster in enumerate(top_clusters):
    for repo in cluster:
        G.nodes[repo]["color"] = cluster_colors[i]

# Print Cluster Topics
print("\n=== Dominant Topic for Each of the Top 15 Clusters ===")
for cluster_id, topic in cluster_topics.items():
    print(f"Cluster {cluster_id}: {topic}")

# Write the graph to a GEXF file
output_file = "temp/llm_clusters_github.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file} with cluster, category, color, URL, stars, watchers, forks, primaryLanguage, isFork, isArchived, and license attributes.")

{"'artificial-intelligence'", "'autonomous-agents'", "'gpt-4'", "'ai'", "'python'", "'openai'"}
{"'bert'", "'natural-language-processing'", "'transformer'", "'nlp'", "'tensorflow'", "'pytorch-transformers'", "'pytorch'", "'nlp-library'", "'language-model'", "'language-models'"}
{"'chatgpt'", "'language'", "'chatbot'", "'chatgpt-api'", "'bots'"}
{"'ollama'", "'gemma'", "'golang'", "'mistral'", "'llama2'", "'go'", "'llama'", "'llms'", "'llama3'", "'llm'"}
{"'chatgpt'", "'react'", "'desktop'", "'tauri'", "'nextjs'", "'webui'", "'cross-platform'", "'tauri-app'", "'vercel'", "'gemini'"}
{"'ai-chat'", "'llm-inference'"}
{"'llama'", "'ggml'"}
{"'chatgpt'", "'gpt-4'", "'large-language-models'", "'academic'", "'chatglm-6b'"}
{"'gpt'", "'llms'", "'chatgpt'", "'generativeai'", "'ai'", "'generative-ai'", "'azure'", "'openai'", "'prompt-engineering'", "'dall-e'"}
{"'gpt'", "'chatgpt'", "'chatgpt4'", "'gpt-4'", "'gpt4-api'", "'gpt4'", "'gpt-3'", "'openai'", "'gpt3'", "'chatgpt-4'"}

=== Dominant Top

  colors = plt.cm.get_cmap("tab20", num_clusters)
