In [2]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations

similar_topics = [
    "flow-based-programming",
    "visual-programming",
    "visual-programming-editor",
    "visual-programming-language",
    "dataflow-programming",
    "blockly",
    "graph-editor",
    "node-editor"
]

df_repos = pd.read_csv("temp/tag_repos.csv")

# Create an empty undirected graph.
G = nx.Graph()

# Add nodes: Each repo becomes a node, and its topics are stored as a node attribute.
for idx, row in df_repos.iterrows():
    repo = row['repo_name']
    topics = row['topics']
    # Ensure topics are properly parsed from string representation of lists
    if isinstance(topics, str) and topics.startswith("[") and topics.endswith("]"):
        topics = ast.literal_eval(topics)  # Convert string list to actual list
    elif isinstance(topics, str):
        topics = [topics]  # Convert single topic string to list
    
    topics_cleaned = [t.strip().lower() for t in topics]  # Clean topics
    topics_str = ','.join(topics_cleaned)
    G.add_node(repo, topics=topics_str)

# Add edges: Check for shared tags between repos and add an edge with weight equal to shared topics count in similar_topics.
repos = list(G.nodes)
for repo1, repo2 in combinations(repos, 2):
    topics1 = set(G.nodes[repo1]['topics'].split(','))
    topics2 = set(G.nodes[repo2]['topics'].split(','))
    shared_topics = topics1 & topics2  # Ensure lowercase comparison
    relevant_shared_topics = shared_topics & set(similar_topics)
    non_relevant_shared_topics = shared_topics - set(similar_topics)
    if len(relevant_shared_topics)>=2 & len(non_relevant_shared_topics)>=1:
        print(repo1, repo2, non_relevant_shared_topics)
        G.add_edge(repo1, repo2)

# Write the graph to a GEXF file.
output_file = "temp/logic_repos.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file}")

gllms/Flue jerosoler/Drawflow {'javascript', 'flowchart'}
gllms/Flue sacode387/FlowRun {'executable-flowcharts', 'flowchart'}
nevalang/neva cross-platform/dspatcher {'reactive-programming', 'fbp'}
nevalang/neva hlang-tech/hlang {'flow-based', 'fbp-runtime'}
nevalang/neva cross-platform/dspatch {'reactive-programming', 'fbp'}
nevalang/neva cross-platform/dspatchables {'reactive-programming', 'fbp'}
nevalang/neva flydelabs/flyde {'reactive-programming', 'flow-based'}
nevalang/neva samuelmtimbo/unit {'reactive-programming', 'functional-programming', 'programming-language'}
nevalang/neva fbbdev/nodal {'dataflow-compiler', 'compiler'}
nevalang/neva ERnsTL/flowd {'fbp', 'fbp-runtime'}
BugelNiels/nitro ern0/dataflow-editor-concept {'editor', 'dataflow'}
BugelNiels/nitro cross-platform/dspatcher {'cpp', 'dataflow'}
BugelNiels/nitro houmain/QML-NodeGraph {'graph', 'node'}
BugelNiels/nitro Nicolas-Constanty/Dnai.Editor {'editor', 'dataflow', 'graph'}
BugelNiels/nitro cross-platform/dspatch {'cpp

In [1]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations
from collections import Counter, defaultdict

df_repos = pd.read_csv("temp/tag_repos.csv")

# Count topic frequencies
topic_counter = Counter()
topic_pairs = defaultdict(set)  # Dictionary to track topic pairs and the repos they appear in

for idx, row in df_repos.iterrows():
    topics = row['topics']
    if isinstance(topics, str) and topics.startswith("[") and topics.endswith("]"):
        topics = ast.literal_eval(topics)  # Convert string list to actual list
    elif isinstance(topics, str):
        topics = [topics]  # Convert single topic string to list
    
    topics_cleaned = {t.strip().lower() for t in topics}
    topic_counter.update(topics_cleaned)
    
    for topic1, topic2 in combinations(topics_cleaned, 2):
        topic_pairs[(topic1, topic2)].add(idx)  # Store the repo index where the pair appears

# Filter topics with frequency greater than 5
filtered_topics = {topic for topic, count in topic_counter.items() if count > 5}

# Create an empty undirected graph.
G = nx.Graph()

# Add edges between topics appearing in at least two same repositories and in the filtered set
for (topic1, topic2), repos in topic_pairs.items():
    if len(repos) >= 10 and topic1 in filtered_topics and topic2 in filtered_topics:
        G.add_edge(topic1, topic2)

# Add filtered topics as nodes
G.add_nodes_from(filtered_topics)

# Write the graph to a GEXF file.
output_file = "temp/logic_topics.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file}")

GEXF file created: temp/logic_topics.gexf


In [3]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations
from collections import Counter
from networkx.algorithms import community

# Define similar topics
similar_topics = {
'swi-prolog',
 'prolog-implementation',
 'prolog',
 'prolog-programming-language',
 'logic-programming',
 'logic',
 'answer-set-programming',
 'declarative-programming'
 'datalog'
 'iso-prolog-standard',
 'logical-programming',
 'prolog-application'
}

# Load repository data
df_repos = pd.read_csv("temp/logic_tag_repos.csv")

# Create an undirected graph
G = nx.Graph()

# Add nodes with topics as attributes
for idx, row in df_repos.iterrows():
    repo = row['repo_name']
    topics = row['topics']

    if isinstance(topics, str) and topics.startswith("[") and topics.endswith("]"):
        topics = ast.literal_eval(topics)  # Convert string list to actual list
    elif isinstance(topics, str):
        topics = [topics]  # Convert single topic string to list

    topics_cleaned = {t.strip().lower() for t in topics}  # Clean and deduplicate topics
    G.add_node(repo, topics=",".join(sorted(topics_cleaned)))  # Convert set to a sorted string

# Add edges based on shared topics
repos = list(G.nodes)
for repo1, repo2 in combinations(repos, 2):
    topics1 = set(G.nodes[repo1]['topics'].split(","))
    topics2 = set(G.nodes[repo2]['topics'].split(","))

    shared_topics = topics1 & topics2
    relevant_shared_topics = shared_topics & similar_topics
    non_relevant_shared_topics = shared_topics - similar_topics

    if (len(relevant_shared_topics) >= 2) and (len(non_relevant_shared_topics) >= 1):
        G.add_edge(repo1, repo2, shared_topics=",".join(sorted(non_relevant_shared_topics)))

# Perform Louvain community detection (clusters repos by non-similar topics)
louvain_communities = community.louvain_communities(G, weight=None, resolution=1.0)
top_clusters = sorted(louvain_communities, key=len, reverse=True)[:10]  # Select top 10 largest clusters

# Extract dominant topics from each of the top 10 clusters
cluster_topics = {}

for i, cluster in enumerate(top_clusters):
    topic_counter = Counter()
    
    for repo in cluster:
        non_similar_topics = set(G.nodes[repo]['topics'].split(",")) - similar_topics
        topic_counter.update(non_similar_topics)

    # Get the **most common topic** in this cluster (dominant topic)
    dominant_topic = topic_counter.most_common(1)[0][0] if topic_counter else "Unknown"

    # Store the cluster's dominant topic
    cluster_topics[i] = dominant_topic  

    # Assign the dominant topic as the category for each node in the cluster
    for repo in cluster:
        G.nodes[repo]["category"] = dominant_topic  # Assign dominant topic as category

# Print Cluster Topics
print("\n=== Dominant Topic for Each of the Top 10 Clusters ===")
for cluster_id, topic in cluster_topics.items():
    print(f"Cluster {cluster_id}: {topic}")

# Write the graph to a GEXF file
output_file = "temp/logic_repos_with_clusters.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file}")


=== Dominant Topic for Each of the Top 10 Clusters ===
Cluster 0: language
Cluster 1: javascript
Cluster 2: artificial-intelligence
Cluster 3: declarative-language
Cluster 4: constraint-programming
Cluster 5: machine-learning
Cluster 6: rdf
Cluster 7: interpreter
Cluster 8: parser
Cluster 9: prolog-exercises
GEXF file created: temp/logic_repos_with_clusters.gexf
