In [5]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations

# Topics treated as related to one another. Only topics from this list count
# toward the weight of an edge between two repositories.
similar_topics = [
    "flow-based-programming",
    "visual-programming",
    "visual-programming-editor",
    "visual-programming-language",
    "dataflow-programming",
    "blockly",
    "graph-editor",
    "node-editor"
]

# Two repositories are linked only when they share at least this many topics
# from `similar_topics`.
MIN_SHARED_TOPICS = 2


def _parse_topics(raw):
    """Return the cleaned topics of one `topics` cell as a list of strings.

    Parameters
    ----------
    raw : str or any
        The raw cell value. A string that looks like a Python list literal
        (``"['a', 'b']"``) is parsed with ``ast.literal_eval``; any other
        string is treated as a single topic. A non-string value (e.g. the NaN
        pandas produces for an empty CSV field) yields no topics.

    Returns
    -------
    list of str
        Topics stripped of surrounding whitespace and lower-cased, in their
        original order.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a bracketed string is not a
        valid literal.
    """
    if not isinstance(raw, str):
        # Empty CSV fields arrive as float NaN; iterating over that would
        # raise TypeError, so treat it as "no topics".
        return []
    if raw.startswith("[") and raw.endswith("]"):
        parsed = ast.literal_eval(raw)  # Convert string list to actual list
    else:
        parsed = [raw]  # A bare string is a single topic
    return [t.strip().lower() for t in parsed]


df_repos = pd.read_csv("temp/tag_repos.csv")

# Create an empty undirected graph.
G = nx.Graph()

# Add nodes: each repo becomes a node and its cleaned topics are stored as a
# comma-separated string attribute. The subset of topics that appear in
# `similar_topics` is cached per repo so the pairwise pass below does not have
# to re-parse the attribute strings for every pair.
similar_topic_set = set(similar_topics)
relevant_topics_by_repo = {}
for repo, raw_topics in zip(df_repos['repo_name'], df_repos['topics']):
    topics_cleaned = _parse_topics(raw_topics)
    G.add_node(repo, topics=','.join(topics_cleaned))
    # A repeated repo name overwrites both the node attribute and this entry,
    # so the two always stay in sync.
    relevant_topics_by_repo[repo] = similar_topic_set.intersection(topics_cleaned)

# Add edges: link two repos when they share enough topics from
# `similar_topics`; the edge weight is the number of such shared topics.
repos = list(G.nodes)
for repo1, repo2 in combinations(repos, 2):
    relevant_shared_topics = relevant_topics_by_repo[repo1] & relevant_topics_by_repo[repo2]
    if len(relevant_shared_topics) >= MIN_SHARED_TOPICS:
        G.add_edge(repo1, repo2, weight=len(relevant_shared_topics))

# Write the graph to a GEXF file.
output_file = "temp/repos.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file}")

GEXF file created: temp/repos.gexf


In [None]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations

df_repos = pd.read_csv("temp/tag_repos.csv")

# Create an empty undirected graph. Nodes are topics; an edge joins two topics
# that appear together on at least one repository.
G = nx.Graph()

# Collect every distinct topic and connect topics that co-occur in a repo.
unique_topics = set()
for raw_topics in df_repos['topics']:
    if not isinstance(raw_topics, str):
        # An empty CSV field is read as float NaN; iterating over it would
        # raise TypeError. Such a row contributes no topics.
        continue
    if raw_topics.startswith("[") and raw_topics.endswith("]"):
        topic_list = ast.literal_eval(raw_topics)  # Convert string list to actual list
    else:
        topic_list = [raw_topics]  # A bare string is a single topic

    # De-duplicate within the repo, then sort: iterating a set of strings has
    # a process-dependent order, which would make the node/edge order of the
    # written file differ from one kernel session to the next.
    topics_cleaned = sorted({t.strip().lower() for t in topic_list})
    unique_topics.update(topics_cleaned)

    # Add edges between topics appearing in the same repository
    G.add_edges_from(combinations(topics_cleaned, 2))

# Add unique topics as nodes. This also picks up topics that never co-occur
# with another topic and therefore received no edge above.
G.add_nodes_from(sorted(unique_topics))

# Write the graph to a GEXF file.
output_file = "temp/topics.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file}")