In [7]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations

similar_topics = ['swi-prolog',
 'prolog-implementation',
 'prolog',
 'prolog-programming-language',
 'logic-programming',
 'logic',
 'answer-set-programming',
 'declarative-programming'
 'datalog']

df_repos = pd.read_csv("temp/logic_tag_repos.csv")

# Create an empty undirected graph.
G = nx.Graph()

# Add nodes: Each repo becomes a node, and its topics are stored as a node attribute.
for idx, row in df_repos.iterrows():
    repo = row['repo_name']
    topics = row['topics']
    # Ensure topics are properly parsed from string representation of lists
    if isinstance(topics, str) and topics.startswith("[") and topics.endswith("]"):
        topics = ast.literal_eval(topics)  # Convert string list to actual list
    elif isinstance(topics, str):
        topics = [topics]  # Convert single topic string to list
    
    topics_cleaned = [t.strip().lower() for t in topics]  # Clean topics
    topics_str = ','.join(topics_cleaned)
    G.add_node(repo, topics=topics_str)

# Add edges: Check for shared tags between repos and add an edge with weight equal to shared topics count in similar_topics.
repos = list(G.nodes)
for repo1, repo2 in combinations(repos, 2):
    topics1 = set(G.nodes[repo1]['topics'].split(','))
    topics2 = set(G.nodes[repo2]['topics'].split(','))
    shared_topics = topics1 & topics2  # Ensure lowercase comparison
    relevant_shared_topics = shared_topics & set(similar_topics)
    non_relevant_shared_topics = shared_topics - set(similar_topics)
    if len(relevant_shared_topics)>=2 & len(non_relevant_shared_topics)>=1:
        print(repo1, repo2, non_relevant_shared_topics)
        G.add_edge(repo1, repo2)

# Write the graph to a GEXF file.
output_file = "temp/logic_repos.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file}")

CapelliC/hhprolog CapelliC/hitchhicker-prolog {'javascript', 'virtual-machine'}
retrofor/iamai lab-v2/pyreason {'machine-learning', 'python'}
retrofor/iamai lps-js/lps-studio {'artificial-intelligence', 'ai'}
trealla-prolog/trealla tau-prolog/tau-prolog {'iso-prolog-standard', 'prolog-interpreter'}
trealla-prolog/trealla ciao-lang/ciao {'iso-prolog-standard', 'prolog-interpreter'}
trealla-prolog/trealla mthom/scryer-prolog {'iso-prolog-standard', 'prolog-interpreter'}
tau-prolog/tau-prolog ciao-lang/ciao {'iso-prolog-standard', 'prolog-interpreter'}
tau-prolog/tau-prolog mthom/scryer-prolog {'iso-prolog-standard', 'prolog-interpreter'}
AppliedLogicSystems/ALSProlog ciao-lang/ciao {'iso-prolog-standard', 'programming-language', 'compiler'}
AppliedLogicSystems/ALSProlog lambduli/minilog {'programming-language', 'language'}
ciao-lang/ciao mthom/scryer-prolog {'iso-prolog-standard', 'prolog-interpreter'}
ciao-lang/ciao lps-js/lps.js {'programming-language', 'interpreter'}
AndreaInfUFSM/elc

In [2]:
import pandas as pd
import networkx as nx
import ast
from itertools import combinations
from collections import Counter, defaultdict

df_repos = pd.read_csv("temp/logic_tag_repos.csv")

# Count topic frequencies
topic_counter = Counter()
topic_pairs = defaultdict(set)  # Dictionary to track topic pairs and the repos they appear in

for idx, row in df_repos.iterrows():
    topics = row['topics']
    if isinstance(topics, str) and topics.startswith("[") and topics.endswith("]"):
        topics = ast.literal_eval(topics)  # Convert string list to actual list
    elif isinstance(topics, str):
        topics = [topics]  # Convert single topic string to list
    
    topics_cleaned = {t.strip().lower() for t in topics}
    topic_counter.update(topics_cleaned)
    
    for topic1, topic2 in combinations(topics_cleaned, 2):
        topic_pairs[(topic1, topic2)].add(idx)  # Store the repo index where the pair appears

# Filter topics with frequency greater than 5
filtered_topics = {topic for topic, count in topic_counter.items() if count > 5}

# Create an empty undirected graph.
G = nx.Graph()

# Add edges between topics appearing in at least two same repositories and in the filtered set
for (topic1, topic2), repos in topic_pairs.items():
    if len(repos) >= 10 and topic1 in filtered_topics and topic2 in filtered_topics:
        G.add_edge(topic1, topic2)

# Add filtered topics as nodes
G.add_nodes_from(filtered_topics)

# Write the graph to a GEXF file.
output_file = "temp/logic_topics.gexf"
nx.write_gexf(G, output_file)
print(f"GEXF file created: {output_file}")

GEXF file created: temp/logic_topics.gexf
