In [6]:
import duckdb
import pandas as pd
from collections import Counter

# Topic whose co-occurring topics are ranked by the steps below.
search_term = "logic-programming"

# Step 1: Open the DuckDB file in read-only mode
db_path = '../../public/data/github_meta.duckdb'
con = duckdb.connect(database=db_path, read_only=True)
con.execute("SET threads TO 10;")  # Adjust based on your system

# Step 2: Pull one (nameWithOwner, topic) row per repo/topic pair from the
# normalized tables; the inner join keeps only topics whose repo is in `repos`.
query = """
    SELECT r.nameWithOwner, t.topic
    FROM repos r
    JOIN repo_topics t ON r.nameWithOwner = t.repo
"""
result = con.execute(query)
df = result.fetchdf()  # materialize the result set as a pandas DataFrame

# Step 3: Collapse the (repo, topic) rows into one row per repo that holds
# the list of all of that repo's topics.
grouped = (
    df.groupby("nameWithOwner")["topic"]
    .apply(list)
    .reset_index()
)
grouped.columns = ["nameWithOwner", "topics"]

# Step 4: Keep only repos that carry the search term (case-insensitive match).
# NOTE(review): .lower() is called on every topic value, so this presumably
# expects all topics to be strings -- confirm the column has no NULLs.
search_term_lower = search_term.lower()


def _has_search_term(repo_topics):
    """Return True when the lower-cased search term is among *repo_topics*."""
    return search_term_lower in {t.lower() for t in repo_topics}


filtered_df = grouped[grouped["topics"].apply(_has_search_term)]

# Step 5: Flatten the topics of the matching repos and tally them,
# lower-cased so that differently-cased spellings share one count.
all_topics = []
for repo_topics in filtered_df["topics"]:
    all_topics.extend(repo_topics)
topic_counts = Counter(t.lower() for t in all_topics)

# The searched topic is present in every matching repo; drop it from the tally.
topic_counts.pop(search_term_lower, None)

# Step 6: Build the result as a list of dicts ordered by descending count,
# keeping only topics with count > 2. most_common() orders equal counts by
# first appearance, so ties stay in the order the topics were first seen.
topics = [
    {"name": name, "count": count}
    for name, count in topic_counts.most_common()
    if count > 2
]

# Print results
print(topics)

[{'name': 'prolog', 'count': 65}, {'name': 'logic', 'count': 23}, {'name': 'datalog', 'count': 22}, {'name': 'python', 'count': 18}, {'name': 'functional-programming', 'count': 16}, {'name': 'artificial-intelligence', 'count': 14}, {'name': 'minikanren', 'count': 13}, {'name': 'haskell', 'count': 12}, {'name': 'programming-language', 'count': 11}, {'name': 'language', 'count': 10}, {'name': 'javascript', 'count': 9}, {'name': 'answer-set-programming', 'count': 9}, {'name': 'java', 'count': 8}, {'name': 'swi-prolog', 'count': 7}, {'name': 'rust', 'count': 7}, {'name': 'declarative-programming', 'count': 6}, {'name': 'unification', 'count': 6}, {'name': 'c', 'count': 6}, {'name': 'machine-learning', 'count': 6}, {'name': 'smt', 'count': 6}, {'name': 'interpreter', 'count': 6}, {'name': 'sat-solver', 'count': 6}, {'name': 'constraints', 'count': 6}, {'name': 'object-oriented-programming', 'count': 5}, {'name': 'prolog-implementation', 'count': 5}, {'name': 'prolog-programming-language', '