# DeepGit: Building Edges one Tag

## Preparation
### Package Import

In [1]:
import requests
import pandas as pd
from datetime import datetime, timezone
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter

In [2]:
load_dotenv()

True

### GitHub Authenticate Setup

Create a `.env` file in the root directory and add the line `GITHUB_TOKEN = "your_token"`.

In [3]:
def authenticate_github(token):
    """
    Returns headers required for GitHub API requests.
    The Accept header includes the preview for topics.
    """
    return {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.mercy-preview+json'
    }
token = os.getenv("GITHUB_TOKEN")
headers = authenticate_github(token)

## Function Definition

### GitHub Extraction

In [4]:
def search_github_repos_by_topic(tag, headers):
    """
    Searches GitHub repositories that have the specified topic (tag) and over 100 stars.
    Handles pagination and returns a list of dictionaries containing only the repository name and repo id.
    """
    repos = []
    page = 1
    per_page = 100
    query = f"topic:{tag} stars:>10"
    
    while True:
        url = (
            f"https://api.github.com/search/repositories?q={query}"
            f"&sort=stars&order=desc&per_page={per_page}&page={page}"
        )
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching repositories for tag '{tag}': {response.status_code}")
            break

        data = response.json()
        items = data.get('items', [])
        if not items:
            break

        for item in items:
            repos.append(
                 item.get("full_name")
            )

        # Break if fewer than 'per_page' items were returned (i.e. last page)
        if len(items) < per_page:
            break

        page += 1

    return repos

def get_repo_topics(full_name, headers):
    """
    Fetches the topics (tags) of a repository given its full name.
    """
    url = f"https://api.github.com/repos/{full_name}/topics"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json().get('names', [])
    return []

### Tag Similarity

In [5]:
def compute_topic_similarities(model_name: str, given_topic: str, topic_pool: list, threshold: float = 0.8):
    """
    Compute cosine similarity between a given topic and a pool of topics, filtering by a threshold.

    Args:
        model_name (str): Name of the SentenceTransformer model (e.g., 'all-MiniLM-L6-v2').
        given_topic (str): The topic to compare against the pool.
        topic_pool (list): List of topics to compare.
        threshold (float): Minimum similarity score to be included in the results (default: 0.8).

    Returns:
        list: Sorted list of (topic, similarity) tuples in descending order of similarity.
    """
    # Load model
    model = SentenceTransformer(model_name)
    
    # Encode the given topic and the pool of topics
    given_embedding = model.encode([given_topic])
    pool_embeddings = model.encode(topic_pool)
    
    # Compute cosine similarities
    similarities = cosine_similarity(given_embedding, pool_embeddings)[0]
    
    # Pair topics with their similarity scores
    topic_similarity_pairs = [topic for topic, sim in zip(topic_pool, similarities) if sim >= threshold]
    
    # Sort topics by similarity (higher is better)
    topic_similarity_pairs.sort(key=lambda x: x[1], reverse=True)

    return topic_similarity_pairs

## Demonstration

### Input Tag(Topic)

Suppose we have a topic `visual programming`

In [6]:
input_topic = "visual-programming"

### Git the Initial Repos

**Note**: for demonstration Purpose, we only extract repos have more than 10 stars

In [7]:
repos_with_input_topic = search_github_repos_by_topic(input_topic, headers)

In [8]:
len(repos_with_input_topic)

160

In [9]:
repos_with_input_topic

['EvgSkv/logica',
 'triska/the-power-of-prolog',
 'SWI-Prolog/swipl-devel',
 'souffle-lang/souffle',
 'noprompt/meander',
 'composewell/streamly',
 'opencog/atomspace',
 'CoNarrative/precept',
 'ichiban/prolog',
 'potassco/clingo',
 'ekzhang/percival',
 'tau-prolog/tau-prolog',
 'NucleoidAI/Nucleoid',
 'yuce/pyswip',
 'ekzhang/crepe',
 's-arash/ascent',
 'slovnicki/pLam',
 'LogtalkDotOrg/logtalk3',
 'alexanyernas/Ejercicios-Practicos',
 'prove-rs/z3.rs',
 'Shen-Language/shen-sources',
 'i-am-tom/holmes',
 'SHI-Yu-Zhe/awesome-agi-cocosci',
 'ciao-lang/ciao',
 'LukasZahradnik/PyNeuraLogic',
 'logictensornetworks/logictensornetworks',
 'google/neural-logic-machines',
 'c-cube/datalog',
 'SuperDisk/tar.pl',
 'HackerFoo/poprc',
 'grafana/thema',
 'luc-tielen/eclair-lang',
 'lab-v2/pyreason',
 'pythological/kanren',
 'mdiep/Logician',
 'mcsoto/cosmos',
 'ztangent/Julog.jl',
 'HarvardPL/formulog',
 'go-air/gini',
 'fogfish/datalog',
 'SAKET-SK/Programming-Aptitude-Interview-Prep',
 'FLHonker/

### Get the Pool of Topics

Now we have a set of repos and we can extract the pool of topics from these repos

In [10]:
topic_pool = []
for full_name in repos_with_input_topic:
    topic_pool = topic_pool + get_repo_topics(full_name, headers)

In [11]:
len(topic_pool)

1130

In [12]:
topic_pool_rm_dup = list(set(topic_pool))

In [13]:
topic_pool_rm_dup[0:10]

['wikidata',
 'lambda',
 'streaming',
 'expert-system',
 'concatenative',
 'competitive-programming',
 'concurrent',
 'java',
 'charts',
 'graph-database']

### Further Process the Topics

Note:We probably should do this again and again to ensure that the list of tags is thorough

**Step1: Frequency Fiter**

In [14]:
topic_counts = Counter(topic_pool)

In [15]:
filtered_topic_counts = {topic: count for topic, count in topic_counts.items() if count >= 5}

In [16]:
list(filtered_topic_counts.keys())

['datalog',
 'language',
 'prolog-implementation',
 'prolog',
 'logic-programming',
 'constraints',
 'swi-prolog',
 'unification',
 'haskell',
 'declarative-programming',
 'interpreter',
 'answer-set-programming',
 'javascript',
 'prolog-programming-language',
 'logic',
 'python',
 'artificial-intelligence',
 'rust',
 'programming-language',
 'functional-programming',
 'compiler',
 'machine-learning',
 'minikanren']

**Step2: Similarity Filter**

In [17]:
model_name = "all-MiniLM-L6-v2"
similar_topics = compute_topic_similarities(model_name, input_topic, list(filtered_topic_counts.keys()), 0.5)

In [18]:
len(similar_topics)

10

In [19]:
similar_topics

['swi-prolog',
 'functional-programming',
 'prolog-implementation',
 'prolog',
 'prolog-programming-language',
 'programming-language',
 'logic-programming',
 'logic',
 'answer-set-programming',
 'declarative-programming']

**Step3: Mannually Craft(Human in the Loop)** 

In [28]:
similar_topics = ['swi-prolog',
 'prolog-implementation',
 'prolog',
 'prolog-programming-language',
 'logic-programming',
 'logic',
 'answer-set-programming',
 'declarative-programming'
 'datalog']

### Get All Possible Repos

In [29]:
final_list_repos = [ ]
for topic in similar_topics:
    ls_repos = search_github_repos_by_topic(topic, headers)
    final_list_repos = final_list_repos + ls_repos

In [30]:
final_list_rm_dup = list(set(final_list_repos)) 

In [31]:
len(final_list_rm_dup)

485

### Extract Tags

In [32]:
repo_data = []
for repo in final_list_rm_dup:
    topics = get_repo_topics(repo, headers)
    repo_data.append({
        'repo_name': repo,
        'topics': topics  # This could be a list, string, etc., depending on your function output
    })

# Create a DataFrame from the collected data
df_repos = pd.DataFrame(repo_data)

In [33]:
df_repos = pd.DataFrame(repo_data)

In [34]:
df_repos.to_csv("logic_tag_repos.csv")