# DeepGit: Building Edges one Tag

## Preparation
### Package Import

In [1]:
import requests
import pandas as pd
from datetime import datetime, timezone
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter

In [2]:
load_dotenv()

True

### GitHub Authenticate Setup

Create a `.env` file in the root directory and add the line `GITHUB_TOKEN = "your_token"`.

In [3]:
def authenticate_github(token):
    """
    Returns headers required for GitHub API requests.
    The Accept header includes the preview for topics.
    """
    return {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.mercy-preview+json'
    }
token = os.getenv("GITHUB_TOKEN")
headers = authenticate_github(token)

## Function Definition

### GitHub Extraction

In [4]:
def search_github_repos_by_topic(tag, headers):
    """
    Searches GitHub repositories that have the specified topic (tag) and over 100 stars.
    Handles pagination and returns a list of dictionaries containing only the repository name and repo id.
    """
    repos = []
    page = 1
    per_page = 100
    query = f"topic:{tag} stars:>5"
    
    while True:
        url = (
            f"https://api.github.com/search/repositories?q={query}"
            f"&sort=stars&order=desc&per_page={per_page}&page={page}"
        )
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching repositories for tag '{tag}': {response.status_code}")
            break

        data = response.json()
        items = data.get('items', [])
        if not items:
            break

        for item in items:
            repos.append(
                 item.get("full_name")
            )

        # Break if fewer than 'per_page' items were returned (i.e. last page)
        if len(items) < per_page:
            break

        page += 1

    return repos

def get_repo_topics(full_name, headers):
    """
    Fetches the topics (tags) of a repository given its full name.
    """
    url = f"https://api.github.com/repos/{full_name}/topics"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json().get('names', [])
    return []

### Tag Similarity

In [5]:
def compute_topic_similarities(model_name: str, given_topic: str, topic_pool: list, threshold: float = 0.8):
    """
    Compute cosine similarity between a given topic and a pool of topics, filtering by a threshold.

    Args:
        model_name (str): Name of the SentenceTransformer model (e.g., 'all-MiniLM-L6-v2').
        given_topic (str): The topic to compare against the pool.
        topic_pool (list): List of topics to compare.
        threshold (float): Minimum similarity score to be included in the results (default: 0.8).

    Returns:
        list: Sorted list of (topic, similarity) tuples in descending order of similarity.
    """
    # Load model
    model = SentenceTransformer(model_name)
    
    # Encode the given topic and the pool of topics
    given_embedding = model.encode([given_topic])
    pool_embeddings = model.encode(topic_pool)
    
    # Compute cosine similarities
    similarities = cosine_similarity(given_embedding, pool_embeddings)[0]
    
    # Pair topics with their similarity scores
    topic_similarity_pairs = [topic for topic, sim in zip(topic_pool, similarities) if sim >= threshold]
    
    # Sort topics by similarity (higher is better)
    topic_similarity_pairs.sort(key=lambda x: x[1], reverse=True)

    return topic_similarity_pairs

### Temp Store

In [17]:
def write_list_to_file(data_list, file_address):
    """
    Writes each item of the given list to a new line in a text file.

    :param data_list: List of elements to write to the file.
    :param file_address: Path to the file where the list should be saved.
    """
    try:
        with open(file_address, 'w', encoding='utf-8') as file:
            for item in data_list:
                file.write(f"{item}\n")
        print(f"List successfully written to {file_address}")
    except Exception as e:
        print(f"Error writing to file: {e}")

## Demonstration

### Input Tag(Topic)

Suppose we have a topic `visual programming`

In [6]:
input_topic = "visual-programming"

### Git the Initial Repos

**Note**: for demonstration Purpose, we only extract repos have more than 10 stars

In [7]:
repos_with_input_topic = search_github_repos_by_topic(input_topic, headers)

In [8]:
len(repos_with_input_topic)

151

In [9]:
repos_with_input_topic

['node-red/node-red',
 'retejs/rete',
 'jagenjo/litegraph.js',
 'jerosoler/Drawflow',
 'biolab/orange3',
 'PetoiCamp/OpenCat',
 'samuelmtimbo/unit',
 'xyflow/awesome-node-based-uis',
 'silexlabs/Silex',
 'flydelabs/flyde',
 'miroiu/nodify',
 'nevalang/neva',
 'rgleichman/glance',
 'polygonjs/polygonjs',
 'carlosperate/ardublockly',
 'd3cod3/Mosaic',
 'enso-org/ide',
 'endlessm/godot-block-coding',
 'weihuajiang/WPF-Blockly',
 'openmusic-project/openmusic',
 'zenoverflow/omnichain',
 'moonrailgun/codeck',
 'FlowFuse/flowfuse',
 'ayushk7/CodeWire',
 'ghostiam/vue-blocks',
 'meemoo/meemooapp',
 'vvvv/vvvv-sdk',
 'AlvarBer/Persimmon',
 'dgk/django-business-logic',
 'flojoy-ai/studio',
 'Blackprint/Blackprint',
 'cac-t-u-s/om-sharp',
 'honix/Pyno',
 'cortictechnology/cep',
 'imengyu/node-blueprint',
 'd3cod3/ofxVisualProgramming',
 'jpaulm/drawfbp',
 'retejs/rete-studio',
 'bromagosa/Snap4Arduino',
 'mitevpi/gh-web-ui',
 'schwa/SwiftNodeEditor',
 'mafik/automat',
 'mimorisuzuko/chain',
 'Hi

In [19]:
write_list_to_file(repos_with_input_topic, "./temp/visual_programming_R.txt")

List successfully written to ./temp/visual_programming_R.txt


### Get the Pool of Topics

Now we have a set of repos and we can extract the pool of topics from these repos

In [10]:
topic_pool = []
for full_name in repos_with_input_topic:
    topic_pool = topic_pool + get_repo_topics(full_name, headers)

In [11]:
len(topic_pool)

1115

In [12]:
topic_pool_rm_dup = list(set(topic_pool))

In [13]:
topic_pool_rm_dup[0:10]

['lm-studio',
 'graphs',
 'cairo',
 'low-code-development-platform',
 'diagrams',
 'modular-synthesizers',
 'nodejs',
 'diagramm',
 'node-system',
 'shape-recognition']

In [20]:
write_list_to_file(topic_pool_rm_dup, "./temp/visual_programming_T.txt")

List successfully written to ./temp/visual_programming_T.txt


### Further Process the Topics

Note:We probably should do this again and again to ensure that the list of tags is thorough

**Step1: Frequency Fiter**

In [14]:
topic_counts = Counter(topic_pool)

In [15]:
filtered_topic_counts = {topic: count for topic, count in topic_counts.items() if count >= 5}

In [16]:
list(filtered_topic_counts.keys())

['javascript',
 'low-code',
 'flow-based-programming',
 'visual-programming',
 'node-editor',
 'dataflow-programming',
 'graph-editor',
 'graph',
 'visual',
 'editor',
 'flowchart',
 'dataflow',
 'python',
 'arduino',
 'robotics',
 'education',
 'iot',
 'programming-language',
 'functional-programming',
 'no-code',
 'visual-programming-editor',
 'visual-programming-language',
 'fbp',
 'creative-coding',
 'blockly',
 'vvvv',
 'node',
 'csharp',
 'react',
 'vl',
 'tracking']

**Step2: Similarity Filter**

In [21]:
model_name = "all-MiniLM-L6-v2"
similar_topics = compute_topic_similarities(model_name, input_topic, list(filtered_topic_counts.keys()), 0.5)

In [22]:
len(similar_topics)

7

In [23]:
similar_topics

['programming-language',
 'creative-coding',
 'flow-based-programming',
 'visual-programming',
 'visual',
 'visual-programming-editor',
 'visual-programming-language']

**Step3: Mannually Craft(Human in the Loop)** 

In [24]:
# with the help of OpenAI O3-Mini
similar_topics = ['visual-programming',
 'visual-programming-language',
 'visual-programming-editor',
 'flow-based-programming',
 'dataflow-programming',
 'fbp', #flow-based programming => need veryfication
 'node-editor ',
 'graph-editor',
 'vvvv',
 'blockly']

### Get All Possible Repos

In [25]:
final_list_repos = [ ]
for topic in similar_topics:
    ls_repos = search_github_repos_by_topic(topic, headers)
    final_list_repos = final_list_repos + ls_repos

In [26]:
final_list_rm_dup = list(set(final_list_repos)) 

In [27]:
len(final_list_rm_dup)

636

### Extract All Possible Information for a Repo

In [29]:
# Function to fetch repository details
def get_repo_data(repo_name, headers):
    url = f"https://api.github.com/repos/{repo_name}"
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching repo {repo_name}: {response.status_code}")
        return None

# List of repositories (ensure full names: 'owner/repo')
repo_data = []
for repo in final_list_rm_dup:
    repo_info = get_repo_data(repo, headers)
    if repo_info:
        repo_data.append(repo_info)  # Stores full JSON response

# Convert list of dictionaries into DataFrame
df_repos = pd.DataFrame(repo_data)

In [30]:
df_repos.to_csv("./temp/visual_programming_kb.csv")