# DeepGit: Building Edges one Tag

## Preparation
### Package Import

In [1]:
import requests
import pandas as pd
from datetime import datetime, timezone
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter

In [2]:
load_dotenv()

True

### GitHub Authenticate Setup

Create a `.env` file in the root directory and add the line `GITHUB_TOKEN = "your_token"`.

In [3]:
def authenticate_github(token):
    """
    Returns headers required for GitHub API requests.
    The Accept header includes the preview for topics.
    """
    return {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.mercy-preview+json'
    }
token = os.getenv("GITHUB_TOKEN")
headers = authenticate_github(token)

## Function Definition

### GitHub Extraction

In [4]:
def search_github_repos_by_topic(tag, headers):
    """
    Searches GitHub repositories that have the specified topic (tag) and over 100 stars.
    Handles pagination and returns a list of dictionaries containing only the repository name and repo id.
    """
    repos = []
    page = 1
    per_page = 100
    query = f"topic:{tag} stars:>5"
    
    while True:
        url = (
            f"https://api.github.com/search/repositories?q={query}"
            f"&sort=stars&order=desc&per_page={per_page}&page={page}"
        )
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching repositories for tag '{tag}': {response.status_code}")
            break

        data = response.json()
        items = data.get('items', [])
        if not items:
            break

        for item in items:
            repos.append(
                 item.get("full_name")
            )

        # Break if fewer than 'per_page' items were returned (i.e. last page)
        if len(items) < per_page:
            break

        page += 1

    return repos

def get_repo_topics(full_name, headers):
    """
    Fetches the topics (tags) of a repository given its full name.
    """
    url = f"https://api.github.com/repos/{full_name}/topics"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json().get('names', [])
    return []

### Tag Similarity

In [5]:
def compute_topic_similarities(model_name: str, given_topic: str, topic_pool: list, threshold: float = 0.8):
    """
    Compute cosine similarity between a given topic and a pool of topics, filtering by a threshold.

    Args:
        model_name (str): Name of the SentenceTransformer model (e.g., 'all-MiniLM-L6-v2').
        given_topic (str): The topic to compare against the pool.
        topic_pool (list): List of topics to compare.
        threshold (float): Minimum similarity score to be included in the results (default: 0.8).

    Returns:
        list: Sorted list of (topic, similarity) tuples in descending order of similarity.
    """
    # Load model
    model = SentenceTransformer(model_name)
    
    # Encode the given topic and the pool of topics
    given_embedding = model.encode([given_topic])
    pool_embeddings = model.encode(topic_pool)
    
    # Compute cosine similarities
    similarities = cosine_similarity(given_embedding, pool_embeddings)[0]
    
    # Pair topics with their similarity scores
    topic_similarity_pairs = [topic for topic, sim in zip(topic_pool, similarities) if sim >= threshold]
    
    # Sort topics by similarity (higher is better)
    topic_similarity_pairs.sort(key=lambda x: x[1], reverse=True)

    return topic_similarity_pairs

### Temp Store

In [6]:
def write_list_to_file(data_list, file_address):
    """
    Writes each item of the given list to a new line in a text file.

    :param data_list: List of elements to write to the file.
    :param file_address: Path to the file where the list should be saved.
    """
    try:
        with open(file_address, 'w', encoding='utf-8') as file:
            for item in data_list:
                file.write(f"{item}\n")
        print(f"List successfully written to {file_address}")
    except Exception as e:
        print(f"Error writing to file: {e}")

## Demonstration

### Input Tag(Topic)

Suppose we have a topic `llm`

In [7]:
input_topic = "large-language-model"

### Git the Initial Repos

**Note**: for demonstration Purpose, we only extract repos have more than 10 stars

In [8]:
repos_with_input_topic = search_github_repos_by_topic(input_topic, headers)

In [9]:
len(repos_with_input_topic)

185

In [10]:
repos_with_input_topic

['jingyaogong/minimind',
 'InternLM/InternLM',
 'OpenSPG/KAG',
 'gluonfield/enchanted',
 'open-compass/opencompass',
 '0xPlaygrounds/rig',
 'verazuo/jailbreak_llms',
 'ridgerchu/matmulfreellm',
 'BrainBlend-AI/atomic-agents',
 'InternLM/InternLM-XComposer',
 'RManLuo/Awesome-LLM-KG',
 'intel/intel-extension-for-transformers',
 'dvlab-research/LISA',
 'samchon/nestia',
 'hyp1231/awesome-llm-powered-agent',
 'EgoAlpha/prompt-in-context-learning',
 'coderonion/awesome-yolo-object-detection',
 'starpig1129/DATAGEN',
 'zchoi/Awesome-Embodied-Agent-with-LLMs',
 'llm-jp/awesome-japanese-llm',
 'Tebmer/Awesome-Knowledge-Distillation-of-LLMs',
 'acon96/home-llm',
 'X-LANCE/SLAM-LLM',
 'EasyJailbreak/EasyJailbreak',
 'zhanshijinwat/Steel-LLM',
 'showlab/VLog',
 'onejune2018/Awesome-LLM-Eval',
 'Kenza-AI/sagify',
 'Paranioar/Awesome_Matching_Pretraining_Transfering',
 'zchuz/CoT-Reasoning-Survey',
 'dvlab-research/LLMGA',
 'dylanhogg/llmgraph',
 'yingpengma/Awesome-Story-Generation',
 'SuperMedIn

In [12]:
write_list_to_file(repos_with_input_topic, "./temp/llm_R.txt")

List successfully written to ./temp/llm_R.txt


### Get the Pool of Topics

Now we have a set of repos and we can extract the pool of topics from these repos

In [13]:
topic_pool = []
for full_name in repos_with_input_topic:
    topic_pool = topic_pool + get_repo_topics(full_name, headers)

In [14]:
len(topic_pool)

1691

In [15]:
topic_pool_rm_dup = list(set(topic_pool))

In [16]:
topic_pool_rm_dup[0:10]

['multi-modal-llms',
 'in-context-learning',
 'video',
 'fine-tuning',
 'generative-pretraining',
 'camel',
 'data',
 'music-processing',
 'scene-graph',
 'llm-recommendation']

In [18]:
write_list_to_file(topic_pool_rm_dup, "./temp/llm_T.txt")

List successfully written to ./temp/llm_T.txt


### Further Process the Topics

Note:We probably should do this again and again to ensure that the list of tags is thorough

**Step1: Frequency Fiter**

In [19]:
topic_counts = Counter(topic_pool)

In [20]:
filtered_topic_counts = {topic: count for topic, count in topic_counts.items() if count >= 5}

In [21]:
list(filtered_topic_counts.keys())

['artificial-intelligence',
 'large-language-model',
 'chatbot',
 'gpt',
 'llm',
 'knowledge-graph',
 'llama',
 'llama2',
 'evaluation',
 'benchmark',
 'chatgpt',
 'openai',
 'ai',
 'agent',
 'generative-ai',
 'large-language-models',
 'llms',
 'openai-api',
 'gpt-4',
 'multimodal',
 'vision-language-model',
 'language-model',
 'llm-inference',
 'rag',
 'retrieval-augmented-generation',
 'awesome-list',
 'foundation-model',
 'foundation-models',
 'chain-of-thought',
 'prompt-engineering',
 'langchain',
 'python',
 'awesome',
 'multimodal-large-language-models',
 'dataset',
 'machine-learning',
 'nlp',
 'deep-learning',
 'image-generation',
 'natural-language-processing',
 'llm-agent',
 'fine-tuning',
 'large-multimodal-models',
 'ml',
 'reasoning']

**Step2: Similarity Filter**

In [22]:
model_name = "all-MiniLM-L6-v2"
similar_topics = compute_topic_similarities(model_name, input_topic, list(filtered_topic_counts.keys()), 0.5)

In [23]:
len(similar_topics)

6

In [24]:
similar_topics

['multimodal-large-language-models',
 'vision-language-model',
 'large-language-model',
 'large-language-models',
 'language-model',
 'large-multimodal-models']

Meanwhile, I am using this prompt in o3-mini
`return me relevant tags to {topic}`

**Step3: Mannually Craft(Human in the Loop)** 

In [26]:
# with the help of OpenAI O3-Mini
similar_topics = [
    "large-language-models",
    "large-language-model",
    "llm",
    "llms",
    "foundation-model",
    "foundation-models",
    "generative-ai",
    "language-model",
    "llm-inference",
    "multimodal-large-language-models",
    'gpt',
    'llama',
    'llama2',
    'openai',
    'chatgpt',
    'generative-ai',
    'multimodal',
    'gpt-4',
    'retrieval-augmented-generation',
    'chain-of-thought',
    'prompt-engineering',
    'langchain',
    'llm-agent'
]

### Get All Possible Repos

**"422" error means it exceed the GitHub API Rate Limits**

In [27]:
final_list_repos = [ ]
for topic in similar_topics:
    ls_repos = search_github_repos_by_topic(topic, headers)
    final_list_repos = final_list_repos + ls_repos

Error fetching repositories for tag 'large-language-models': 422
Error fetching repositories for tag 'llm': 422
Error fetching repositories for tag 'llms': 422
Error fetching repositories for tag 'generative-ai': 422
Error fetching repositories for tag 'gpt': 422
Error fetching repositories for tag 'openai': 422
Error fetching repositories for tag 'chatgpt': 422
Error fetching repositories for tag 'generative-ai': 422
Error fetching repositories for tag 'gpt-4': 422
Error fetching repositories for tag 'langchain': 422


In [29]:
final_list_rm_dup = list(set(final_list_repos)) 

In [30]:
len(final_list_rm_dup)

9367

### Extract All Possible Information for a Repo

In [31]:
# Function to fetch repository details
def get_repo_data(repo_name, headers):
    url = f"https://api.github.com/repos/{repo_name}"
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching repo {repo_name}: {response.status_code}")
        return None

# List of repositories (ensure full names: 'owner/repo')
repo_data = []
for repo in final_list_rm_dup:
    repo_info = get_repo_data(repo, headers)
    if repo_info:
        repo_data.append(repo_info)  # Stores full JSON response

# Convert list of dictionaries into DataFrame
df_repos = pd.DataFrame(repo_data)

Error fetching repo irgolic/AutoPR: 403
Error fetching repo fluxninja/aperture: 403
Error fetching repo itsOwen/CyberScraper-2077: 403
Error fetching repo Alab-NII/Awesome-SciLM: 403
Error fetching repo YJiangcm/Lion: 403
Error fetching repo mccaffary/GPT-4-ChatGPT-Project-Euler: 403
Error fetching repo BiomedSciAI/gene-benchmark: 403
Error fetching repo association-rosia/flair-2: 403
Error fetching repo rryam/LumoKit: 403
Error fetching repo taichengguo/LLM_MultiAgents_Survey_Papers: 403
Error fetching repo michaelfeil/infinity: 403
Error fetching repo business-science/ai-data-science-team: 403
Error fetching repo xiangsx/gpt4free-ts: 403
Error fetching repo xebia-functional/xef: 403
Error fetching repo yunwei37/Awesome-Prompt-Engineering-ZH-CN: 403
Error fetching repo build-on-aws/gen-ai-workshop: 403
Error fetching repo adrianhajdin/project_ai_mern_image_generation: 403
Error fetching repo autowarefoundation/autoware.privately-owned-vehicles: 403
Error fetching repo hanbyel0105/Diff

In [32]:
df_repos.to_csv("./temp/llm_kb.csv")