In [None]:
# Search for Python repositories with more than 100 stars; 10 results per page, requesting page 85
curl "https://api.github.com/search/repositories?q=language:python+stars:>100&per_page=10&page=85"

# Search for Python repositories with more than 200 stars (%3E is a URL-encoded ">"),
# excluding forks, pushed on or after 2023-01-01; 100 results per page, requesting page 1
curl "https://api.github.com/search/repositories?q=language:python+stars:%3E200+fork:false+pushed:%3E=2023-01-01&per_page=100&page=1"




In [32]:
import requests
import pandas as pd
import time
import urllib.parse
import os

# Configuration shared by every request in this notebook
GITHUB_API_URL = "https://api.github.com"
PER_PAGE = 100  # Largest page size the GitHub API accepts
ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN")  # Read from the environment; None when unset
TOPICS = ["machine-learning", "data-science", "ai"]  # Topics to search; edit to taste

# Send an Authorization header only when a token is available
headers = {"Authorization": f"token {ACCESS_TOKEN}"} if ACCESS_TOKEN else {}

# Helper: flatten one search hit into the row layout used by the CSV
def _repo_record(repo, topic):
    """Return a flat dict of the fields kept for one repository search result.

    Args:
        repo: One element of the ``items`` list returned by the search endpoint.
        topic: The topic whose search produced this hit; stored in the row.
    """
    license_info = repo["license"]
    return {
        "repo_id": repo["id"],
        "repo_name": repo["name"],
        "full_name": repo["full_name"],
        "owner_login": repo["owner"]["login"],
        "repo_url": repo["html_url"],
        "description": repo["description"],
        "primary_language": repo["language"],
        "topics": repo.get("topics", []),
        "license": license_info["name"] if license_info else None,
        "created_at": repo["created_at"],
        "updated_at": repo["updated_at"],
        "pushed_at": repo["pushed_at"],
        "size": repo["size"],
        "stargazers_count": repo["stargazers_count"],
        "watchers_count": repo["watchers_count"],
        "forks_count": repo["forks_count"],
        "open_issues_count": repo["open_issues_count"],
        "default_branch": repo["default_branch"],
        "score": repo.get("score"),
        "is_fork": repo["fork"],
        "visibility": repo.get("visibility", "public"),
        "topic": topic,  # Include the topic
    }


# Function to collect repository data
def collect_repo_data(topics=None, per_page=None, max_results=1000, pause=2, timeout=30):
    """Search GitHub for Python repositories per topic and return one dict per hit.

    For every topic the search endpoint is paged through (sorted by stars,
    descending) until all pages are read, a page comes back empty, or a
    request fails. A failure is printed and ends that topic only; the
    remaining topics are still processed. Progress is printed as it goes.

    Args:
        topics: Iterable of topic names. Defaults to the module-level TOPICS.
        per_page: Results per request. Defaults to the module-level PER_PAGE.
        max_results: Maximum number of results the API serves per query
            (the API caps at 1000 results); used to derive the page limit.
        pause: Seconds to sleep after each page to respect API rate limits.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts (see ``_repo_record``). The same repository can appear
        more than once when it matches several topics.
    """
    # Resolve defaults at call time so edits to the config cell are picked up
    topics = TOPICS if topics is None else topics
    per_page = PER_PAGE if per_page is None else per_page

    # Derive the page cap from the result cap instead of hard-coding 10,
    # which was only correct for per_page == 100
    max_pages = max(1, max_results // per_page)
    search_url = f"{GITHUB_API_URL}/search/repositories"

    repos_data = []
    for topic in topics:
        print(f"Processing topic: {topic}")
        # The query is the same for every page of a topic, so build it once
        query = f"language:python stars:>200 fork:false topic:{topic} pushed:>=2023-01-01"
        page = 1
        total_pages = 1  # Placeholder until the first response reports total_count
        while page <= total_pages:
            params = {
                "q": query,
                "sort": "stars",
                "order": "desc",
                "per_page": per_page,
                "page": page,
            }

            # A timeout keeps a stalled connection from hanging the notebook
            response = requests.get(search_url, headers=headers, params=params, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to fetch repositories: {response.status_code}")
                # Error bodies are not guaranteed to be JSON (e.g. proxy error pages)
                try:
                    print(response.json())
                except ValueError:
                    print(response.text)
                break

            data = response.json()
            if page == 1:
                total_count = data.get('total_count', 0)
                total_pages = min((total_count + per_page - 1) // per_page, max_pages)
                print(f"Total repositories found: {total_count}. Total pages: {total_pages}")

            items = data.get("items", [])
            if not items:
                print(f"No repositories found on page {page}")
                break

            repos_data.extend(_repo_record(repo, topic) for repo in items)

            print(f"Completed page {page}/{total_pages} for topic {topic}")
            page += 1
            time.sleep(pause)  # Sleep to respect API rate limits

    return repos_data

# Function to fetch READMEs for a list of repositories
def fetch_readmes(repo_full_names, output_dir="readmes", pause=0.5, timeout=30):
    """Download each repository's README and save it as a Markdown file.

    Each README is written to ``<output_dir>/<owner>_<repo>.md`` (the slash in
    the full name is replaced so it is a valid filename). Repositories whose
    README request does not return HTTP 200 are skipped and reported.

    Args:
        repo_full_names: Iterable of ``owner/repo`` strings.
        output_dir: Directory to write the files into; created if missing.
        pause: Seconds to sleep after every request to respect API rate limits.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of ``(repo_full_name, status_code)`` tuples, one per repository
        whose README could not be fetched.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Reuse the shared auth headers and ask for the raw media type; built once
    # because it does not vary between repositories
    readme_headers = {**headers, "Accept": "application/vnd.github.v3.raw"}

    readme_errors = []
    for repo_full_name in repo_full_names:
        readme_url = f"{GITHUB_API_URL}/repos/{repo_full_name}/readme"
        readme_resp = requests.get(readme_url, headers=readme_headers, timeout=timeout)

        if readme_resp.status_code == 200:
            # Replace slashes in the full name to make it a valid filename
            safe_name = repo_full_name.replace("/", "_")
            readme_path = os.path.join(output_dir, f"{safe_name}.md")
            with open(readme_path, "w", encoding="utf-8") as f:
                f.write(readme_resp.text)
        else:
            # Skip repositories without README, but remember why
            readme_errors.append((repo_full_name, readme_resp.status_code))

        # Pause after failures too: previously a failed request skipped the
        # sleep, so a streak of errors was sent with no delay at all
        time.sleep(pause)

    return readme_errors

# Run the collection, keep one row per repository, and write the result to CSV
repos_data = collect_repo_data()
repos_df = pd.DataFrame(repos_data).drop_duplicates(subset=["full_name"])
repos_df.to_csv("repositories.csv", index=False)



Processing topic: machine-learning
Total repositories found: 1237. Total pages: 10
Completed page 1/10 for topic machine-learning
Completed page 2/10 for topic machine-learning
Completed page 3/10 for topic machine-learning
Completed page 4/10 for topic machine-learning
Completed page 5/10 for topic machine-learning
Completed page 6/10 for topic machine-learning
Completed page 7/10 for topic machine-learning
Completed page 8/10 for topic machine-learning
Completed page 9/10 for topic machine-learning
Completed page 10/10 for topic machine-learning
Processing topic: data-science
Total repositories found: 328. Total pages: 4
Completed page 1/4 for topic data-science
Completed page 2/4 for topic data-science
Completed page 3/4 for topic data-science
Completed page 4/4 for topic data-science
Processing topic: ai
Total repositories found: 454. Total pages: 5
Completed page 1/5 for topic ai
Completed page 2/5 for topic ai
Completed page 3/5 for topic ai
Completed page 4/5 for topic ai
Comple

KeyboardInterrupt: 

In [37]:
# Write the de-duplicated table (one row per full_name) back to disk
(
    repos_df
    .drop_duplicates(subset=["full_name"])
    .to_csv("repositories.csv", index=False)
)

In [38]:
# Display the collected repositories table as the cell output
repos_df

Unnamed: 0,repo_id,repo_name,full_name,owner_login,repo_url,description,primary_language,topics,license,created_at,...,size,stargazers_count,watchers_count,forks_count,open_issues_count,default_branch,score,is_fork,visibility,topic
0,155220641,transformers,huggingface/transformers,huggingface,https://github.com/huggingface/transformers,🤗 Transformers: State-of-the-art Machine Learn...,Python,"[bert, deep-learning, flax, hacktoberfest, jax...",Apache License 2.0,2018-10-29T13:56:00Z,...,241457,132242,132242,26338,1452,main,1.0,False,public,machine-learning
1,65600975,pytorch,pytorch/pytorch,pytorch,https://github.com/pytorch/pytorch,Tensors and Dynamic neural networks in Python ...,Python,"[autograd, deep-learning, gpu, machine-learnin...",Other,2016-08-13T05:26:41Z,...,1018253,82267,82267,22129,15051,main,1.0,False,public,machine-learning
2,33015583,keras,keras-team/keras,keras-team,https://github.com/keras-team/keras,Deep Learning for humans,Python,"[data-science, deep-learning, jax, machine-lea...",Apache License 2.0,2015-03-28T00:35:42Z,...,43457,61636,61636,19422,242,master,1.0,False,public,machine-learning
3,101138315,d2l-zh,d2l-ai/d2l-zh,d2l-ai,https://github.com/d2l-ai/d2l-zh,《动手学深度学习》：面向中文读者、能运行、可讨论。中英文版被70多个国家的500多所大学用于教学。,Python,"[book, chinese, computer-vision, deep-learning...",Apache License 2.0,2017-08-23T04:40:24Z,...,316965,61492,61492,10867,84,master,1.0,False,public,machine-learning
4,843222,scikit-learn,scikit-learn/scikit-learn,scikit-learn,https://github.com/scikit-learn/scikit-learn,scikit-learn: machine learning in Python,Python,"[data-analysis, data-science, machine-learning...","BSD 3-Clause ""New"" or ""Revised"" License",2010-08-17T09:43:38Z,...,163634,59453,59453,25273,2087,main,1.0,False,public,machine-learning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1777,635924960,doppel-bot,modal-labs/doppel-bot,modal-labs,https://github.com/modal-labs/doppel-bot,Train a language model to answer Slack message...,Python,"[ai, artificial-intelligence, language-model, ...",MIT License,2023-05-03T18:50:36Z,...,26,207,207,27,2,main,1.0,False,public,ai
1778,808203681,JamAIBase,EmbeddedLLM/JamAIBase,EmbeddedLLM,https://github.com/EmbeddedLLM/JamAIBase,The collaborative spreadsheet for AI. Chain ce...,Python,"[agents, ai, ai-agents-framework, baas, backen...",Apache License 2.0,2024-05-30T15:31:08Z,...,12214,205,205,13,3,main,1.0,False,public,ai
1779,189202592,embedding-as-service,amansrivastava17/embedding-as-service,amansrivastava17,https://github.com/amansrivastava17/embedding-...,One-Stop Solution to encode sentence to fixed ...,Python,"[ai, albert, bert, bert-as-service, deep-learn...",MIT License,2019-05-29T10:20:24Z,...,2021,204,204,29,20,master,1.0,False,public,ai
1780,594360358,AI-Image-PromptGenerator,526christian/AI-Image-PromptGenerator,526christian,https://github.com/526christian/AI-Image-Promp...,A flexible UI script to help create and expand...,Python,"[ai, ai-art, generative-art, image-generation,...",,2023-01-28T09:57:20Z,...,70,204,204,41,0,main,1.0,False,public,ai
