In [None]:
import os
import re
import time

import pandas as pd
import requests

In [None]:
# Read the GitHub personal access token from the environment so the secret is
# never stored in the notebook, e.g. run `export GITHUB_TOKEN=<token>` in the
# shell (or set os.environ["GITHUB_TOKEN"] in a private cell) before this cell.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
if not GITHUB_TOKEN:
    print("GITHUB_TOKEN is not set; requests will be sent unauthenticated.")

# Only send an Authorization header when a token is actually available.
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Commit-search endpoint and the search phrases that are scraped one after another.
SEARCH_URL = "https://api.github.com/search/commits"
QUERY_LIST = ["fix OR bug OR error", "patch", "syntax error", "issue fixed", "refactor"]

In [None]:
def search_commits(keyword, language="Python", per_page=100, page=1, max_retries=3):
    """Fetch one page of GitHub commit-search results for ``keyword``.

    Args:
        keyword: Search terms for the ``q`` parameter.
        language: Value appended to the query as a ``language:`` qualifier.
        per_page: Number of results requested per page.
        page: Page number to request.
        max_retries: How many times a rate-limited request (HTTP 403 or 429)
            is retried after sleeping before giving up.

    Returns:
        The list stored under ``items`` in the JSON response, or an empty
        list when the request fails, returns another non-200 status, or is
        still rate limited after ``max_retries`` retries.
    """
    # Let requests build the query string so spaces and special characters in
    # the keyword are encoded correctly.
    params = {
        "q": f"{keyword} language:{language}",
        "sort": "author-date",
        "order": "desc",
        "per_page": per_page,
        "page": page,
    }

    for attempt in range(max_retries + 1):
        try:
            response = requests.get(SEARCH_URL, headers=HEADERS, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Search request failed: {exc}")
            return []

        status = response.status_code
        if status == 200:
            return response.json().get("items", [])

        # Anything other than a rate-limit response is not worth retrying.
        if status not in (403, 429) or attempt == max_retries:
            return []

        # Retry after waiting instead of returning [], which the caller would
        # misread as "no more results" and abandon the query.
        retry_after = str(response.headers.get("Retry-After", ""))
        wait_seconds = int(retry_after) if retry_after.isdigit() else 60
        print(f"Rate limit hit! Sleeping for {wait_seconds} seconds...")
        time.sleep(wait_seconds)

    return []

In [None]:
def _split_patch(patch):
    """Split diff text into (removed_lines, added_lines) with the -/+ marker stripped."""
    removed, added = [], []
    for line in patch.split("\n"):
        if line.startswith("-"):
            removed.append(line[1:])
        elif line.startswith("+"):
            added.append(line[1:])
    return removed, added


def get_code_diff(repo_full_name, commit_sha):
    """Return the removed and added Python code of one commit.

    Args:
        repo_full_name: Repository in ``owner/name`` form, used in the API URL.
        commit_sha: SHA of the commit to fetch.

    Returns:
        A ``(buggy_code, fixed_code)`` pair of strings. ``buggy_code`` joins
        the removed lines and ``fixed_code`` the added lines of every ``.py``
        file that has both. Both are empty strings when no file qualifies,
        and ``(None, None)`` is returned when the request fails or does not
        answer with HTTP 200.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/commits/{commit_sha}"
    try:
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        # A single unreachable commit should not abort a long scraping run.
        print(f"Request for commit {commit_sha} failed: {exc}")
        return None, None
    if response.status_code != 200:
        return None, None

    buggy_code, fixed_code = [], []

    for file in response.json().get("files", []):
        if not file["filename"].endswith(".py"):  # Filter only Python files
            continue

        # Treat a missing or null patch as an empty diff.
        buggy_lines, fixed_lines = _split_patch(file.get("patch") or "")

        # Keep only files that both remove and add lines, so each buggy
        # snippet has a fixed counterpart.
        if buggy_lines and fixed_lines:
            buggy_code.append("\n".join(buggy_lines))
            fixed_code.append("\n".join(fixed_lines))

    return "\n".join(buggy_code), "\n".join(fixed_code)

In [None]:
# Collection stops once this many rows have been gathered; the counter
# advances by two per usable commit (one buggy row, one bug-free row).
MAX_COMMITS = 100_000

# Accumulators filled in by the scraping loop.
fetched_count = 0
data = []

In [None]:
def collect_samples(queries, max_samples):
    """Scrape labelled code samples for each query until ``max_samples`` is reached.

    Every query is paged through with ``search_commits`` and each commit's
    diff is fetched with ``get_code_diff``. A commit that yields both removed
    and added code contributes two rows: ``(removed, "buggy")`` and
    ``(added, "bug-free")``.

    Args:
        queries: Search phrases to run one after another.
        max_samples: Stop collecting once at least this many rows exist.

    Returns:
        A list of ``(code, label)`` tuples.
    """
    samples = []
    # The same commit can match several queries (or appear in forks); remember
    # processed SHAs so it is fetched and stored only once.
    seen_shas = set()

    for query in queries:
        page = 1
        while len(samples) < max_samples:
            print(f"Fetching commits with query: {query} (Page {page})")
            commits = search_commits(query, per_page=100, page=page)

            if not commits:
                break  # No more commits to fetch for this query

            for commit in commits:
                if len(samples) >= max_samples:
                    break

                commit_sha = commit["sha"]
                if commit_sha in seen_shas:
                    continue
                seen_shas.add(commit_sha)

                repo_name = commit["repository"]["full_name"]
                buggy, fixed = get_code_diff(repo_name, commit_sha)

                if buggy and fixed:
                    samples.append((buggy, "buggy"))
                    samples.append((fixed, "bug-free"))

            page += 1
            time.sleep(2)  # Pause between pages to stay under the GitHub API rate limits

    return samples


# Rebuild the dataset from scratch on every run so re-executing this cell
# never appends a second copy of the samples.
data = collect_samples(QUERY_LIST, MAX_COMMITS)
fetched_count = len(data)

Fetching commits with query: fix OR bug OR error (Page 1)
Fetching commits with query: fix OR bug OR error (Page 2)
Rate limit hit! Sleeping for 60 seconds...
Fetching commits with query: patch (Page 1)
Fetching commits with query: patch (Page 2)
Fetching commits with query: patch (Page 3)
Fetching commits with query: patch (Page 4)
Fetching commits with query: patch (Page 5)
Fetching commits with query: patch (Page 6)
Fetching commits with query: patch (Page 7)
Fetching commits with query: patch (Page 8)
Fetching commits with query: patch (Page 9)
Fetching commits with query: patch (Page 10)
Fetching commits with query: patch (Page 11)
Fetching commits with query: syntax error (Page 1)
Fetching commits with query: syntax error (Page 2)
Fetching commits with query: syntax error (Page 3)
Fetching commits with query: issue fixed (Page 1)
Fetching commits with query: issue fixed (Page 2)
Fetching commits with query: refactor (Page 1)
Fetching commits with query: refactor (Page 2)
Fetching

In [None]:
# Write the collected (code, label) rows to disk as a two-column CSV.
df = pd.DataFrame(data=data, columns=["code", "label"])
df.to_csv(path_or_buf="github_scraped.csv", index=False)

print(f"Dataset saved as github_scraped.csv with {df.shape[0]} samples.")

Dataset saved as github_scraped.csv with 486 samples.


In [None]:
# Trigger a browser download of the CSV when running inside Google Colab.
# The google.colab package only exists there, so the import is guarded to keep
# "Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    print("google.colab is unavailable; skipping the browser download of github_scraped.csv.")
else:
    files.download('github_scraped.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>