In [19]:
import requests
from datetime import datetime, timezone
from dateutil import parser # A helpful library for parsing date strings
import re
import pandas as pd
from dotenv import load_dotenv
import os


In [20]:
# Read the GitHub token from the local env file into the process environment.
load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

# Without a token every request below is sent unauthenticated; say so up
# front instead of letting the run fail later with opaque HTTP errors.
if not GITHUB_TOKEN:
    print(
        "Warning: GITHUB_API_KEY is not set; GitHub API requests will be "
        "unauthenticated and subject to much lower rate limits."
    )

In [21]:
# Load the repository table of the AIDev dataset.
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# "owner/name" -> one column per part (split on the first slash only).
repo_parts = repo_df['full_name'].str.split('/', n=1, expand=True)

# One plain tuple per row, in dataset order.
REPOSITORIES = list(repo_parts.itertuples(index=False, name=None))

print("\nFirst 5 entries of the generated REPOSITORIES list:")
print(REPOSITORIES[:5])


First 5 entries of the generated REPOSITORIES list:
[('kizuna-ai-lab', 'sokuji'), ('freenet', 'freenet-core'), ('coleam00', 'mcp-crawl4ai-rag'), ('vexxhost', 'atmosphere'), ('JonasKruckenberg', 'k23')]


In [22]:
# ============================================================
# Helper: Total Contributors
# ============================================================
def get_total_contributors(owner, repo, headers):
    """
    Return the number of contributors of a GitHub repository.

    The contributors list is requested with ``per_page=1`` so that the page
    number of the ``rel="last"`` entry in the ``Link`` response header equals
    the number of contributors (anonymous contributors included).

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        headers: HTTP headers sent with the request (e.g. authorization).

    Returns:
        int: The contributor count. 0 when the request fails or the API
        answers 204 No Content; 1 when the response carries no ``last`` link.
    """
    # NOTE: a second definition with the same name appears later in this
    # cell and replaces this one when the cell runs.
    url = f"https://api.github.com/repos/{owner}/{repo}/contributors"
    params = {"per_page": 1, "anon": "true"}  # include anonymous contributors

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching contributors for {owner}/{repo}: {e}")
        return 0

    # 204 No Content is what the endpoint sends for an empty repository;
    # without this check it would be reported as one contributor.
    if response.status_code == 204:
        return 0

    link_header = response.headers.get('Link') or ""
    # Anchor on "?page=" / "&page=" so "per_page=1" can never be mistaken for
    # the page number, and allow other query parameters to follow it.
    match = re.search(r'[?&]page=(\d+)[^>]*>;\s*rel="last"', link_header)
    if match:
        return int(match.group(1))

    # No "last" link: everything fit on a single one-item page.
    return 1

# ============================================================
# Helper: Commit Count
# ============================================================
def get_commit_count(owner, repo, headers):
    """
    Return the number of commits listed by the repository's commits endpoint.

    The commits list is requested with ``per_page=1`` so that the page number
    of the ``rel="last"`` entry in the ``Link`` header equals the commit
    count. When there is no such link, the list is fetched once and counted.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        headers: HTTP headers sent with the requests (e.g. authorization).

    Returns:
        int: The commit count, or 0 when a request fails or the fallback
        response is not a JSON list.
    """
    # NOTE: a second definition with the same name appears later in this
    # cell and replaces this one when the cell runs.
    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"per_page": 1}

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()

        link_header = response.headers.get('Link') or ""
        # Anchor on "?page=" / "&page=" so "per_page=1" is never matched and
        # trailing query parameters after the page number are tolerated.
        match = re.search(r'[?&]page=(\d+)[^>]*>;\s*rel="last"', link_header)
        if match:
            return int(match.group(1))

        # fallback for repos with few commits
        response = requests.get(url, headers=headers, params={"per_page": 100}, timeout=10)
        # An error body is a JSON object; counting its keys would be reported
        # as a commit count, so fail loudly on HTTP errors first.
        response.raise_for_status()
        commits = response.json()
        return len(commits) if isinstance(commits, list) else 0
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an undecodable JSON body.
        print(f"Error fetching commits for {owner}/{repo}: {e}")
        return 0
    
    
# ============================================================
# Helper: Issues and Pull Requests Queries
# ============================================================
def get_issue_and_pull_metrics(owner, repo, headers):
    """
    Count open issues, closed issues and pull requests of a repository.

    Each count is the ``total_count`` of one issue-search query, requested
    with ``per_page=1`` because only the total is needed.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        headers: HTTP headers sent with the requests (e.g. authorization).

    Returns:
        dict: ``{'open_issues': int, 'closed_issues': int,
        'total_pull_requests': int}``. A metric whose query fails is left at
        0 and the failure is printed.
    """
    # NOTE: a second definition with the same name appears later in this
    # cell and replaces this one when the cell runs.
    base_search_url = "https://api.github.com/search/issues"
    scope = f"repo:{owner}/{repo}"

    # One query per reported metric. "is:pr" without a state qualifier covers
    # open and closed pull requests in a single search call.
    queries = {
        'open_issues': f"{scope} is:issue is:open",
        'closed_issues': f"{scope} is:issue is:closed",
        'total_pull_requests': f"{scope} is:pr",
    }

    metrics = dict.fromkeys(queries, 0)

    for key, query in queries.items():
        try:
            response = requests.get(
                base_search_url,
                headers=headers,
                params={'q': query, 'per_page': 1},
                timeout=10,
            )
            response.raise_for_status()
            metrics[key] = response.json().get('total_count', 0)
        except requests.exceptions.RequestException as e:
            # Best effort: keep going, but make the failure visible so a 0 in
            # the result can be told apart from a real zero.
            print(f"Error fetching {key} for {owner}/{repo}: {e}")

    return metrics

# ============================================================
# Main Function: Repository Metrics
# ============================================================
def get_repo_metrics(owner, repo, github_token=None):
    """
    Collect one row of metrics for a GitHub repository.

    Fetches the repository record, then asks the commit, issue/PR and
    contributor helpers for their counts, and derives the repository age in
    whole days from its ``created_at`` timestamp.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        github_token: Optional token; when truthy it is sent as an
            ``Authorization: token ...`` header on every request.

    Returns:
        dict | None: Keys ``Repo``, ``NumWatchers``, ``NumDays``,
        ``NumCommits``, ``NumOpenIssues``, ``NumClosedIssues``,
        ``NumPullReqs``, ``NumContributors`` (in that order), or ``None``
        when the repository request fails.
    """
    # NOTE: a second definition with the same name appears later in this
    # cell and replaces this one when the cell runs.
    auth_headers = {"Authorization": f"token {github_token}"} if github_token else {}

    try:
        api_response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            headers=auth_headers,
            timeout=10,
        )
        api_response.raise_for_status()
        info = api_response.json()

        # NOTE(review): "NumWatchers" is read from `stargazers_count`;
        # confirm that stars are the intended "watchers" metric.
        stars = info.get('stargazers_count', 0)
        commits = get_commit_count(owner, repo, auth_headers)

        counts = get_issue_and_pull_metrics(owner, repo, auth_headers)
        closed_issues = counts['closed_issues']
        open_issues = counts['open_issues']
        pull_requests = counts['total_pull_requests']

        contributors = get_total_contributors(owner, repo, auth_headers)

        # Age in whole days, measured in UTC; stays 0 without a timestamp.
        age_days = 0
        created_raw = info.get('created_at')
        if created_raw:
            created_dt = parser.parse(created_raw)
            age_days = (datetime.now(timezone.utc) - created_dt).days

        return {
            "Repo": f"{owner}/{repo}",
            "NumWatchers": stars,
            "NumDays": age_days,
            "NumCommits": commits,
            "NumOpenIssues": open_issues,
            "NumClosedIssues": closed_issues,
            "NumPullReqs": pull_requests,
            "NumContributors": contributors,
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {owner}/{repo}: {e}")
        return None
    


# ============================================================
# Helper: Total Contributors
# ============================================================
def get_total_contributors(owner, repo, headers):
    """
    Return the number of contributors of a GitHub repository.

    The contributors list is requested with ``per_page=1`` so that the page
    number of the ``rel="last"`` entry in the ``Link`` response header equals
    the number of contributors (anonymous contributors included).

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        headers: HTTP headers sent with the request (e.g. authorization).

    Returns:
        int: The contributor count. 0 when the request fails or the API
        answers 204 No Content; 1 when the response carries no ``last`` link.
    """
    # NOTE: this redefines a function of the same name declared earlier in
    # this cell; this later definition is the one in effect.
    url = f"https://api.github.com/repos/{owner}/{repo}/contributors"
    params = {"per_page": 1, "anon": "true"}  # include anonymous contributors

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching contributors for {owner}/{repo}: {e}")
        return 0

    # 204 No Content is what the endpoint sends for an empty repository;
    # without this check it would be reported as one contributor.
    if response.status_code == 204:
        return 0

    link_header = response.headers.get('Link') or ""
    # Anchor on "?page=" / "&page=" so "per_page=1" can never be mistaken for
    # the page number, and allow other query parameters to follow it.
    match = re.search(r'[?&]page=(\d+)[^>]*>;\s*rel="last"', link_header)
    if match:
        return int(match.group(1))

    # No "last" link: everything fit on a single one-item page.
    return 1

# ============================================================
# Helper: Commit Count
# ============================================================
def get_commit_count(owner, repo, headers):
    """
    Return the number of commits listed by the repository's commits endpoint.

    The commits list is requested with ``per_page=1`` so that the page number
    of the ``rel="last"`` entry in the ``Link`` header equals the commit
    count. When there is no such link, the list is fetched once and counted.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        headers: HTTP headers sent with the requests (e.g. authorization).

    Returns:
        int: The commit count, or 0 when a request fails or the fallback
        response is not a JSON list.
    """
    # NOTE: this redefines a function of the same name declared earlier in
    # this cell; this later definition is the one in effect.
    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"per_page": 1}

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()

        link_header = response.headers.get('Link') or ""
        # Anchor on "?page=" / "&page=" so "per_page=1" is never matched and
        # trailing query parameters after the page number are tolerated.
        match = re.search(r'[?&]page=(\d+)[^>]*>;\s*rel="last"', link_header)
        if match:
            return int(match.group(1))

        # fallback for repos with few commits
        response = requests.get(url, headers=headers, params={"per_page": 100}, timeout=10)
        # An error body is a JSON object; counting its keys would be reported
        # as a commit count, so fail loudly on HTTP errors first.
        response.raise_for_status()
        commits = response.json()
        return len(commits) if isinstance(commits, list) else 0
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an undecodable JSON body.
        print(f"Error fetching commits for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Issues and Pull Requests
# ============================================================
def get_issue_and_pull_metrics(owner, repo, headers):
    """
    Count open issues, closed issues and pull requests of a repository.

    Each count is the ``total_count`` of one issue-search query, requested
    with ``per_page=1`` because only the total is needed.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        headers: HTTP headers sent with the requests (e.g. authorization).

    Returns:
        dict: ``{'open_issues': int, 'closed_issues': int,
        'total_pull_requests': int}``. A metric whose query fails is left at
        0 and the failure is printed.
    """
    # NOTE: this redefines a function of the same name declared earlier in
    # this cell; this later definition is the one in effect.
    base_search_url = "https://api.github.com/search/issues"
    scope = f"repo:{owner}/{repo}"

    # One query per reported metric. "is:pr" without a state qualifier covers
    # open and closed pull requests in a single search call.
    queries = {
        'open_issues': f"{scope} is:issue is:open",
        'closed_issues': f"{scope} is:issue is:closed",
        'total_pull_requests': f"{scope} is:pr",
    }

    metrics = dict.fromkeys(queries, 0)

    for key, query in queries.items():
        try:
            response = requests.get(
                base_search_url,
                headers=headers,
                params={'q': query, 'per_page': 1},
                timeout=10,
            )
            response.raise_for_status()
            metrics[key] = response.json().get('total_count', 0)
        except requests.exceptions.RequestException as e:
            # Best effort: keep going, but make the failure visible so a 0 in
            # the result can be told apart from a real zero.
            print(f"Error fetching {key} for {owner}/{repo}: {e}")

    return metrics

# ============================================================
# Main Function: Repository Metrics
# ============================================================
def get_repo_metrics(owner, repo, github_token=None):
    """
    Collect one row of metrics for a GitHub repository.

    Fetches the repository record, then asks the commit, issue/PR and
    contributor helpers for their counts, and derives the repository age in
    whole days from its ``created_at`` timestamp.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.
        github_token: Optional token; when truthy it is sent as an
            ``Authorization: token ...`` header on every request.

    Returns:
        dict | None: Keys ``Repo``, ``NumWatchers``, ``NumDays``,
        ``NumCommits``, ``NumOpenIssues``, ``NumClosedIssues``,
        ``NumPullReqs``, ``NumContributors`` (in that order), or ``None``
        when the repository request fails.
    """
    # NOTE: this redefines a function of the same name declared earlier in
    # this cell; this later definition is the one in effect.
    auth_headers = {"Authorization": f"token {github_token}"} if github_token else {}

    try:
        api_response = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}",
            headers=auth_headers,
            timeout=10,
        )
        api_response.raise_for_status()
        info = api_response.json()

        # NOTE(review): "NumWatchers" is read from `stargazers_count`;
        # confirm that stars are the intended "watchers" metric.
        stars = info.get('stargazers_count', 0)
        commits = get_commit_count(owner, repo, auth_headers)

        counts = get_issue_and_pull_metrics(owner, repo, auth_headers)
        closed_issues = counts['closed_issues']
        open_issues = counts['open_issues']
        pull_requests = counts['total_pull_requests']

        contributors = get_total_contributors(owner, repo, auth_headers)

        # Age in whole days, measured in UTC; stays 0 without a timestamp.
        age_days = 0
        created_raw = info.get('created_at')
        if created_raw:
            created_dt = parser.parse(created_raw)
            age_days = (datetime.now(timezone.utc) - created_dt).days

        return {
            "Repo": f"{owner}/{repo}",
            "NumWatchers": stars,
            "NumDays": age_days,
            "NumCommits": commits,
            "NumOpenIssues": open_issues,
            "NumClosedIssues": closed_issues,
            "NumPullReqs": pull_requests,
            "NumContributors": contributors,
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {owner}/{repo}: {e}")
        return None

# ============================================================
# MAIN PROGRAM
# ============================================================
if __name__ == "__main__":
    # --- 2. Fetch metrics for all repositories ---
    results = []
    print("\nStarting data retrieval... (may take a moment due to multiple API calls)")

    # Only the first two repositories are processed here to keep the run
    # short; widen the slice (e.g. REPOSITORIES[:10]) for a larger sample.
    for owner, repo in REPOSITORIES[:2]:
        metrics = get_repo_metrics(owner, repo, GITHUB_TOKEN)
        # get_repo_metrics returns None when the repository request fails.
        if metrics:
            results.append(metrics)

    # --- 3. Create and display a DataFrame instead of printing a table ---
    # Map each metric key to its DataFrame-friendly column name. Selecting and
    # renaming by name (rather than assigning `.columns` positionally) keeps
    # the schema intact even when `results` is empty, where a positional
    # assignment would raise a length-mismatch ValueError.
    column_names = {
        "Repo": "Repo",
        "NumWatchers": "Watchers",
        "NumDays": "Duration_Days",
        "NumCommits": "Commits",
        "NumOpenIssues": "Open_Issues",
        "NumClosedIssues": "Closed_Issues",
        "NumPullReqs": "Pull_Requests",
        "NumContributors": "Contributors",
    }
    metrics_df = pd.DataFrame(results, columns=list(column_names)).rename(columns=column_names)

    print("\n--- Repository Metrics DataFrame Created ---")
    print(metrics_df.head())
    print(f"\nTotal rows in DataFrame: {len(metrics_df)}")

    # Uncomment the line below to save your metrics to a file
    # metrics_df.to_parquet("repo_metrics.parquet", index=False)
    # print("\nData saved to repo_metrics.parquet")


Starting data retrieval... (may take a moment due to multiple API calls)

--- Repository Metrics DataFrame Created ---
                   Repo  Watchers  Duration_Days  Commits  Open_Issues  \
0  kizuna-ai-lab/sokuji       311            211      587           17   
1  freenet/freenet-core      2456           1579     2735           93   

   Closed_Issues  Pull_Requests  Contributors  
0             33             16             2  
1            259           1644            33  

Total rows in DataFrame: 2
