In [10]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import pandas as pd
from IPython.display import display
from dotenv import load_dotenv
import os


In [11]:
load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

In [12]:
# Repositories
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# Filter for Java language
initial_count = len(repo_df)
repo_df = repo_df[repo_df['language'] == 'Java']
print(f"Filtered {initial_count} repos down to {len(repo_df)} using language='Java'.")

# Split the name abd put it in a List of Dict
repo_parts = repo_df['full_name'].str.split('/', n=1, expand=True)
REPOSITORIES = repo_parts.to_records(index=False).tolist()
print("\nFirst 5 entries of the generated REPOSITORIES list:")
print(REPOSITORIES[:5])

Filtered 2807 repos down to 86 using language='Java'.

First 5 entries of the generated REPOSITORIES list:
[('dotCMS', 'core'), ('MeteorDevelopment', 'meteor-client'), ('hyperledger', 'besu'), ('apache', 'pulsar'), ('wso2', 'product-is')]


In [13]:
# ============================================================
# Helper: Total Contributors
# ============================================================
def get_total_contributors(owner, repo, headers):
    """
    Retrieves the total number of contributors for a GitHub repository 
    using the Link header pagination trick.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contributors"
    params = {"per_page": 1, "anon": "true"}  # include anonymous contributors

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        link_header = response.headers.get('Link')

        if link_header:
            match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if match:
                return int(match.group(1))
        # fallback for small repos
        return 1
    except requests.exceptions.RequestException as e:
        print(f"Error fetching contributors for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Commit Count
# ============================================================
def get_commit_count(owner, repo, headers):
    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"per_page": 1}

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        link_header = response.headers.get('Link')

        if link_header:
            match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if match:
                return int(match.group(1))

        # fallback for repos with few commits
        response = requests.get(url, headers=headers, params={"per_page": 100}, timeout=10)
        return len(response.json())
    except requests.exceptions.RequestException as e:
        print(f"Error fetching commits for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Issues and Pull Requests
# ============================================================
def get_issue_and_pull_metrics(owner, repo, headers):
    base_search_url = "https://api.github.com/search/issues"

    queries = {
        'open_issues': f"repo:{owner}/{repo} is:issue is:open",
        'closed_issues': f"repo:{owner}/{repo} is:issue is:closed",
        'open_prs': f"repo:{owner}/{repo} is:pr is:open",
        'closed_prs': f"repo:{owner}/{repo} is:pr is:closed",
    }

    metrics = {'open_issues': 0, 'closed_issues': 0, 'total_pull_requests': 0}

    for key, query in queries.items():
        try:
            response = requests.get(base_search_url, headers=headers, params={'q': query, 'per_page': 1}, timeout=10)
            response.raise_for_status()
            total = response.json().get('total_count', 0)

            if key.startswith('open_issue'):
                metrics['open_issues'] = total
            elif key.startswith('closed_issue'):
                metrics['closed_issues'] = total
            else:
                metrics['total_pull_requests'] += total

        except requests.exceptions.RequestException:
            continue

    return metrics

# ============================================================
# Main Function: Repository Metrics
# ============================================================
def get_repo_metrics(owner, repo, github_token=None):
    """
    Retrieves metrics for a GitHub repository:
    Watchers, Duration, Commits, Issues, Pull Requests, Contributors
    """
    repo_url = f"https://api.github.com/repos/{owner}/{repo}"
    headers = {}
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    try:
        response = requests.get(repo_url, headers=headers, timeout=10)
        response.raise_for_status()
        repo_data = response.json()

        # Basic metrics
        num_watchers = repo_data.get('stargazers_count', 0)
        num_commits = get_commit_count(owner, repo, headers)
        issue_pr_metrics = get_issue_and_pull_metrics(owner, repo, headers)
        num_closed_issues = issue_pr_metrics['closed_issues']
        num_open_issues = issue_pr_metrics['open_issues']
        num_pull_reqs = issue_pr_metrics['total_pull_requests']
        num_contributors = get_total_contributors(owner, repo, headers)

        # Duration (days since creation)
        created_at_str = repo_data.get('created_at')
        num_days = 0
        if created_at_str:
            created_at_aware = parser.parse(created_at_str)
            now_utc = datetime.now(timezone.utc)
            num_days = (now_utc - created_at_aware).days

        return {
            "Repo": f"{owner}/{repo}",
            "NumWatchers": num_watchers,
            "NumDays": num_days,
            "NumCommits": num_commits,
            "NumOpenIssues": num_open_issues,
            "NumClosedIssues": num_closed_issues,
            "NumPullReqs": num_pull_reqs,
            "NumContributors": num_contributors,
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {owner}/{repo}: {e}")
        return None

# ============================================================
# MAIN PROGRAM
# ============================================================
results = []
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")

# limit the number of repositories processed here for testing REPOSITORIES[:10]:
for owner, repo in REPOSITORIES:
    metrics = get_repo_metrics(owner, repo, GITHUB_TOKEN)
    if metrics:
        results.append(metrics)

# Create the Metric DataFrame 
metrics_df = pd.DataFrame(results)


Starting data retrieval... (may take a moment due to multiple API calls)


In [14]:
# Merge the 'stars' column from the original dataset (repo_df) into the API metrics DataFrame (metrics_df) using the repository name.
final_df = pd.merge(
    metrics_df, 
    repo_df[['full_name', 'stars', 'forks']],
    left_on='Repo', 
    right_on='full_name', 
    how='left' # Use a left merge to keep all API results
)

# Clean up and reorder columns
final_df = final_df.drop(columns=['full_name'])
final_df = final_df.rename(columns={
    'NumWatchers': 'Watchers',
    'NumDays': 'Duration_Days',
    'NumCommits': 'Commits', 
    'NumOpenIssues': 'Open_Issues', 
    'NumClosedIssues': 'Closed_Issues', 
    'NumPullReqs': 'Pull_Requests', 
    'NumContributors': 'Contributors',
    'stars': 'Stars',
    'forks': 'Forks' 
})

# Define the final column order
column_order = [
    'Repo', 'Contributors','Stars', 'Forks', 'Watchers', 'Duration_Days', 'Commits', 
    'Open_Issues', 'Closed_Issues', 'Pull_Requests', 
]

# Apply the final column order
final_df = final_df[column_order]

print("\n--- Repository Metrics DataFrame Created ---")
display(final_df)
print(f"\nTotal rows in DataFrame: {len(final_df)}")

# Uncomment the line below to save your metrics to a file
final_df.to_parquet("repo_metrics_with_stars.parquet", index=False)
final_df.to_csv("repo_metrics_with_stars.csv", index=False)


--- Repository Metrics DataFrame Created ---


Unnamed: 0,Repo,Contributors,Stars,Forks,Watchers,Duration_Days,Commits,Open_Issues,Closed_Issues,Pull_Requests
0,dotCMS/core,156,912,477,923,4990,27356,581,18530,14635
1,MeteorDevelopment/meteor-client,195,2823,1149,3046,1835,4582,604,3376,1581
2,hyperledger/besu,246,1671,948,1724,2260,6251,132,3020,6238
3,apache/pulsar,760,14760,3646,14955,3424,13818,1211,6112,16822
4,wso2/product-is,729,799,820,824,4223,17386,1377,15797,8921
...,...,...,...,...,...,...,...,...,...,...
81,jbangdev/jbang,103,1609,171,1664,2149,2134,0,0,0
82,lunasaw/gb28181-proxy,4,113,26,134,763,312,0,0,0
83,liyupi/yu-ai-agent,1,741,163,1161,204,16,0,0,0
84,rieckpil/testing-spring-boot-applications-mast...,7,318,255,319,1999,509,0,0,0



Total rows in DataFrame: 86
