In [22]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import numpy as np
import pandas as pd
from IPython.display import display
from dotenv import load_dotenv
import os


In [23]:
load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

In [24]:
# Repositories
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# Pull Request
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet", columns=['repo_id', 'merged_at'])


In [25]:
# Filter for Repo that has Java language
initial_count = len(repo_df)
repo_df = repo_df[repo_df['language'] == 'Java']
print(f"Filtered {initial_count} repos down to {len(repo_df)} using language='Java'.")

Filtered 2807 repos down to 86 using language='Java'.


In [30]:
# Join Repo and PR table based on repo id
merged_pr_df = pr_df.merge(
        repo_df[['id', 'full_name']], 
        left_on='repo_id', 
        right_on='id', 
        how='inner' 
    ).drop(columns=['id', 'repo_id'])

# Filter PRs that were rejected (not merged)
merged_pr_df['is_accepted'] = merged_pr_df['merged_at'].notnull()

# Group and Aggregate: Group by the new 'full_name' column
pr_metrics = merged_pr_df.groupby('full_name').agg(
    total_prs=('is_accepted', 'size'),        
    accepted_prs=('is_accepted', 'sum')       
).reset_index()

# Calculate Rejected: Subtract accepted from total
pr_metrics['rejected_prs'] = pr_metrics['total_prs'] - pr_metrics['accepted_prs']

# Prepare for Merge: Rename the key column
pr_metrics = pr_metrics[['full_name', 'accepted_prs', 'rejected_prs']]

# print to csv for checking
print(f"Aggregated metrics for {len(pr_metrics)} unique repositories ready for merge.")
pr_metrics.to_csv("accepted_rejected_repo.csv", index=False)

Aggregated metrics for 86 unique repositories ready for merge.


In [35]:
# Accepted PR if accepted_pr >= rejected_pr,
pr_metrics['pr_status'] = np.where(
    pr_metrics['accepted_prs'] >= pr_metrics['rejected_prs'], 
    'Accepted', 
    'Rejected'
)

# Filter the DataFrame into two parts
accepted_repos_df = pr_metrics[pr_metrics['pr_status'] == 'Accepted']
rejected_repos_df = pr_metrics[pr_metrics['pr_status'] == 'Rejected']

display(rejected_repos_df)

Unnamed: 0,full_name,accepted_prs,rejected_prs,pr_status
0,1c-syntax/bsl-language-server,1,2,Rejected
1,2006-Scape/2006Scape,0,1,Rejected
2,AutoMQ/automq,0,1,Rejected
3,Azure-Samples/azure-search-openai-demo-java,0,1,Rejected
4,Azure/azure-sdk-for-java,5,10,Rejected
6,CarGuo/GSYVideoPlayer,0,1,Rejected
9,GrimAnticheat/Grim,0,1,Rejected
10,Igalia/wolvic,0,1,Rejected
12,Kaljurand/K6nele,0,2,Rejected
13,MeteorDevelopment/meteor-client,0,1,Rejected


In [None]:
# Split the name and put it in a List of Dict (not needed but ehh accidentally made the method like that)
repo_parts = accepted_repos_df['full_name'].str.split('/', n=1, expand=True)
ACCEPTED_REPOSITORIES = repo_parts.to_records(index=False).tolist()
print("\nFirst 5 entries of the generated REPOSITORIES list:")
print(ACCEPTED_REPOSITORIES[:5])

# Split rejected repo full_name
repo_parts = rejected_repos_df['full_name'].str.split('/', n=1, expand=True)
REJECTED_REPOSITORIES = repo_parts.to_records(index=False).tolist()
print("\nFirst 5 entries of the generated REPOSITORIES list:")
print(REJECTED_REPOSITORIES[:5])


First 5 entries of the generated REPOSITORIES list:
[('1c-syntax', 'bsl-language-server'), ('2006-Scape', '2006Scape'), ('AutoMQ', 'automq'), ('Azure-Samples', 'azure-search-openai-demo-java'), ('Azure', 'azure-sdk-for-java')]


In [None]:
# ============================================================
# Helper: Total Contributors
# ============================================================
def get_total_contributors(owner, repo, headers):
    """
    Retrieves the total number of contributors for a GitHub repository 
    using the Link header pagination trick.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contributors"
    params = {"per_page": 1, "anon": "true"}  # include anonymous contributors

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        link_header = response.headers.get('Link')

        if link_header:
            match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if match:
                return int(match.group(1))
        # fallback for small repos
        return 1
    except requests.exceptions.RequestException as e:
        print(f"Error fetching contributors for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Commit Count
# ============================================================
def get_commit_count(owner, repo, headers):
    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"per_page": 1}

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        link_header = response.headers.get('Link')

        if link_header:
            match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if match:
                return int(match.group(1))

        # fallback for repos with few commits
        response = requests.get(url, headers=headers, params={"per_page": 100}, timeout=10)
        return len(response.json())
    except requests.exceptions.RequestException as e:
        print(f"Error fetching commits for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Issues and Pull Requests
# ============================================================
def get_issue_and_pull_metrics(owner, repo, headers):
    base_search_url = "https://api.github.com/search/issues"

    queries = {
        'open_issues': f"repo:{owner}/{repo} is:issue is:open",
        'closed_issues': f"repo:{owner}/{repo} is:issue is:closed",
        'open_prs': f"repo:{owner}/{repo} is:pr is:open",
        'closed_prs': f"repo:{owner}/{repo} is:pr is:closed",
    }

    metrics = {'open_issues': 0, 'closed_issues': 0, 'total_pull_requests': 0}

    for key, query in queries.items():
        try:
            response = requests.get(base_search_url, headers=headers, params={'q': query, 'per_page': 1}, timeout=10)
            response.raise_for_status()
            total = response.json().get('total_count', 0)

            if key.startswith('open_issue'):
                metrics['open_issues'] = total
            elif key.startswith('closed_issue'):
                metrics['closed_issues'] = total
            else:
                metrics['total_pull_requests'] += total

        except requests.exceptions.RequestException:
            continue

    return metrics

# ============================================================
# Main Function: Repository Metrics
# ============================================================
def get_repo_metrics(owner, repo, github_token=None):
    """
    Retrieves metrics for a GitHub repository:
    Watchers, Duration, Commits, Issues, Pull Requests, Contributors
    """
    repo_url = f"https://api.github.com/repos/{owner}/{repo}"
    headers = {}
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    try:
        response = requests.get(repo_url, headers=headers, timeout=10)
        response.raise_for_status()
        repo_data = response.json()

        # Basic metrics
        num_watchers = repo_data.get('stargazers_count', 0)
        num_commits = get_commit_count(owner, repo, headers)
        issue_pr_metrics = get_issue_and_pull_metrics(owner, repo, headers)
        num_closed_issues = issue_pr_metrics['closed_issues']
        num_open_issues = issue_pr_metrics['open_issues']
        num_pull_reqs = issue_pr_metrics['total_pull_requests']
        num_contributors = get_total_contributors(owner, repo, headers)

        # Duration (days since creation)
        created_at_str = repo_data.get('created_at')
        num_days = 0
        if created_at_str:
            created_at_aware = parser.parse(created_at_str)
            now_utc = datetime.now(timezone.utc)
            num_days = (now_utc - created_at_aware).days

        return {
            "Repo": f"{owner}/{repo}",
            "NumWatchers": num_watchers,
            "NumDays": num_days,
            "NumCommits": num_commits,
            "NumOpenIssues": num_open_issues,
            "NumClosedIssues": num_closed_issues,
            "NumPullReqs": num_pull_reqs,
            "NumContributors": num_contributors,
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {owner}/{repo}: {e}")
        return None

# ============================================================
# MAIN PROGRAM
# ============================================================
results_accept = []
results_reject = []
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")

# limit the number of repositories processed here for testing REPOSITORIES[:10]:
for owner, repo in ACCEPTED_REPOSITORIES:
    metrics = get_repo_metrics(owner, repo, GITHUB_TOKEN)
    if metrics:
        results_accept.append(metrics)

# Create the Metric DataFrame 
metrics_df = pd.DataFrame(results_accept)

# limit the number of repositories processed here for testing REPOSITORIES[:10]:
for owner, repo in REJECTED_REPOSITORIES:
    metrics = get_repo_metrics(owner, repo, GITHUB_TOKEN)
    if metrics:
        results_reject.append(metrics)

# Create the Metric DataFrame 
metrics_df = pd.DataFrame(results_reject)


Starting data retrieval... (may take a moment due to multiple API calls)


In [38]:
# Merge the 'stars' column from the original dataset (repo_df) into the API metrics DataFrame (metrics_df) using the repository name.
final_df = pd.merge(
    metrics_df, 
    repo_df[['full_name', 'stars', 'forks']],
    left_on='Repo', 
    right_on='full_name', 
    how='left' # Use a left merge to keep all API results
)

# Clean up and reorder columns
final_df = final_df.drop(columns=['full_name'])
final_df = final_df.rename(columns={
    'NumWatchers': 'Watchers',
    'NumDays': 'Duration_Days',
    'NumCommits': 'Commits', 
    'NumOpenIssues': 'Open_Issues', 
    'NumClosedIssues': 'Closed_Issues', 
    'NumPullReqs': 'Pull_Requests', 
    'NumContributors': 'Contributors',
    'stars': 'Stars',
    'forks': 'Forks' 
})

# Define the final column order
column_order = [
    'Repo', 'Contributors','Stars', 'Forks', 'Watchers', 'Duration_Days', 'Commits', 
    'Open_Issues', 'Closed_Issues', 'Pull_Requests', 
]

# Apply the final column order
final_df = final_df[column_order]

print("\n--- Repository Metrics DataFrame Created ---")
display(final_df)
print(f"\nTotal rows in DataFrame: {len(final_df)}")

# Uncomment the line below to save your metrics to a file
#final_df.to_parquet("repo_metrics_with_stars.parquet", index=False)
final_df.to_csv("repo_metrics_rejected.csv", index=False)


--- Repository Metrics DataFrame Created ---


Unnamed: 0,Repo,Contributors,Stars,Forks,Watchers,Duration_Days,Commits,Open_Issues,Closed_Issues,Pull_Requests
0,1c-syntax/bsl-language-server,65,340,112,357,2508,7618,408,698,2471
1,2006-Scape/2006Scape,33,204,178,218,2339,653,53,199,423
2,AutoMQ/automq,1527,6819,471,7699,818,15154,54,514,2401
3,Azure-Samples/azure-search-openai-demo-java,56,109,91,122,799,572,5,45,63
4,Azure/azure-sdk-for-java,795,2506,2106,2539,5090,33647,500,11181,35393
5,CarGuo/GSYVideoPlayer,27,21004,4291,21224,3286,1644,14,3875,70
6,GrimAnticheat/Grim,107,1319,422,1406,1724,4528,297,1352,628
7,Igalia/wolvic,93,898,118,912,1422,3652,133,535,0
8,Kaljurand/K6nele,3,283,82,286,5075,519,0,0,0
9,MeteorDevelopment/meteor-client,195,2823,1149,3045,1835,4582,0,0,0



Total rows in DataFrame: 48
