In [4]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import numpy as np
import pandas as pd
from IPython.display import display
from dotenv import load_dotenv
import os

load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")


# Import the Hao-Li AIDev datasets

In [5]:
# Repositories
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# Pull Request
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")


# 1. Prepare the Dataset

In [6]:
# Filter the repository data for 'Java' language
java_repo_df = repo_df[repo_df['language'] == 'Java'].copy()
java_repo_select_df = java_repo_df[['id', 'full_name']]

# Join Repo and PR table based on repo id
merged_pr_df = pr_df.merge(
    java_repo_select_df,
    left_on='repo_id',
    right_on='id',
    how='inner'
)

# clean up extra attribute
merged_pr_df = merged_pr_df.drop(columns=['id_y'])
merged_pr_df = merged_pr_df.rename(columns={'id_x':'id'})

# Filter PRs that were rejected (not merged) and create a new attribute
accepted_prs = merged_pr_df[merged_pr_df['merged_at'].notnull()]
rejected_prs = merged_pr_df[merged_pr_df['merged_at'].isnull()]

# Prepare for Merge: Rename the key column
accepted_prs = accepted_prs[['full_name', 'number']]
rejected_prs = rejected_prs[['full_name', 'number']]

# print to csv for checking
accepted_prs.to_csv("accepted_PR.csv", index=False)
rejected_prs.to_csv("rejected_PR.csv", index=False)

## 1.1. Split between Accepted and Rejected Repo 

In [7]:
# ============================================================
# Helper: Split the name and put it in a List of Dict (not needed but ehh accidentally made the method like that)
# ============================================================
def process_repositories(pr_df):
    """
    Filters the DataFrame by status, splits the full_name, and creates a 
    list of (owner, repo) tuples for processing.
    """
    
    # 1. Split the 'full_name' column into 'owner' and 'repo' columns
    split_df = pr_df['full_name'].str.split('/', n=1, expand=True)
    split_df.columns = ['owner', 'repo']
    
    # 2. Combine the split columns and the 'number' column into a list of tuples
    # We use axis=1 to apply the tuple creation row-wise across the three columns
    repositories = pd.concat([split_df, pr_df['number']], axis=1).apply(tuple, axis=1).tolist()
    
    # Print the first 5 results for verification
    print(repositories[:5])
    
    return repositories

ACCEPTED_PULL_REQUEST = process_repositories(accepted_prs)
REJECTED_PULL_REQUEST = process_repositories(rejected_prs)

[('dotCMS', 'core', 32609), ('apache', 'pulsar', 24542), ('dotCMS', 'core', 32771), ('dotCMS', 'core', 32561), ('microsoft', 'ApplicationInsights-Java', 4293)]
[('dotCMS', 'core', 32656), ('dotCMS', 'core', 32657), ('dotCMS', 'core', 32658), ('dotCMS', 'core', 32659), ('dotCMS', 'core', 32660)]


# 2. Helper code block to limit the API rate request

In [8]:
import time
import requests

def safe_request(method, url, headers=None, params=None, timeout=10, sleep_between=0.4):
    """
    A rate-limit-safe GitHub request wrapper that handles:
    - Primary rate limits (5000/hour)
    - Secondary abuse limits (burst protection)
    - GET and HEAD requests
    """
    while True:
        response = requests.request(method, url, headers=headers, params=params, timeout=timeout)

        # Primary rate limit
        remaining = int(response.headers.get("X-RateLimit-Remaining", 1))
        reset_ts = int(response.headers.get("X-RateLimit-Reset", time.time()))

        if remaining == 0:
            wait = max(reset_ts - int(time.time()), 10)
            print(f"[Primary Limit] Waiting {wait} seconds...")
            time.sleep(wait)
            continue

        # Secondary rate limit (abuse detection)
        if response.status_code == 403:
            print("[Secondary Limit] Hit GitHub abuse limit. Backing off 60 seconds...")
            time.sleep(60)
            continue

        # Success or other errors handled normally
        if not response.ok:
            response.raise_for_status()

        # Small delay prevents triggering secondary limit
        time.sleep(sleep_between)

        return response


# 3. Git API to extract metrics 

In [None]:
# ============================================================
# Helper: Total Contributors
# ============================================================
def get_total_contributors(owner, repo, headers):
    """
    Retrieves the total number of contributors for a GitHub repository 
    using the Link header pagination trick.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contributors"
    params = {"per_page": 1, "anon": "true"}  # include anonymous contributors

    try:
        response = safe_request("HEAD", url, headers=headers, params=params)
        response.raise_for_status()
        link_header = response.headers.get('Link')

        if link_header:
            match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if match:
                return int(match.group(1))
        # fallback for small repos
        return 1
    except requests.exceptions.RequestException as e:
        print(f"Error fetching contributors for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Commit Count
# ============================================================
def get_commit_count(owner, repo, headers):
    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"per_page": 1}

    try:
        response = safe_request("HEAD", url, headers=headers, params=params)
        response.raise_for_status()
        link_header = response.headers.get('Link')

        if link_header:
            match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if match:
                return int(match.group(1))

        # fallback for repos with few commits
        response = safe_request("GET", url, headers=headers, params={"per_page": 100})
        return len(response.json())
    except requests.exceptions.RequestException as e:
        print(f"Error fetching commits for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Queries for Issues and Pull Requests
# ============================================================
def get_issue_and_pull_metrics(owner, repo, headers):
    base_search_url = "https://api.github.com/search/issues"

    queries = {
        'open_issues': f"repo:{owner}/{repo} is:issue is:open",
        'closed_issues': f"repo:{owner}/{repo} is:issue is:closed",
        'open_prs': f"repo:{owner}/{repo} is:pr is:open",
        'closed_prs': f"repo:{owner}/{repo} is:pr is:closed",
    }

    metrics = {'open_issues': 0, 'closed_issues': 0, 'total_pull_requests': 0}

    for key, query in queries.items():
        try:
            response = safe_request("GET", base_search_url, headers=headers, params={'q': query, 'per_page': 1})
            response.raise_for_status()
            total = response.json().get('total_count', 0)

            if key.startswith('open_issue'):
                metrics['open_issues'] = total
            elif key.startswith('closed_issue'):
                metrics['closed_issues'] = total
            else:
                metrics['total_pull_requests'] += total

        except requests.exceptions.RequestException:
            continue

    return metrics

# ============================================================
# Main Function: Repository Metrics
# ============================================================
def get_repo_metrics(owner, repo, pr_number, github_token=None):
    """
    Retrieves metrics for a GitHub repository:
    Watchers, Duration, Commits, Issues, Pull Requests, Contributors
    """
    repo_url = f"https://api.github.com/repos/{owner}/{repo}"
    headers = {}
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    try:
        response = safe_request("GET", repo_url, headers=headers)
        response.raise_for_status()
        repo_data = response.json()

        # Basic metrics
        num_watchers = repo_data.get('stargazers_count', 0)
        num_commits = get_commit_count(owner, repo, headers)
        issue_pr_metrics = get_issue_and_pull_metrics(owner, repo, headers)
        num_closed_issues = issue_pr_metrics['closed_issues']
        num_open_issues = issue_pr_metrics['open_issues']
        num_pull_reqs = issue_pr_metrics['total_pull_requests']
        num_contributors = get_total_contributors(owner, repo, headers)

        # Duration (days since creation)
        created_at_str = repo_data.get('created_at')
        num_days = 0
        if created_at_str:
            created_at_aware = parser.parse(created_at_str)
            now_utc = datetime.now(timezone.utc)
            num_days = (now_utc - created_at_aware).days

        return {
            "PR_ID": pr_number,
            #"Repo": f"{owner}/{repo}",
            "NumWatchers": num_watchers,
            "NumDays": num_days,
            "NumCommits": num_commits,
            "NumOpenIssues": num_open_issues,
            "NumClosedIssues": num_closed_issues,
            "NumPullReqs": num_pull_reqs,
            "NumContributors": num_contributors,
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {owner}/{repo}: {e}")
        return None
    
# ============================================================
# Main Helper: Fetch the main metric functions 
# ============================================================
def fetch_metrics(repo_list, token):
    results = []
    # limit the number of repositories processed here for testing REPOSITORIES[:10]:
    for owner, repo, pr_number in repo_list: # Apply the test limit here
        metrics = get_repo_metrics(owner, repo, pr_number, token)
        if metrics:
            results.append(metrics)
    
    # Create the Metric DataFrame
    return pd.DataFrame(results)

# ============================================================
# MAIN PROGRAM
# ============================================================
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")
repo_metrics_df_accept = fetch_metrics(ACCEPTED_PULL_REQUEST, GITHUB_TOKEN)
repo_metrics_df_reject = fetch_metrics(REJECTED_PULL_REQUEST, GITHUB_TOKEN)


Starting data retrieval... (may take a moment due to multiple API calls)
[Primary Limit] Waiting 50 seconds...
[Primary Limit] Waiting 48 seconds...
[Primary Limit] Waiting 50 seconds...
[Primary Limit] Waiting 47 seconds...
[Primary Limit] Waiting 51 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 50 seconds...
[Primary Limit] Waiting 47 seconds...
[Primary Limit] Waiting 47 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 2624 seconds...
[Primary Limit] Waiting 48 seconds...
[Primary Limit] Waiting 50 seconds...
[Primary Limit] Waiting 48 seconds...
[Primary Limit] Waiting 48 seconds...
[Primary Limit] Waiting 50 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 44 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Limit] Waiting 49 seconds...
[Primary Lim

In [None]:
# ============================================================
# Helper: Finalize the dataframe, adding stars and forks
# ============================================================
def finalize_dataframe(metrics_df, repo_df, output_filename):
    """
    Applies the merging, cleaning, renaming, and reordering steps 
    to a single metrics DataFrame.
    """
    # 1. Merge with the original repo_df to get stars and forks
    final_df = pd.merge(
        metrics_df, 
        repo_df[['full_name', 'stars', 'forks']],
        left_on='Repo', 
        right_on='full_name', 
        how='left' 
    )

    # 2. Clean up and reorder columns
    final_df = final_df.drop(columns=['full_name'])
    
    # Define the rename mapping
    rename_map = {
        'NumWatchers': 'Watchers',
        'NumDays': 'Duration_Days',
        'NumCommits': 'Commits', 
        'NumOpenIssues': 'Open_Issues', 
        'NumClosedIssues': 'Closed_Issues', 
        'NumPullReqs': 'Pull_Requests', 
        'NumContributors': 'Contributors',
        'stars': 'Stars',
        'forks': 'Forks' 
    }
    final_df = final_df.rename(columns=rename_map)

    # 3. Define the final column order
    column_order = [
        'Repo', 'PR_ID', 'Contributors', 'Stars', 'Forks', 'Watchers', 'Duration_Days', 
        'Commits', 'Open_Issues', 'Closed_Issues', 'Pull_Requests', 
    ]
    
    # Apply the final column order
    final_df = final_df[column_order]

    # 4. Save the file (using CSV as per your original request)
    final_df.to_csv(output_filename, index=False)
    
    return final_df

# ============================================================
# MAIN PROGRAM - Separate Processing
# ============================================================

# --- Processing Accepted Repositories ---
print("\n--- Processing Accepted Repositories ---")
final_df_accept = finalize_dataframe(
    repo_metrics_df_accept, 
    repo_df, 
    "repo_metrics_accepted.csv" # Save to a separate file
)

print("\nAccepted Repository Metrics DataFrame Created:")
print(f"Total rows in Accepted DataFrame: {len(final_df_accept)}")

# --- Processing Rejected Repositories ---
print("\n--- Processing Rejected Repositories ---")
final_df_reject = finalize_dataframe(
    repo_metrics_df_reject, 
    repo_df, 
    "repo_metrics_rejected.csv" # Save to a separate file
)

print("\nRejected Repository Metrics DataFrame Created:")
print(f"Total rows in Rejected DataFrame: {len(final_df_reject)}")


--- Processing Accepted Repositories ---

Accepted Repository Metrics DataFrame Created:
Total rows in Accepted DataFrame: 38

--- Processing Rejected Repositories ---

Rejected Repository Metrics DataFrame Created:
Total rows in Rejected DataFrame: 48
