In [1]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import numpy as np
import pandas as pd
from typing import Optional, Dict, List, Tuple
from IPython.display import display
from dotenv import load_dotenv
import os

load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

# Import the Hao-Li AIDev datasets

In [2]:
# Repositories
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# Pull Request
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")

# 1. Prepare the Dataset

In [3]:
# Filter the repository data for 'Java' language
java_repo_df = repo_df[repo_df['language'] == 'Java'].copy()
java_repo_select_df = java_repo_df[['id', 'full_name']]

# Join Repo and PR table based on repo id
merged_pr_df = pr_df.merge(
    java_repo_select_df,
    left_on='repo_id',
    right_on='id',
    how='inner'
)

# clean up extra attribute
merged_pr_df = merged_pr_df.drop(columns=['id_y'])
merged_pr_df = merged_pr_df.rename(columns={'id_x':'id'})

# Filter PRs that were rejected (not merged) and create a new attribute
accepted_prs = merged_pr_df[merged_pr_df['merged_at'].notnull()]
rejected_prs = merged_pr_df[merged_pr_df['merged_at'].isnull()]

# Prepare for Merge: Rename the key column
accepted_prs = accepted_prs[['full_name', 'number']]
rejected_prs = rejected_prs[['full_name', 'number']]

# print to csv for checking
accepted_prs.to_csv("accepted_PR.csv", index=False)
rejected_prs.to_csv("rejected_PR.csv", index=False)

## 1.1. Split the full_name of repo into owner and repo name

In [4]:
# ============================================================
# Helper: Split the name and put it in a List of Dict (not needed but ehh accidentally made the method like that)
# ============================================================
def process_repositories(pr_df):
    """
    Filters the DataFrame by status, splits the full_name, and creates a 
    list of (owner, repo) tuples for processing.
    """
    
    # 1. Split the 'full_name' column into 'owner' and 'repo' columns
    split_df = pr_df['full_name'].str.split('/', n=1, expand=True)
    split_df.columns = ['owner', 'repo']
    
    # 2. Combine the split columns and the 'number' column into a list of tuples
    # We use axis=1 to apply the tuple creation row-wise across the three columns
    repositories = pd.concat([split_df, pr_df['number']], axis=1).apply(tuple, axis=1).tolist()
    
    # Print the first 5 results for verification
    print(repositories[:5])
    
    return repositories


ACCEPTED_PULL_REQUEST = process_repositories(accepted_prs)
REJECTED_PULL_REQUEST = process_repositories(rejected_prs)

[('dotCMS', 'core', 32609), ('apache', 'pulsar', 24542), ('dotCMS', 'core', 32771), ('dotCMS', 'core', 32561), ('microsoft', 'ApplicationInsights-Java', 4293)]
[('dotCMS', 'core', 32656), ('dotCMS', 'core', 32657), ('dotCMS', 'core', 32658), ('dotCMS', 'core', 32659), ('dotCMS', 'core', 32660)]


# 2. Helper code block to limit the API rate request

In [None]:
import time
import requests
import requests_cache

def safe_request(method, url, headers=None, params=None, timeout=10, sleep_between=0.4):
    """
    A rate-limit-safe GitHub request wrapper that handles:
    - Primary rate limits (5000/hour)
    - Secondary abuse limits (burst protection)
    - GET and HEAD requests
    """
    while True:
        response = requests.request(method, url, headers=headers, params=params, timeout=timeout)

        # ============================================================
        # Rate Limit handling
        # ============================================================
        # Primary rate limit
        remaining = int(response.headers.get("X-RateLimit-Remaining", 1))
        reset_ts = int(response.headers.get("X-RateLimit-Reset", time.time()))

        if remaining == 0:
            wait = max(reset_ts - int(time.time()), 10)
            print(f"[Primary Limit] Waiting {wait} seconds...")
            time.sleep(wait)
            continue

        # Secondary rate limit (abuse detection)
        if response.status_code == 403:
            print("[Secondary Limit] Hit GitHub abuse limit. Backing off 60 seconds...")
            time.sleep(60)
            continue
        
        # ============================================================
        # API cache
        # ============================================================
        # Check if the response came from the cache
        if hasattr(response, 'from_cache') and response.from_cache:
            print(f"[CACHE] Hit for {url}")
            # Skip the time.sleep(sleep_between) if it came from the cache
            return response

        # ============================================================
        # Network handling
        # ============================================================
        # Success or other errors handled normally
        if not response.ok:
            response.raise_for_status()

        # Small delay prevents triggering secondary limit
        time.sleep(sleep_between)

        return response


# 3. Git API to extract metrics 

In [None]:
# ============================================================
# Helper: Total reviews (not inline) for a PR
# ============================================================
def get_review_count(owner: str, repo: str, pr_number: int, headers: Dict) -> int:
    """Retrieves the total count of formal reviews submitted for a Pull Request using the dedicated /reviews endpoint."""
    reviews_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews"
    
    try:
        # We use a HEAD request with per_page=1 and pagination trick to get the total count
        response = safe_request("HEAD", reviews_url, headers=headers, params={"per_page": 1})
        response.raise_for_status()
        
        # Check the 'Link' header for the last page
        link_header = response.headers.get('Link')
        if link_header:
            last_page_match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if last_page_match:
                return int(last_page_match.group(1))
        
    except requests.exceptions.RequestException:
        return 0

# ============================================================
# Helper: Path files of a repo
# ============================================================
def get_file_path_metrics(owner: str, repo: str, pr_number: int, headers: Dict) -> Tuple[int, float, int]:
    """
    Retrieves the count of changed files and calculates file path length statistics.
    Returns: (total_files, avg_path_length, max_path_length)
    """
    files_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files"
    all_file_paths = []
    page = 1
    
    # 
    while True:
        try:
            response = safe_request("GET", files_url, headers=headers, params={"per_page": 100, "page": page})
            response.raise_for_status()
            files_data = response.json()
            
            if not files_data:
                break
                
            # Extract the 'filename' (which includes the full path)
            for file in files_data:
                # Store the length of the full file path string
                all_file_paths.append(len(file.get('filename', ''))) 
            
            # Check for the next page
            if 'link' not in response.headers or 'rel="next"' not in response.headers['link']:
                break
            page += 1
            
        except requests.exceptions.RequestException:
            break
            
    num_paths = len(all_file_paths)
    
    if num_paths == 0:
        return 0, 0.0, 0
    
    # Calculate average and max path length
    avg_path_length = sum(all_file_paths) / num_paths
    max_path_length = max(all_file_paths)
    
    return num_paths, avg_path_length, max_path_length

# ============================================================
# Main Function: Pull Request Metrics
# ============================================================
def get_pull_request_metrics(owner: str, repo: str, pr_number: int, github_token: Optional[str] = None) -> Optional[Dict]:
    """
    Retrieves the lines added, lines deleted, and the number of files changed
    for a specific GitHub Pull Request.
    """
    
    # 1. API URL for a specific Pull Request
    pr_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
    
    headers = {
        # Standard Accept header for the V3 API
        "Accept": "application/vnd.github.v3+json"
    }
    if github_token:
        headers["Authorization"] = f"token {github_token}"
    
    try:
        # Fetch the Pull Request object
        response = safe_request("GET", pr_url, headers=headers)
        response.raise_for_status()
        pr_data = response.json()

        # 1. Line/File Change Metrics (from previous step)
        num_additions = pr_data.get('additions', 0)
        num_deletions = pr_data.get('deletions', 0)
        num_files_changed = pr_data.get('changed_files', 0)
        
        # 2. NumCommits, NumComments (exclude review) 
        num_commits = pr_data.get('commits', 0)
        num_comments = pr_data.get('comments', 0)
        num_formal_reviews = get_review_count(owner, repo, pr_number, headers)
        num_inline_comments = pr_data.get('review_comments', 0)
        
        #3
        num_paths, avg_path_len, max_path_len = get_file_path_metrics(
            owner, repo, pr_number, headers
        )
        
        return {
            "Repo": f"{owner}/{repo}",
            "PR_ID": pr_number,            
            "Additions": num_additions,
            "Deletions": num_deletions,
            "Files_Changed": num_files_changed,
            "NumCommits": num_commits,
            "NumComments": num_comments,
            "NumFormalReviews": num_formal_reviews, 
            "NumInlineComments": num_inline_comments, 
            "NumPathsInFile": num_paths,          # The number of paths (or files changed)
            "AvgPathCharLength": avg_path_len,    # Average characters in file paths
            "MaxPathCharLength": max_path_len,    # Max characters in file paths
        }
        
    except requests.exceptions.RequestException as e:
            print(f"Error fetching data for PR #{pr_number} in {owner}/{repo}: {e}")
            return None
        
# ============================================================
# Main Helper Function: Pull Request Metrics (With Caching)
# ============================================================
def fetch_metrics(pr_list: list, github_token: Optional[str], cached_filename: str) -> pd.DataFrame:
    """
    Fetches Pull Request metrics with caching and incremental saving.
    The cache key is (owner, repo, pr_number).
    """
    results = []
    
    # Load Cache
    try:
        cached_df = pd.read_csv(cached_filename)
        cached_df['Key'] = cached_df.apply(lambda row: (row['owner'], row['repo'], row['pr_number']), axis=1)
        cached_keys = set(cached_df['Key'].tolist())
        print(f"Loaded {len(cached_keys)} existing metrics from {cached_filename}.")
        # Use a list for new results to easily append
        new_results = []
    except FileNotFoundError:
        cached_df = pd.DataFrame()
        cached_keys = set()
        print(f"No existing cache file found at {cached_filename}. Starting from scratch.")
        new_results = []

    # Filter list to only process uncached entries. Input is (owner, repo, pr_number) tuples
    uncached_prs = [
        (owner, repo, pr_number) for owner, repo, pr_number in pr_list 
        if (owner, repo, pr_number) not in cached_keys
    ]
    
    print(f"Total PRs to process: {len(pr_list)}. Uncached PRs remaining: {len(uncached_prs)}.")

    # Process uncached entries
    for i, (owner, repo, pr_number) in enumerate(uncached_prs):
        # Progress printout
        if i % 50 == 0:
            print(f"Processing PR {i+1}/{len(uncached_prs)}: {owner}/{repo} #{pr_number}")
        
        # Call the core metric function
        metrics = get_pull_request_metrics(owner, repo, pr_number, github_token)
        
        if metrics:
            # Add key columns to the results dictionary
            metrics['owner'] = owner
            metrics['repo'] = repo
            metrics['pr_number'] = pr_number
            new_results.append(metrics)
        
        # Intermediate Save every 20 successful API calls (good practice for rate limit recovery)
        if (len(new_results) > 0) and (len(new_results) % 20 == 0):
            print(f"--- Saving intermediate progress: {len(new_results)} new entries...")
            
            # Combine new results with cached data and save
            temp_df = pd.DataFrame(new_results)
            # Ensure proper concatenation by dropping the temporary 'Key' column from cache
            updated_df = pd.concat([cached_df.drop(columns=['Key']), temp_df], ignore_index=True)
            updated_df = updated_df.drop_duplicates(subset=['owner', 'repo', 'pr_number'], keep='last') 
            updated_df.to_csv(cached_filename, index=False)
            
            # Update the cached_df and cached_keys for the next iteration (important for a true resume)
            cached_df = updated_df
            cached_df['Key'] = cached_df.apply(lambda row: (row['owner'], row['repo'], row['pr_number']), axis=1)
            new_results = [] # Clear new results list as they are now in the cache file

    # Final Save (if there are any remaining new results)
    if new_results:
        print(f"--- Final save: {len(new_results)} remaining new entries...")
        temp_df = pd.DataFrame(new_results)
        # Handle case where cached_df might be empty initially
        if not cached_df.empty:
            updated_df = pd.concat([cached_df.drop(columns=['Key']), temp_df], ignore_index=True)
        else:
            updated_df = temp_df
    else:
        updated_df = cached_df.drop(columns=['Key']) if not cached_df.empty else pd.DataFrame()

    updated_df = updated_df.drop_duplicates(subset=['owner', 'repo', 'pr_number'], keep='last') 
    updated_df.to_csv(cached_filename, index=False)
    print(f"Metrics saved to {cached_filename}. Total entries: {len(updated_df)}.")
    
    # 5. Return the final full DataFrame (cached + new)
    return updated_df

# ============================================================
# MAIN PROGRAM
# ============================================================
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")
pr_metrics_df_accept = fetch_metrics(ACCEPTED_PULL_REQUEST, GITHUB_TOKEN, "pr_metrics_accepted_cached.csv")
pr_metrics_df_reject = fetch_metrics(REJECTED_PULL_REQUEST, GITHUB_TOKEN, "pr_metrics_rejected_cached.csv")


Starting data retrieval... (may take a moment due to multiple API calls)
[Primary Limit] Waiting 3552 seconds...
[Primary Limit] Waiting 3553 seconds...


# 4. Finalize

In [None]:
# ============================================================
# Helper: Finalize the dataframe, adding stars and forks
# ============================================================
def finalize_dataframe(metrics_df, output_filename):
    """
    Applies the merging, cleaning, renaming, and reordering steps 
    to a single metrics DataFrame.
    """
    
    # Define the rename mapping
    rename_map = {
        'PR_ID': 'PR_Number',
        'NumCommits': 'Commits', 
        'NumComments': 'Comments', 
        'NumFormalReviews': 'Formal_Review', 
        'NumInlineComments': 'Inline_Comments_Review'
    }
    final_df = metrics_df.rename(columns=rename_map)

    # 3. Define the final column order
    column_order = [
        'Repo', 'PR_Number', 'Commits', 'Additions', 'Deletions', 'Files_Changed', 'Comments', 'Formal_Review', 
        'Inline_Comments_Review', 'NumPathsInFile', 'AvgPathCharLength', 'MaxPathCharLength', 
    ]
    
    # Apply the final column order
    final_df = final_df[column_order]
    final_df = final_df.fillna(0.0)

    # 4. Save the file (using CSV as per your original request)
    final_df.to_csv(output_filename, index=False)
    
    return final_df

# ============================================================
# MAIN PROGRAM - Separate Processing
# ============================================================

# --- Processing Accepted Repositories ---
print("\n--- Processing Accepted Repositories ---")
final_df_accept = finalize_dataframe(
    pr_metrics_df_accept, 
    "pr_metrics_accepted.csv" # Save to a separate file
)

print("\nAccepted Repository Metrics DataFrame Created:")
print(f"Total rows in Accepted DataFrame: {len(final_df_accept)}")

# --- Processing Rejected Repositories ---
print("\n--- Processing Rejected Repositories ---")
final_df_reject = finalize_dataframe(
    pr_metrics_df_reject, 
    "pr_metrics_rejected.csv" # Save to a separate file
)

print("\nRejected Repository Metrics DataFrame Created:")
print(f"Total rows in Rejected DataFrame: {len(final_df_reject)}")


--- Processing Accepted Repositories ---

Accepted Repository Metrics DataFrame Created:
Total rows in Accepted DataFrame: 43

--- Processing Rejected Repositories ---


KeyError: "None of [Index(['Commits', 'Additions', 'Deletions', 'Files_Changed', 'Comments',\n       'Formal_Review', 'Inline_Comments_Review', 'NumPathsInFile',\n       'AvgPathCharLength', 'MaxPathCharLength'],\n      dtype='object')] are in the [columns]"