In [1]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import numpy as np
import pandas as pd
from typing import Optional, Dict, List, Tuple
from IPython.display import display
from dotenv import load_dotenv
import os

load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

# Import the Hao-Li AIDev datasets

In [2]:
# Repositories
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# Pull Request
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")

# 1. Prepare the Dataset

In [3]:
# Filter the repository data for 'Java' language
java_repo_df = repo_df[repo_df['language'] == 'Java'].copy()
java_repo_select_df = java_repo_df[['id', 'full_name']]

# Join Repo and PR table based on repo id
merged_pr_df = pr_df.merge(
    java_repo_select_df,
    left_on='repo_id',
    right_on='id',
    how='inner'
)

# clean up extra attribute
merged_pr_df = merged_pr_df.drop(columns=['id_y'])
merged_pr_df = merged_pr_df.rename(columns={'id_x':'id'})

# Filter PRs that were rejected (not merged) and create a new attribute
accepted_prs = merged_pr_df[merged_pr_df['merged_at'].notnull()]
rejected_prs = merged_pr_df[merged_pr_df['merged_at'].isnull()]

# Prepare for Merge: Rename the key column
accepted_prs = accepted_prs[['full_name', 'number']]
rejected_prs = rejected_prs[['full_name', 'number']]

# print to csv for checking
accepted_prs.to_csv("accepted_PR.csv", index=False)
rejected_prs.to_csv("rejected_PR.csv", index=False)

## 1.1. Split the full_name of repo into owner and repo name

In [4]:
# ============================================================
# Helper: Split the name and put it in a List of Dict (not needed but ehh accidentally made the method like that)
# ============================================================
def process_repositories(pr_df):
    """
    Filters the DataFrame by status, splits the full_name, and creates a 
    list of (owner, repo) tuples for processing.
    """
    
    # 1. Split the 'full_name' column into 'owner' and 'repo' columns
    split_df = pr_df['full_name'].str.split('/', n=1, expand=True)
    split_df.columns = ['owner', 'repo']
    
    # 2. Combine the split columns and the 'number' column into a list of tuples
    # We use axis=1 to apply the tuple creation row-wise across the three columns
    repositories = pd.concat([split_df, pr_df['number']], axis=1).apply(tuple, axis=1).tolist()
    
    # Print the first 5 results for verification
    print(repositories[:5])
    
    return repositories


ACCEPTED_PULL_REQUEST = process_repositories(accepted_prs)
REJECTED_PULL_REQUEST = process_repositories(rejected_prs)

[('dotCMS', 'core', 32609), ('apache', 'pulsar', 24542), ('dotCMS', 'core', 32771), ('dotCMS', 'core', 32561), ('microsoft', 'ApplicationInsights-Java', 4293)]
[('dotCMS', 'core', 32656), ('dotCMS', 'core', 32657), ('dotCMS', 'core', 32658), ('dotCMS', 'core', 32659), ('dotCMS', 'core', 32660)]


# 2. Helper code block to limit the API rate request

In [5]:
import time
import requests

def safe_request(method, url, headers=None, params=None, timeout=10, sleep_between=0.4):
    """
    A rate-limit-safe GitHub request wrapper that handles:
    - Primary rate limits (5000/hour)
    - Secondary abuse limits (burst protection)
    - GET and HEAD requests
    """
    while True:
        response = requests.request(method, url, headers=headers, params=params, timeout=timeout)

        # Primary rate limit
        remaining = int(response.headers.get("X-RateLimit-Remaining", 1))
        reset_ts = int(response.headers.get("X-RateLimit-Reset", time.time()))

        if remaining == 0:
            wait = max(reset_ts - int(time.time()), 10)
            print(f"[Primary Limit] Waiting {wait} seconds...")
            time.sleep(wait)
            continue

        # Secondary rate limit (abuse detection)
        if response.status_code == 403:
            print("[Secondary Limit] Hit GitHub abuse limit. Backing off 60 seconds...")
            time.sleep(60)
            continue

        # Success or other errors handled normally
        if not response.ok:
            response.raise_for_status()

        # Small delay prevents triggering secondary limit
        time.sleep(sleep_between)

        return response

# 3. Git API to extract metrics 

In [13]:
# ============================================================
# Helper: Get the files name, patch code, addition, deletion, status, and RAW URL
# ============================================================
def get_pr_file_details(owner: str, repo: str, pr_number: int, github_token: Optional[str] = None) -> List[Dict]:
    """
    Fetches the details for all files changed in a Pull Request,
    INCLUDING the raw URL for the file content.
    """
    
    base_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files"
    all_file_details = []
    page = 1
    
    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    print(f"Fetching file details for PR #{pr_number} (Paginating 100 files/page)...")

    while True:
        params = {"per_page": 100, "page": page}
        
        try:
            response = safe_request("GET", base_url, headers=headers, params=params,)
            response.raise_for_status()
            files_data = response.json()

            if not files_data:
                break

            for file in files_data:
                filename = file.get('filename')
                patch_content = file.get('patch')
                # final_patch = patch_content if patch_content else "NULL"
                raw_url = file.get('raw_url') 
                
                # All the file metrics here
                all_file_details.append({
                    "filename": filename,
                    #"patch": final_patch,
                    "status": file.get('status'),
                    #"additions": file.get('additions', 0),
                    #"deletions": file.get('deletions', 0),
                    "raw_url": raw_url 
                })
            
            # Check for the next page header
            if 'link' not in response.headers or 'rel="next"' not in response.headers['link']:
                break
                
            page += 1
            
        except requests.exceptions.RequestException as e:
            print(f"Error during API call on page {page}: {e}")
            break
            
    print(f"Finished fetching. Total files processed: {len(all_file_details)}")
    return all_file_details
        
# ============================================================
# Main Helper: Fetch the main metric functions 
# ============================================================
def fetch_metrics(repo_list, token):
    results = []
    # limit the number of repositories processed here for testing REPOSITORIES[:10]:
    for owner, repo, pr_number in repo_list[:2]: # Apply the test limit here
        metrics = get_pr_file_details(owner, repo, pr_number, token)
        if metrics:
            results.append(metrics)
    
    # Create the Metric DataFrame
    return results # Return a List[List[Dict]] to be process later

# ============================================================
# MAIN PROGRAM
# ============================================================
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")
files_list_accepted = fetch_metrics(ACCEPTED_PULL_REQUEST, GITHUB_TOKEN)
files_list_rejected = fetch_metrics(REJECTED_PULL_REQUEST, GITHUB_TOKEN)


Starting data retrieval... (may take a moment due to multiple API calls)
Fetching file details for PR #32609 (Paginating 100 files/page)...
Finished fetching. Total files processed: 9
Fetching file details for PR #24542 (Paginating 100 files/page)...
Finished fetching. Total files processed: 4
Fetching file details for PR #32656 (Paginating 100 files/page)...
Finished fetching. Total files processed: 25
Fetching file details for PR #32657 (Paginating 100 files/page)...
Finished fetching. Total files processed: 18


In [17]:
# ============================================================
# Helper: Filter and Aggregate PR Data
# ============================================================
def filter_and_aggregate_pr_data(pr_files_list: List[List[Dict]], repo_list: List[Tuple[str, str, int]]) -> List[Dict]:
    """
    Filters file details for Java files that are not deleted and aggregates the 
    relevant data (like raw_urls) at the Pull Request level.

    Args:
        pr_files_list: The nested list of file details from the GitHub API.
        repo_list: The original list of (owner, repo, pr_number) tuples 
                   used to fetch the data.

    Returns:
        A list of dictionaries, one for each PR, containing aggregated metrics.
    """
    aggregated_pr_data = []
    
    # Iterate through the results for each PR
    for pr_index, pr_files in enumerate(pr_files_list):
        
        # Safely retrieve metadata for the current PR
        if pr_index >= len(repo_list):
            print(f"Warning: Missing metadata for PR at index {pr_index}. Skipping.")
            continue
            
        owner, repo, pr_number = repo_list[pr_index]
        
        java_files_to_analyze = []
        
        # --- File-level filtering ---
        for file in pr_files:
            filename = file.get('filename', '')
            status = file.get('status', '')
            
            # 1. Detect .java file in the file name (case-insensitive)
            is_java = filename.lower().endswith('.java')
            
            # 2. Exclude status deleted (we only analyze added or modified code)
            is_not_deleted = status != 'deleted'
            
            # Store the file name and raw URL for the non-deleted Java file
            if is_java and is_not_deleted:
                java_files_to_analyze.append({
                    "file_name": filename,
                    "raw_url": file.get('raw_url')
                })
        
        # --- PR-level aggregation ---
        aggregated_pr_data.append({
            'owner': owner,
            'repo': repo,
            'pr_number': pr_number,
            'java_files_analyzed_count': len(java_files_to_analyze),
            'files_to_analyze': java_files_to_analyze, # List[Dict]
            # add the PMD violation counts later
            'pmd_violations': {} 
        })
        
    return aggregated_pr_data

# ============================================================
# MAIN PROGRAM: Filter the 
# ============================================================
print("\nStarting filtering and sorting")
pr_code_metrics_filtered_accepted = filter_and_aggregate_pr_data(files_list_accepted, ACCEPTED_PULL_REQUEST)
pr_code_metrics_filtered_rejected = filter_and_aggregate_pr_data(files_list_accepted, ACCEPTED_PULL_REQUEST)


Starting filtering and sorting


In [19]:
def download_file(raw_url, local_path, token=None):
    """Downloads a single file from GitHub's raw URL."""
    headers = {}
    if token:
        headers['Authorization'] = f"token {token}"
        
    response = requests.get(raw_url, headers=headers)
    
    if response.status_code == 200:
        # Create directories if they don't exist
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        
        with open(local_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        return True
    else:
        print(f"Failed to download {raw_url}. Status: {response.status_code}")
        return False
        
# Example of the download loop:
for pr_data in pr_code_metrics_filtered_accepted:
    owner = pr_data['owner']
    repo = pr_data['repo']
    pr_number = pr_data['pr_number']
    files_to_analyze = pr_data['files_to_analyze'] 
    
    # Create a local staging directory structure
    # This prevents file path clashes and organizes your PMD reports
    base_dir = "./pr_analysis_staging"
    local_staging_dir = os.path.join(base_dir, owner, repo, str(pr_number))
    
    print(f"\n--- Processing PR: {owner}/{repo} #{pr_number} ({len(files_to_analyze)} files) ---")
    
    # Inner loop: Iterate over the list of file dictionaries
    for file_data in files_to_analyze:
        
        # Get the two keys you need from the file dictionary
        file_name = file_data['file_name']
        raw_url = file_data['raw_url']
        
        # Determine the full local path for this file
        local_path = os.path.join(local_staging_dir, file_name)
        
        download_file(raw_url, local_path, GITHUB_TOKEN)


--- Processing PR: dotCMS/core #32609 (0 files) ---

--- Processing PR: apache/pulsar #24542 (4 files) ---


In [None]:
# ============================================================
# Helper: Finalize the dataframe, adding stars and forks
# ============================================================
def finalize_dataframe(metrics_df, output_filename):
    """
    Applies the merging, cleaning, renaming, and reordering steps 
    to a single metrics DataFrame.
    """
    
    # Define the rename mapping
    rename_map = {
        'PR_ID': 'PR_number',
        'NumCommits': 'Commits', 
        'NumComments': 'Comments', 
        'NumFormalReviews': 'Formal_Review', 
        'NumInlineComments': 'Inline_Comments_Review'
    }
    final_df = metrics_df.rename(columns=rename_map)

    # 3. Define the final column order
    column_order = [
        'Repo', 'PR_number', 'Commits', 'Additions', 'Deletions', 'Files_Changed', 'Comments', 'Formal_Review', 
        'Inline_Comments_Review', 'NumPathsInFile', 'AvgPathCharLength', 'MaxPathCharLength', 
    ]
    
    # Apply the final column order
    final_df = final_df[column_order]
    final_df = final_df.fillna(0.0)

    # 4. Save the file (using CSV as per your original request)
    final_df.to_csv(output_filename, index=False)
    
    return final_df

# ============================================================
# MAIN PROGRAM - Separate Processing
# ============================================================

# --- Processing Accepted Repositories ---
print("\n--- Processing Accepted Repositories ---")
final_df_accept = finalize_dataframe(
    pr_metrics_df_accept, 
    "pr_metrics_accepted.csv" # Save to a separate file
)

print("\nAccepted Repository Metrics DataFrame Created:")
print(f"Total rows in Accepted DataFrame: {len(final_df_accept)}")

# --- Processing Rejected Repositories ---
print("\n--- Processing Rejected Repositories ---")
final_df_reject = finalize_dataframe(
    pr_metrics_df_reject, 
    "pr_metrics_rejected.csv" # Save to a separate file
)

print("\nRejected Repository Metrics DataFrame Created:")
print(f"Total rows in Rejected DataFrame: {len(final_df_reject)}")


--- Processing Accepted Repositories ---

Accepted Repository Metrics DataFrame Created:
Total rows in Accepted DataFrame: 43

--- Processing Rejected Repositories ---


KeyError: "None of [Index(['Commits', 'Additions', 'Deletions', 'Files_Changed', 'Comments',\n       'Formal_Review', 'Inline_Comments_Review', 'NumPathsInFile',\n       'AvgPathCharLength', 'MaxPathCharLength'],\n      dtype='object')] are in the [columns]"