In [10]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import numpy as np
import pandas as pd
from typing import Optional, Dict, List, Tuple
from IPython.display import display
from dotenv import load_dotenv
import os

In [11]:
load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

In [12]:
# Repositories
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# Pull Request
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")

In [13]:
# Filter the repository data for 'Java' language
java_repo_df = repo_df[repo_df['language'] == 'Java'].copy()
java_repo_select_df = java_repo_df[['id', 'full_name']]

# Join Repo and PR table based on repo id
merged_pr_df = pr_df.merge(
    java_repo_select_df,
    left_on='repo_id',
    right_on='id',
    how='inner'
)

# clean up extra attribute
merged_pr_df = merged_pr_df.drop(columns=['id_y'])
merged_pr_df = merged_pr_df.rename(columns={'id_x':'id'})

# Filter PRs that were rejected (not merged) and create a new attribute
accepted_prs = merged_pr_df[merged_pr_df['merged_at'].notnull()]
rejected_prs = merged_pr_df[merged_pr_df['merged_at'].isnull()]

# Prepare for Merge: Rename the key column
accepted_prs = accepted_prs[['full_name', 'number']]
rejected_prs = rejected_prs[['full_name', 'number']]

# print to csv for checking
accepted_prs.to_csv("accepted_PR.csv", index=False)
rejected_prs.to_csv("rejected_PR.csv", index=False)

In [14]:
# ============================================================
# Helper: Split the name and put it in a List of Dict (not needed but ehh accidentally made the method like that)
# ============================================================
def process_repositories(pr_df):
    """
    Filters the DataFrame by status, splits the full_name, and creates a 
    list of (owner, repo) tuples for processing.
    """
    
    # 1. Split the 'full_name' column into 'owner' and 'repo' columns
    split_df = pr_df['full_name'].str.split('/', n=1, expand=True)
    split_df.columns = ['owner', 'repo']
    
    # 2. Combine the split columns and the 'number' column into a list of tuples
    # We use axis=1 to apply the tuple creation row-wise across the three columns
    repositories = pd.concat([split_df, pr_df['number']], axis=1).apply(tuple, axis=1).tolist()
    
    # Print the first 5 results for verification
    print(repositories[:5])
    
    return repositories


ACCEPTED_PULL_REQUEST = process_repositories(accepted_prs)
REJECTED_PULL_REQUEST = process_repositories(rejected_prs)

[('dotCMS', 'core', 32609), ('apache', 'pulsar', 24542), ('dotCMS', 'core', 32771), ('dotCMS', 'core', 32561), ('microsoft', 'ApplicationInsights-Java', 4293)]
[('dotCMS', 'core', 32656), ('dotCMS', 'core', 32657), ('dotCMS', 'core', 32658), ('dotCMS', 'core', 32659), ('dotCMS', 'core', 32660)]


In [None]:
# ============================================================
# Helper: Total reviews (not inline) for a PR
# ============================================================
def get_review_count(owner: str, repo: str, pr_number: int, headers: Dict) -> int:
    """Retrieves the total count of formal reviews submitted for a Pull Request using the dedicated /reviews endpoint."""
    reviews_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews"
    
    try:
        # We use a HEAD request with per_page=1 and pagination trick to get the total count
        response = requests.head(reviews_url, headers=headers, params={"per_page": 1}, timeout=10)
        response.raise_for_status()
        
        # Check the 'Link' header for the last page
        link_header = response.headers.get('Link')
        if link_header:
            last_page_match = re.search(r'page=(\d+)>; rel="last"', link_header)
            if last_page_match:
                return int(last_page_match.group(1))
        
    except requests.exceptions.RequestException:
        return 0

# ============================================================
# Helper: Path files of a repo
# ============================================================
def get_file_path_metrics(owner: str, repo: str, pr_number: int, headers: Dict) -> Tuple[int, float, int]:
    """
    Retrieves the count of changed files and calculates file path length statistics.
    Returns: (total_files, avg_path_length, max_path_length)
    """
    files_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files"
    all_file_paths = []
    page = 1
    
    # 
    while True:
        try:
            response = requests.get(
                files_url, 
                headers=headers, 
                params={"per_page": 100, "page": page}, 
                timeout=10
            )
            response.raise_for_status()
            files_data = response.json()
            
            if not files_data:
                break
                
            # Extract the 'filename' (which includes the full path)
            for file in files_data:
                # Store the length of the full file path string
                all_file_paths.append(len(file.get('filename', ''))) 
            
            # Check for the next page
            if 'link' not in response.headers or 'rel="next"' not in response.headers['link']:
                break
            page += 1
            
        except requests.exceptions.RequestException:
            break
            
    num_paths = len(all_file_paths)
    
    if num_paths == 0:
        return 0, 0.0, 0
    
    # Calculate average and max path length
    avg_path_length = sum(all_file_paths) / num_paths
    max_path_length = max(all_file_paths)
    
    return num_paths, avg_path_length, max_path_length

# ============================================================
# Main Function: Pull Request Metrics
# ============================================================
def get_pull_request_metrics(owner: str, repo: str, pr_number: int, github_token: Optional[str] = None) -> Optional[Dict]:
    """
    Retrieves the lines added, lines deleted, and the number of files changed
    for a specific GitHub Pull Request.
    """
    
    # 1. API URL for a specific Pull Request
    pr_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"
    
    headers = {
        # Standard Accept header for the V3 API
        "Accept": "application/vnd.github.v3+json"
    }
    if github_token:
        headers["Authorization"] = f"token {github_token}"
    
    try:
        # Fetch the Pull Request object
        response = requests.get(pr_url, headers=headers, timeout=10)
        response.raise_for_status()
        pr_data = response.json()

        # 1. Line/File Change Metrics (from previous step)
        num_additions = pr_data.get('additions', 0)
        num_deletions = pr_data.get('deletions', 0)
        num_files_changed = pr_data.get('changed_files', 0)
        
        # 2. NumCommits, NumComments (exclude review) 
        num_commits = pr_data.get('commits', 0)
        num_comments = pr_data.get('comments', 0)
        num_formal_reviews = get_review_count(owner, repo, pr_number, headers)
        num_inline_comments = pr_data.get('review_comments', 0)
        
        #3
        num_paths, avg_path_len, max_path_len = get_file_path_metrics(
            owner, repo, pr_number, headers
        )
        
        return {
            "PR_ID": pr_number,
            "Additions": num_additions,
            "Deletions": num_deletions,
            "Files_Changed": num_files_changed,
            "NumCommits": num_commits,
            "NumComments": num_comments,
            "NumFormalReviews": num_formal_reviews, 
            "NumInlineComments": num_inline_comments, 
            "NumPathsInFile": num_paths,          # The number of paths (or files changed)
            "AvgPathCharLength": avg_path_len,    # Average characters in file paths
            "MaxPathCharLength": max_path_len,    # Max characters in file paths
        }
        
    except requests.exceptions.RequestException as e:
            print(f"Error fetching data for PR #{pr_number} in {owner}/{repo}: {e}")
            return None
        
# ============================================================
# Main Helper: Fetch the main metric functions 
# ============================================================
def fetch_metrics(repo_list, token):
    results = []
    # limit the number of repositories processed here for testing REPOSITORIES[:10]:
    for owner, repo, pr_number in repo_list: # Apply the test limit here
        metrics = get_pull_request_metrics(owner, repo, pr_number, token)
        if metrics:
            results.append(metrics)
    
    # Create the Metric DataFrame
    return pd.DataFrame(results)

# ============================================================
# MAIN PROGRAM
# ============================================================
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")
pr_metrics_df_accept = fetch_metrics(ACCEPTED_PULL_REQUEST, GITHUB_TOKEN)
pr_metrics_df_reject = fetch_metrics(REJECTED_PULL_REQUEST, GITHUB_TOKEN)


Starting data retrieval... (may take a moment due to multiple API calls)


In [17]:
display(pr_metrics_df_reject.head())

Unnamed: 0,PR_ID,Additions,Deletions,Files_Changed,NumCommits,NumComments,NumFormalReviews,NumInlineComments,NumPathsInFile,AvgPathCharLength,MaxPathCharLength
0,32656,2703,546,25,2,1,6.0,2,25,75.96,90
1,32657,3811,906,18,1,3,,0,18,75.611111,88
2,32658,19214,2952,170,8,6,,0,170,82.117647,114
3,32659,21977,3267,185,9,6,,0,185,81.32973,114
4,32660,25559,4991,197,10,7,,0,197,80.543147,114


In [None]:
# ============================================================
# Helper: Finalize the dataframe, adding stars and forks
# ============================================================
def finalize_dataframe(metrics_df, repo_df, output_filename):
    """
    Applies the merging, cleaning, renaming, and reordering steps 
    to a single metrics DataFrame.
    """
    
    # Define the rename mapping
    rename_map = {
        'PR_ID': 'PR_number',
        'NumCommits': 'Commits', 
        'NumComments': 'Comments', 
        'NumFormalReviews': 'Formal_Review', 
        'NumInlineComments': 'Inline_Comments_Review'
    }
    final_df = metrics_df.rename(columns=rename_map)

    # 3. Define the final column order
    column_order = [
        'Commits', 'Additions', 'Deletions', 'Files_Changed', 'Comments', 'Formal_Review', 
        'Inline_Comments_Review', 'NumPathsInFile', 'AvgPathCharLength', 'MaxPathCharLength', 
    ]
    
    # Apply the final column order
    final_df = final_df[column_order]
    metrics_df = metrics_df.fillna(0.0)

    # 4. Save the file (using CSV as per your original request)
    final_df.to_csv(output_filename, index=False)
    
    return final_df

# ============================================================
# MAIN PROGRAM - Separate Processing
# ============================================================

# --- Processing Accepted Repositories ---
print("\n--- Processing Accepted Repositories ---")
final_df_accept = finalize_dataframe(
    pr_metrics_df_accept, 
    repo_df, 
    "pr_metrics_accepted.csv" # Save to a separate file
)

print("\nAccepted Repository Metrics DataFrame Created:")
print(f"Total rows in Accepted DataFrame: {len(final_df_accept)}")

# --- Processing Rejected Repositories ---
print("\n--- Processing Rejected Repositories ---")
final_df_reject = finalize_dataframe(
    pr_metrics_df_reject, 
    repo_df, 
    "pr_metrics_rejected.csv" # Save to a separate file
)

print("\nRejected Repository Metrics DataFrame Created:")
print(f"Total rows in Rejected DataFrame: {len(final_df_reject)}")


--- Processing Accepted Repositories ---

Accepted Repository Metrics DataFrame Created:
Total rows in Accepted DataFrame: 10

--- Processing Rejected Repositories ---

Rejected Repository Metrics DataFrame Created:
Total rows in Rejected DataFrame: 10
