In [39]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import numpy as np
import pandas as pd
from IPython.display import display
from dotenv import load_dotenv
import os


In [40]:
# Read the GitHub token from the local env file; the token itself is never
# written into the notebook.
load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

# Without a token every request below is sent unauthenticated. Make that
# visible here instead of letting it surface later as request errors.
if not GITHUB_TOKEN:
    print(
        "Warning: GITHUB_API_KEY is not set; GitHub API requests will be "
        "unauthenticated and subject to much lower rate limits."
    )

In [41]:
# Repository table of the AIDev dataset
repo_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/repository.parquet"
)

# Pull-request table; only the repository id and the merge timestamp are loaded
pr_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pull_request.parquet",
    columns=["repo_id", "merged_at"],
)


In [42]:
# Keep only the repositories whose 'language' column equals 'Java'.
# NOTE: repo_df is rebound to the filtered frame, so re-running this cell
# without reloading the data reports the already-filtered size as the
# initial count.
initial_count = repo_df.shape[0]
repo_df = repo_df.loc[repo_df['language'].eq('Java')]
print(f"Filtered {initial_count} repos down to {len(repo_df)} using language='Java'.")

Filtered 2807 repos down to 86 using language='Java'.


In [43]:
# Attach each pull request to its repository name (inner join on the repo id),
# drop the join keys, and flag a pull request as accepted when it has a merge
# timestamp. Nothing is filtered out here: every joined pull request is kept.
merged_pr_df = (
    pr_df
    .merge(
        repo_df[['id', 'full_name']],
        left_on='repo_id',
        right_on='id',
        how='inner',
    )
    .drop(columns=['id', 'repo_id'])
    .assign(is_accepted=lambda frame: frame['merged_at'].notnull())
)

# Per repository: count all pull requests and the accepted ones, derive the
# rejected count as the difference, and keep only the columns used afterwards.
# NOTE(review): "rejected" means "has no merge timestamp", so a pull request
# that is still open is counted as rejected too -- confirm this is intended.
pr_metrics = (
    merged_pr_df
    .groupby('full_name')
    .agg(
        total_prs=('is_accepted', 'size'),
        accepted_prs=('is_accepted', 'sum'),
    )
    .reset_index()
    .assign(rejected_prs=lambda frame: frame['total_prs'] - frame['accepted_prs'])
    .drop(columns=['total_prs'])
)

# Write the per-repository counts to a CSV file for manual checking
print(f"Aggregated metrics for {len(pr_metrics)} unique repositories ready for merge.")
pr_metrics.to_csv("accepted_rejected_repo.csv", index=False)

Aggregated metrics for 86 unique repositories ready for merge.


In [44]:
# Label every repository by its majority outcome. A tie (as many accepted as
# rejected pull requests) is labelled 'Accepted'.
pr_metrics['pr_status'] = np.where(
    pr_metrics['rejected_prs'] <= pr_metrics['accepted_prs'],
    'Accepted',
    'Rejected',
)

# One frame per label
accepted_repos_df = pr_metrics.loc[pr_metrics['pr_status'].eq('Accepted')]
rejected_repos_df = pr_metrics.loc[pr_metrics['pr_status'].eq('Rejected')]

In [45]:
# ============================================================
# Helper: Split each 'owner/repo' name into an (owner, repo) tuple and collect the tuples in a list
# ============================================================
def process_repositories(pr_metrics_df, status_filter):
    """
    Build the list of (owner, repo) tuples for one PR status.

    Args:
        pr_metrics_df: DataFrame with a 'full_name' column ("owner/repo")
            and a 'pr_status' column.
        status_filter: Value of 'pr_status' to select, e.g. 'Accepted'.

    Returns:
        list: One tuple per selected row, obtained by splitting 'full_name'
        at its first '/'. The first five tuples are also printed.
    """
    selected = pr_metrics_df.loc[pr_metrics_df['pr_status'] == status_filter]

    # Split at the first '/' only, so anything after it stays in the repo part
    name_parts = selected['full_name'].str.split('/', n=1, expand=True)
    repositories = [
        tuple(row) for row in name_parts.itertuples(index=False, name=None)
    ]

    print(f"\nFirst 5 entries of the {status_filter.upper()} REPOSITORIES list:")
    print(repositories[:5])

    return repositories


# (owner, repo) tuples of the repositories labelled 'Accepted' / 'Rejected'
ACCEPTED_REPOSITORIES = process_repositories(
    pr_metrics_df=pr_metrics,
    status_filter='Accepted',
)
REJECTED_REPOSITORIES = process_repositories(
    pr_metrics_df=pr_metrics,
    status_filter='Rejected',
)



First 5 entries of the ACCEPTED REPOSITORIES list:
[('Camelcade', 'Perl5-IDEA'), ('DataDog', 'dd-trace-java'), ('EduMIPS64', 'edumips64'), ('JetBrains', 'psiviewer'), ('OWASP', 'wrongsecrets')]

First 5 entries of the REJECTED REPOSITORIES list:
[('1c-syntax', 'bsl-language-server'), ('2006-Scape', '2006Scape'), ('AutoMQ', 'automq'), ('Azure-Samples', 'azure-search-openai-demo-java'), ('Azure', 'azure-sdk-for-java')]


In [46]:
# ============================================================
# Helper: Total Contributors
# ============================================================
def get_total_contributors(owner, repo, headers):
    """
    Return the number of contributors of a GitHub repository.

    Sends a HEAD request for the contributors listing with one item per page
    (anonymous contributors included) and reads the number of the last page
    from the ``Link`` response header. With one item per page, that page
    number equals the number of contributors.

    Args:
        owner: Repository owner.
        repo: Repository name.
        headers: HTTP headers sent with the request (e.g. authorization).

    Returns:
        int: The last page number from the ``Link`` header; 0 when the
        response status is 204 (empty repository); 1 when the response has no
        ``last`` link (single page); 0 when the request fails, in which case
        the error is printed.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contributors"
    params = {"per_page": 1, "anon": "true"}  # include anonymous contributors

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()

        # GitHub answers 204 No Content for an empty repository; without this
        # check the single-page fallback below would report one contributor.
        if response.status_code == 204:
            return 0

        link_header = response.headers.get('Link')
        if link_header:
            # Only the entry tagged rel="last" is of interest. Requiring '?'
            # or '&' before "page" keeps "per_page=1" from being mistaken for
            # the page parameter, and [^>]* lets other query parameters
            # follow it inside the same <...> URL.
            match = re.search(r'[?&]page=(\d+)[^>]*>;\s*rel="last"', link_header)
            if match:
                return int(match.group(1))

        # No "last" link: everything fits on a single page
        return 1
    except requests.exceptions.RequestException as e:
        print(f"Error fetching contributors for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Commit Count
# ============================================================
def get_commit_count(owner, repo, headers):
    """
    Return the number of commits listed for a GitHub repository.

    Sends a HEAD request for the commits listing with one item per page and
    reads the number of the last page from the ``Link`` response header. When
    the header has no ``last`` link, the commits are fetched with a GET
    request and counted directly.

    Args:
        owner: Repository owner.
        repo: Repository name.
        headers: HTTP headers sent with the requests (e.g. authorization).

    Returns:
        int: The commit count; 0 when a request fails or the fallback
        response is not a JSON list. Request errors are printed.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/commits"
    params = {"per_page": 1}

    try:
        response = requests.head(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        link_header = response.headers.get('Link')

        if link_header:
            # Requiring '?' or '&' before "page" keeps "per_page=1" from being
            # mistaken for the page parameter; [^>]* lets other query
            # parameters follow it inside the same <...> URL.
            match = re.search(r'[?&]page=(\d+)[^>]*>;\s*rel="last"', link_header)
            if match:
                return int(match.group(1))

        # fallback for repos with few commits
        response = requests.get(url, headers=headers, params={"per_page": 100}, timeout=10)
        # Fail on HTTP errors here as well: an error payload is a JSON object,
        # and counting its keys would fabricate a commit count.
        response.raise_for_status()
        commits = response.json()
        return len(commits) if isinstance(commits, list) else 0
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON
        print(f"Error fetching commits for {owner}/{repo}: {e}")
        return 0

# ============================================================
# Helper: Issues and Pull Requests
# ============================================================
def get_issue_and_pull_metrics(owner, repo, headers):
    """
    Count the open issues, closed issues and pull requests of a repository.

    Each count is the ``total_count`` of one query against the GitHub issue
    search endpoint.

    Args:
        owner: Repository owner.
        repo: Repository name.
        headers: HTTP headers sent with the requests (e.g. authorization).

    Returns:
        dict: Keys 'open_issues', 'closed_issues' and 'total_pull_requests'.
        A count whose request fails stays 0 and the error is printed.
    """
    base_search_url = "https://api.github.com/search/issues"
    repo_qualifier = f"repo:{owner}/{repo}"

    # Result key -> search query. All pull requests are counted with a single
    # query: a pull request is either open or closed, so querying both states
    # and adding the totals gives the same number for one request more.
    queries = {
        'open_issues': f"{repo_qualifier} is:issue is:open",
        'closed_issues': f"{repo_qualifier} is:issue is:closed",
        'total_pull_requests': f"{repo_qualifier} is:pr",
    }

    metrics = dict.fromkeys(queries, 0)

    for key, query in queries.items():
        try:
            response = requests.get(base_search_url, headers=headers, params={'q': query, 'per_page': 1}, timeout=10)
            response.raise_for_status()
            metrics[key] = response.json().get('total_count', 0)
        except requests.exceptions.RequestException as e:
            # Report the failure so a 0 caused by a failed request (e.g. a
            # rate limit) is not taken for a real count.
            print(f"Error fetching {key} for {owner}/{repo}: {e}")

    return metrics

# ============================================================
# Main Function: Repository Metrics
# ============================================================
def get_repo_metrics(owner, repo, github_token=None):
    """
    Retrieve the metrics of one GitHub repository.

    Fetches the repository object and combines it with the commit, issue,
    pull-request and contributor counts returned by the helper functions.

    Args:
        owner: Repository owner.
        repo: Repository name.
        github_token: Optional GitHub token; when given it is sent in the
            ``Authorization`` header of every request.

    Returns:
        dict | None: Keys 'Repo', 'NumWatchers', 'NumDays', 'NumCommits',
        'NumOpenIssues', 'NumClosedIssues', 'NumPullReqs' and
        'NumContributors'; None when the repository request fails (the error
        is printed).
    """
    repo_url = f"https://api.github.com/repos/{owner}/{repo}"
    headers = {}
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    try:
        response = requests.get(repo_url, headers=headers, timeout=10)
        response.raise_for_status()
        repo_data = response.json()

        # Watchers are reported by GitHub as 'subscribers_count'.
        # 'stargazers_count' (and the legacy 'watchers_count') is the number
        # of stars, which would only duplicate a star count.
        num_watchers = repo_data.get('subscribers_count', 0)
        num_commits = get_commit_count(owner, repo, headers)
        issue_pr_metrics = get_issue_and_pull_metrics(owner, repo, headers)
        num_closed_issues = issue_pr_metrics['closed_issues']
        num_open_issues = issue_pr_metrics['open_issues']
        num_pull_reqs = issue_pr_metrics['total_pull_requests']
        num_contributors = get_total_contributors(owner, repo, headers)

        # Duration: whole days between the creation timestamp and now (UTC);
        # stays 0 when the repository object has no creation timestamp.
        created_at_str = repo_data.get('created_at')
        num_days = 0
        if created_at_str:
            created_at_aware = parser.parse(created_at_str)
            now_utc = datetime.now(timezone.utc)
            num_days = (now_utc - created_at_aware).days

        return {
            "Repo": f"{owner}/{repo}",
            "NumWatchers": num_watchers,
            "NumDays": num_days,
            "NumCommits": num_commits,
            "NumOpenIssues": num_open_issues,
            "NumClosedIssues": num_closed_issues,
            "NumPullReqs": num_pull_reqs,
            "NumContributors": num_contributors,
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {owner}/{repo}: {e}")
        return None
    
# ============================================================
# Helper: Collect the repository metrics for a list of (owner, repo) pairs
# ============================================================
def fetch_metrics(repo_list, token, limit=10):
    """
    Collect the metrics of several repositories into one DataFrame.

    Args:
        repo_list: List of (owner, repo) tuples.
        token: GitHub token passed on to ``get_repo_metrics``.
        limit: Maximum number of repositories taken from the start of
            ``repo_list``. Defaults to 10, the limit that used to be
            hard-coded for testing; pass ``None`` to process the whole list.

    Returns:
        pandas.DataFrame: One row per repository for which metrics were
        returned; repositories without metrics are skipped.
    """
    selected = repo_list if limit is None else repo_list[:limit]

    results = []
    for owner, repo in selected:
        metrics = get_repo_metrics(owner, repo, token)
        if metrics:
            results.append(metrics)

    # Create the Metric DataFrame
    return pd.DataFrame(results)

# ============================================================
# MAIN PROGRAM
# ============================================================
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")

# One metrics frame per repository list
metrics_df_accept = fetch_metrics(
    repo_list=ACCEPTED_REPOSITORIES,
    token=GITHUB_TOKEN,
)
metrics_df_reject = fetch_metrics(
    repo_list=REJECTED_REPOSITORIES,
    token=GITHUB_TOKEN,
)


Starting data retrieval... (may take a moment due to multiple API calls)


In [47]:
# ============================================================
# Helper: Finalize the dataframe, adding stars and forks
# ============================================================
def finalize_dataframe(metrics_df, repo_df, output_filename):
    """
    Add stars and forks to a metrics frame, tidy its columns and save it.

    Args:
        metrics_df: DataFrame with a 'Repo' column ("owner/repo") and the
            'Num*' metric columns.
        repo_df: DataFrame with 'full_name', 'stars' and 'forks' columns.
        output_filename: Path of the CSV file that is written.

    Returns:
        pandas.DataFrame: The frame that was written, with the columns
        'Repo', 'Contributors', 'Stars', 'Forks', 'Watchers',
        'Duration_Days', 'Commits', 'Open_Issues', 'Closed_Issues' and
        'Pull_Requests'. It has no rows when ``metrics_df`` is empty.
    """
    # Metric column names -> names used in the output
    rename_map = {
        'NumWatchers': 'Watchers',
        'NumDays': 'Duration_Days',
        'NumCommits': 'Commits',
        'NumOpenIssues': 'Open_Issues',
        'NumClosedIssues': 'Closed_Issues',
        'NumPullReqs': 'Pull_Requests',
        'NumContributors': 'Contributors',
        'stars': 'Stars',
        'forks': 'Forks',
    }

    # Column order of the output
    column_order = [
        'Repo', 'Contributors', 'Stars', 'Forks', 'Watchers', 'Duration_Days',
        'Commits', 'Open_Issues', 'Closed_Issues', 'Pull_Requests',
    ]

    if metrics_df.empty:
        # A frame built from an empty result list has no 'Repo' column, so
        # the merge below would raise KeyError. Produce an empty frame with
        # the expected columns instead.
        final_df = pd.DataFrame(columns=column_order)
    else:
        # Left merge: every metrics row is kept, stars/forks are looked up by
        # the repository name; the duplicate key column is dropped afterwards.
        final_df = (
            pd.merge(
                metrics_df,
                repo_df[['full_name', 'stars', 'forks']],
                left_on='Repo',
                right_on='full_name',
                how='left',
            )
            .drop(columns=['full_name'])
            .rename(columns=rename_map)
        )[column_order]

    # Save the result as CSV
    final_df.to_csv(output_filename, index=False)

    return final_df

# ============================================================
# MAIN PROGRAM - Separate Processing
# ============================================================

# Accepted repositories: finalize the metrics and write them to their own CSV
print("\n--- Processing Accepted Repositories ---")
final_df_accept = finalize_dataframe(
    metrics_df=metrics_df_accept,
    repo_df=repo_df,
    output_filename="repo_metrics_accepted.csv",
)

print("\nAccepted Repository Metrics DataFrame Created:")
print(f"Total rows in Accepted DataFrame: {final_df_accept.shape[0]}")

# Rejected repositories: same steps, written to a separate CSV
print("\n--- Processing Rejected Repositories ---")
final_df_reject = finalize_dataframe(
    metrics_df=metrics_df_reject,
    repo_df=repo_df,
    output_filename="repo_metrics_rejected.csv",
)

print("\nRejected Repository Metrics DataFrame Created:")
print(f"Total rows in Rejected DataFrame: {final_df_reject.shape[0]}")


--- Processing Accepted Repositories ---

Accepted Repository Metrics DataFrame Created:
Total rows in Accepted DataFrame: 10

--- Processing Rejected Repositories ---

Rejected Repository Metrics DataFrame Created:
Total rows in Rejected DataFrame: 10
