In [11]:
import requests
import json
import pandas as pd
from typing import List, Dict, Optional
import readability
import syntok.segmenter as segmenter
from dotenv import load_dotenv
import os

# Read the GitHub token from a local .env file so it never appears in the notebook.
load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

# os.getenv returns None when the variable is missing. The fetch helpers then
# send no Authorization header, and GitHub limits unauthenticated clients to
# 60 requests per hour, so make the situation visible instead of silently
# running (and waiting) for hours.
if not GITHUB_TOKEN:
    print(
        "[WARN] GITHUB_API_KEY was not found in ./api_key.env or the environment; "
        "GitHub requests will be unauthenticated and limited to 60 per hour."
    )

# Import the Hao-Li AIDev datasets

In [12]:
# Repository table
repo_df = pd.read_parquet(
    path="hf://datasets/hao-li/AIDev/repository.parquet",
)

# Pull-request table
pr_df = pd.read_parquet(
    path="hf://datasets/hao-li/AIDev/pull_request.parquet",
)

# Issue <-> PR link table, followed by the issue table itself
# (only the id, number and body columns are loaded)
related_issue_df = pd.read_parquet(
    path="hf://datasets/hao-li/AIDev/related_issue.parquet",
)
issue_df = pd.read_parquet(
    path="hf://datasets/hao-li/AIDev/issue.parquet",
    columns=['id', 'number', 'body'],
)

# 1. Prepare the Dataset

In [13]:
# Keep only repositories whose primary language is Java, and from those only
# the two columns needed downstream (repo id and "owner/repo" full name).
java_repo_df = repo_df[repo_df['language'] == 'Java'].copy()
java_repo_select_df = java_repo_df[['id', 'full_name']]

# Join PRs to their (Java) repository: pr.repo_id == repo.id.
# The inner join drops every PR that does not belong to a Java repository.
merged_pr_df = pr_df.merge(
    java_repo_select_df,
    left_on='repo_id',
    right_on='id',
    how='inner'
)
# Number of PRs that belong to Java repositories
print(len(merged_pr_df))
# Both sides carry an 'id' column, so pandas suffixes them: id_x comes from the
# left frame (the PR id) and id_y from the right frame (the repo id, which
# duplicates repo_id). Drop the repo copy and restore the PR id's plain name.
merged_pr_df = merged_pr_df.drop(columns=['id_y'])
merged_pr_df = merged_pr_df.rename(columns={'id_x':'id'})

# Join the issue <-> PR link table: related_issue.pr_id == pr.id.
# PRs without a linked issue are dropped by the inner join.
merged_pr_df = related_issue_df.merge(
    merged_pr_df,
    left_on='pr_id',
    right_on='id',
    how='inner'
)

# 'id' now duplicates pr_id; 'source' is not used later in this notebook.
merged_pr_df = merged_pr_df.drop(columns=['id','source'])


# Join the issue table (id, number, body): issue.id == related_issue.issue_id.
merged_pr_df = issue_df.merge(
    merged_pr_df,
    left_on='id',
    right_on='issue_id',
    how='inner'
)

# 'id' (the issue id) duplicates issue_id. 'number' exists on both sides, so
# number_x is the issue number (left frame) and number_y the PR number (right
# frame). The body_x / body_y columns (issue body / PR body) keep their merge
# suffixes.
merged_pr_df = merged_pr_df.drop(columns=['id'])
rename_map = {
    'number_x':             'issue_number',
    'number_y':           'pr_number',
}
merged_pr_df = merged_pr_df.rename(columns=rename_map)
display(merged_pr_df.head())



1278


Unnamed: 0,issue_number,body_x,pr_id,issue_id,pr_number,title,body_y,agent,user_id,user,state,created_at,closed_at,merged_at,repo_id,repo_url,html_url,full_name
0,23190,### Search before asking\n\n- [X] I searched i...,3250080019,2471504000.0,24542,[fix][broker]Fix thread safety issues in Bucke...,### Motivation\r\n\r\nFixes #23190\r\n\r\nBuck...,Claude_Code,10327630,Apurva007,closed,2025-07-21T21:21:39Z,2025-07-22T06:17:01Z,2025-07-22T06:17:01Z,62117812,https://api.github.com/repos/apache/pulsar,https://github.com/apache/pulsar/pull/24542,apache/pulsar
1,24138,### Search before asking\n\n- [x] I searched i...,2959025892,2955889000.0,24145,[fix][cli] Enhance split-bundle command to acc...,- Modified pulsar-admin CLI to handle both...,Claude_Code,201149937,alexander-nailed-it,closed,2025-03-30T18:27:22Z,2025-04-18T18:12:35Z,,62117812,https://api.github.com/repos/apache/pulsar,https://github.com/apache/pulsar/pull/24145,apache/pulsar
2,3390,"Было бы не плохо, возможно опционально, по умо...",3114848770,2799329000.0,3481,Add excludeTrailingComments option to LineLeng...,This PR adds a new configuration parameter `ex...,Copilot,198982749,Copilot,closed,2025-06-03T17:34:41Z,2025-06-03T20:24:48Z,2025-06-03T20:24:48Z,163654595,https://api.github.com/repos/1c-syntax/bsl-lan...,https://github.com/1c-syntax/bsl-language-serv...,1c-syntax/bsl-language-server
3,3482,Публикация артефактов в OOSRH больше не доступ...,3116095750,3116096000.0,3483,Migrate from legacy OSSRH to Central Portal fo...,This PR migrates the Maven/Sonatype publishing...,Copilot,198982749,Copilot,open,2025-06-04T02:39:37Z,,,163654595,https://api.github.com/repos/1c-syntax/bsl-lan...,https://github.com/1c-syntax/bsl-language-serv...,1c-syntax/bsl-language-server
4,3485,Sentry Issue: [BSL-LANGUAGE-SERVER-DW](https:/...,3145625420,3145622000.0,3486,Fix ClassCastException in MagicNumberDiagnosti...,The `MagicNumberDiagnostic.configure()` method...,Copilot,198982749,Copilot,open,2025-06-14T07:07:30Z,,,163654595,https://api.github.com/repos/1c-syntax/bsl-lan...,https://github.com/1c-syntax/bsl-language-serv...,1c-syntax/bsl-language-server


In [14]:
# A PR counts as accepted when it has a merge timestamp; every other PR
# (closed without merging, or still open) is treated as rejected.
merged_mask = merged_pr_df['merged_at'].notna()

# Only the identifying columns are needed to query the GitHub API later.
pr_key_columns = ['full_name', 'issue_number', 'pr_number']
accepted_prs = merged_pr_df.loc[merged_mask, pr_key_columns]
rejected_prs = merged_pr_df.loc[~merged_mask, pr_key_columns]

# Dump both subsets to CSV for manual inspection.
for frame, csv_name in ((accepted_prs, "accepted_PR.csv"), (rejected_prs, "rejected_PR.csv")):
    frame.to_csv(csv_name, index=False)

## 1.2. Split the full_name of repo into owner and repo name

In [15]:
# ============================================================
# Helper: Split full_name into owner/repo and return a list of (owner, repo, issue_number, pr_number) tuples
# ============================================================
def process_repositories(pr_df):
    """
    Convert a PR/issue DataFrame into a list of work items for the GitHub fetchers.

    The 'full_name' column ("owner/repo") is split on the first "/" and the two
    parts are paired with the row's 'issue_number' and 'pr_number'. No rows are
    filtered here.

    Args:
        pr_df: DataFrame with 'full_name', 'issue_number' and 'pr_number' columns.

    Returns:
        A list of (owner, repo, issue_number, pr_number) tuples, one per row and
        in row order. An empty DataFrame yields an empty list.

    Side effects:
        Prints the first five tuples as a quick sanity check.
    """
    if pr_df.empty:
        # Splitting an empty column with expand=True can produce a frame with no
        # columns at all, which makes the column assignment below fail with a
        # length mismatch. There is nothing to process, so short-circuit.
        repositories = []
    else:
        # 1. Split "owner/repo" on the first slash only, so a repo part that
        #    itself contains "/" stays intact.
        split_df = pr_df['full_name'].str.split('/', n=1, expand=True)
        split_df.columns = ['owner', 'repo']

        # 2. Put the number columns next to the split columns (aligned on index).
        combined_df = pd.concat([split_df, pr_df[['issue_number', 'pr_number']]], axis=1)

        # 3. Emit plain tuples; itertuples avoids a Python-level call per row.
        repositories = list(combined_df.itertuples(index=False, name=None))

    print(repositories[:5])

    return repositories


# Build the (owner, repo, issue_number, pr_number) work lists for the accepted
# and rejected PRs; each call also prints its first five tuples.
ACCEPTED_PULL_REQUEST = process_repositories(accepted_prs)
REJECTED_PULL_REQUEST = process_repositories(rejected_prs)

[('apache', 'pulsar', 23190, 24542), ('1c-syntax', 'bsl-language-server', 3390, 3481), ('Azure', 'azure-sdk-for-java', 42765, 45797), ('Azure', 'azure-sdk-for-java', 45594, 45595), ('Azure', 'azure-sdk-for-java', 45762, 45795)]
[('apache', 'pulsar', 24138, 24145), ('1c-syntax', 'bsl-language-server', 3482, 3483), ('1c-syntax', 'bsl-language-server', 3485, 3486), ('AutoMQ', 'automq', 2650, 2652), ('Azure-Samples', 'azure-search-openai-demo-java', 54, 111)]


# 2. Helper for making rate-limit-aware GitHub API requests

In [16]:
import time
import requests

def safe_request(method, url, headers=None, params=None, timeout=10, sleep_between=0.4, max_retries=5):
    """
    Send a GitHub API request, waiting and retrying when a rate limit is hit.

    Only responses that actually signal rate limiting (status 403 or 429) are
    retried:
    - Primary limit: ``X-RateLimit-Remaining`` is 0. Waits until the time in
      ``X-RateLimit-Reset`` (at least 10 seconds).
    - Secondary limit: a ``Retry-After`` header is present, or the body mentions
      a rate/abuse limit. Waits ``Retry-After`` seconds, or 60 if it is absent.

    A successful response is returned even when it used up the last remaining
    request; the wait then happens on the next call, when GitHub rejects it.
    A 403 that is not a rate limit (e.g. a forbidden resource) is raised
    immediately instead of being retried forever.

    Args:
        method: HTTP method, e.g. "GET" or "HEAD".
        url: Request URL.
        headers: Optional request headers.
        params: Optional query parameters.
        timeout: Per-request timeout in seconds.
        sleep_between: Pause after a successful request, to stay clear of the
            secondary (burst) limit.
        max_retries: Maximum number of rate-limit waits before giving up.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.exceptions.HTTPError: for any non-successful response,
            including a rate-limit response once ``max_retries`` is exhausted.
        requests.exceptions.RequestException: for connection-level failures.
    """
    retries = 0
    while True:
        response = requests.request(method, url, headers=headers, params=params, timeout=timeout)

        if response.status_code in (403, 429) and retries < max_retries:
            remaining = response.headers.get("X-RateLimit-Remaining")
            retry_after = response.headers.get("Retry-After")
            # HEAD responses carry no body; .text is then an empty string.
            message = (response.text or "").lower()

            wait = None
            if str(remaining).strip() == "0":
                # Primary limit: the quota is exhausted until the reset timestamp.
                try:
                    reset_ts = int(response.headers.get("X-RateLimit-Reset", time.time()))
                except (TypeError, ValueError):
                    reset_ts = int(time.time())
                wait = max(reset_ts - int(time.time()), 10)
                print(f"[Primary Limit] Waiting {wait} seconds...")
            elif retry_after is not None or "rate limit" in message or "abuse" in message:
                # Secondary limit: honour Retry-After when it is usable,
                # otherwise back off for one minute.
                try:
                    wait = max(int(retry_after), 1)
                except (TypeError, ValueError):
                    wait = 60
                print(f"[Secondary Limit] Hit GitHub abuse limit. Backing off {wait} seconds...")

            if wait is not None:
                retries += 1
                time.sleep(wait)
                continue

        # Any remaining error status (404, a genuine 403, exhausted retries, ...)
        # is surfaced to the caller as an HTTPError.
        response.raise_for_status()

        # Small delay prevents triggering the secondary limit
        time.sleep(sleep_between)

        return response


# 3. Use the GitHub API to extract metrics

In [None]:
# ----------------------------------------------------
# Helper: Function for Preprocessing and Scoring
# ----------------------------------------------------
def get_readability_score(text: str) -> Optional[float]:
    """
    Compute the Flesch Reading Ease score of a piece of text.

    The text is segmented with syntok into paragraphs, sentences and tokens,
    re-joined into the layout the `readability` package expects (tokens
    separated by spaces, one sentence per line, a blank line between
    paragraphs), and then scored.

    Returns None when the text is empty, equals the placeholder
    "No body content provided.", tokenizes to nothing, or when either the
    segmentation or the scoring step raises.

    Note: scoring is always done with lang='en', whatever language the text is in.
    Based on: https://pypi.org/project/readability/
    """
    # Nothing to score for a missing body or the placeholder text.
    if not text:
        return None
    if text == "No body content provided.":
        return None

    # Step 1: rebuild the text as
    # 'word1 word2 .\nword3 word4 .\n\nnew_paragraph_word1 .'
    try:
        paragraph_chunks = []
        for paragraph in segmenter.analyze(text):
            sentence_lines = [
                ' '.join(token.value for token in sentence)
                for sentence in paragraph
            ]
            paragraph_chunks.append('\n'.join(sentence_lines))
        tokenized = '\n\n'.join(paragraph_chunks)
    except Exception as e:
        print(f"Error during syntok analysis: {e}")
        return None

    # Step 2: score the tokenized text, unless it is blank.
    try:
        if tokenized.strip() == "":
            return None

        measures = readability.getmeasures(tokenized, lang='en')
        return measures['readability grades']['FleschReadingEase']
    except Exception as e:
        print(f"Error calculating readability score: {e}")
        return None

# ============================================================
# Helper 1: Fetch Metrics for Issue
# ============================================================
def fetch_single_issue(owner: str, repo: str, issue_number: int, github_token: Optional[str]) -> Optional[Dict]:
    """
    Fetch one GitHub issue and compute the readability of its body.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        issue_number: Issue number within the repository.
        github_token: Token sent as a Bearer credential; when falsy the request
            is sent without an Authorization header.

    Returns:
        A dict with keys 'issue_number', 'issue_title', 'issue_body' and
        'issue_readability' (the score may be None). Returns None when the
        number refers to a pull request, or when the request fails (a [FAIL]
        line is printed in that case).
    """
    api_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}"

    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    try:
        response = safe_request("GET", api_url, headers=headers)
        response.raise_for_status()
        issue_data = response.json()

        # The issues endpoint also serves pull requests; those carry a
        # 'pull_request' key and are not wanted here.
        if 'pull_request' in issue_data:
            print(f"   [SKIP] #{issue_number} is a Pull Request, skipping.")
            return None

        # A missing or empty body is replaced by a placeholder, which
        # get_readability_score maps to None.
        issue_body = issue_data.get('body') or "No body content provided."

        flesch_score = get_readability_score(issue_body)

        return {
            "issue_number": issue_number,
            "issue_title": issue_data.get('title'),
            "issue_body": issue_body,
            "issue_readability": flesch_score
        }

    except requests.exceptions.HTTPError as e:
        # Read the status from the response attached to the exception. The local
        # `response` is not bound when safe_request itself raises, so using it
        # here would turn every HTTP error into an UnboundLocalError.
        status_code = getattr(e.response, "status_code", None)
        if status_code == 404:
            print(f"   [FAIL] #{issue_number} not found (Error 404).")
        else:
            print(f"   [FAIL] HTTP Error for #{issue_number}: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"   [FAIL] Connection Error for #{issue_number}: {e}")
        return None
    
# ----------------------------------------------------
# Helper 2: Fetch Metrics for PR
# ----------------------------------------------------
def fetch_single_pr(owner: str, repo: str, issue_number: int, pr_number: int, github_token: Optional[str]) -> Optional[Dict]:
    """
    Fetch one pull request together with its linked issue and score both bodies.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        issue_number: Number of the issue linked to the PR.
        pr_number: Pull request number within the repository.
        github_token: Token sent as a Bearer credential; when falsy the request
            is sent without an Authorization header.

    Returns:
        A dict with the PR fields ('Repo', 'PR_Number', 'pr_title', 'pr_body',
        'pr_readability') merged with the issue fields from fetch_single_issue
        ('issue_number', 'issue_title', 'issue_body', 'issue_readability').
        If the issue cannot be fetched, the PR row is still returned with
        'issue_number' set and the other issue fields None. Returns None when
        the PR request itself fails (a [FAIL] line is printed in that case).
    """
    api_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}"

    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    try:
        response = safe_request("GET", api_url, headers=headers)
        response.raise_for_status()
        pr_data = response.json()

        # A missing or empty body is replaced by a placeholder, which
        # get_readability_score maps to None.
        pr_body = pr_data.get('body') or "No body content provided."

        flesch_score = get_readability_score(pr_body)

        issues_readability_metrics = fetch_single_issue(owner, repo, issue_number, github_token)
        if issues_readability_metrics is None:
            # fetch_single_issue returns None for skipped or failed issues, and
            # unpacking None with ** raises TypeError. Keep the PR data and
            # mark the issue side as missing instead.
            print(f"   [WARN] Issue #{issue_number} unavailable; keeping PR #{pr_number} with empty issue fields.")
            issues_readability_metrics = {
                "issue_number": issue_number,
                "issue_title": None,
                "issue_body": None,
                "issue_readability": None,
            }

        return {
            "Repo": f"{owner}/{repo}",
            "PR_Number": pr_number,
            "pr_title": pr_data.get('title'),
            "pr_body": pr_body,
            "pr_readability": flesch_score,
            **issues_readability_metrics
        }

    except requests.exceptions.HTTPError as e:
        # Read the status from the response attached to the exception. The local
        # `response` is not bound when safe_request itself raises, so using it
        # here would turn every HTTP error into an UnboundLocalError.
        status_code = getattr(e.response, "status_code", None)
        if status_code == 404:
            print(f"   [FAIL] PR #{pr_number} not found (Error 404).")
        else:
            print(f"   [FAIL] HTTP Error for PR #{pr_number}: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"   [FAIL] Connection Error for PR #{pr_number}: {e}")
        return None
    
# ============================================================
# Main Helper: Fetch the metrics for every (issue, PR) pair in a list
# ============================================================
def fetch_metrics(repo_list, token):
    """
    Fetch the PR/issue metrics for every work item and collect them in a DataFrame.

    Args:
        repo_list: Iterable of (owner, repo, issue_number, pr_number) tuples.
            Slice it (e.g. repo_list[:10]) before calling to do a quick test run.
        token: GitHub token passed through to fetch_single_pr.

    Returns:
        A DataFrame with one row per work item for which fetch_single_pr
        returned a non-empty result; failed items are left out.
    """
    # Lazily fetch the items one after another, in list order.
    fetched = (
        fetch_single_pr(owner, repo, issue_number, pr_number, token)
        for owner, repo, issue_number, pr_number in repo_list
    )

    # Drop the items that produced no metrics and build the frame in one go.
    return pd.DataFrame([row for row in fetched if row])

# ============================================================
# MAIN PROGRAM
# ============================================================
# Fetch the metrics for the accepted PRs first, then for the rejected ones.
# Every work item triggers GitHub API calls, so this cell can run for a long
# time when the rate limit is reached.
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")
pr_readability_df_accept = fetch_metrics(ACCEPTED_PULL_REQUEST, GITHUB_TOKEN)
pr_readability_df_reject = fetch_metrics(REJECTED_PULL_REQUEST, GITHUB_TOKEN)


Starting data retrieval... (may take a moment due to multiple API calls)
[Primary Limit] Waiting 3554 seconds...
[Primary Limit] Waiting 3554 seconds...
[Primary Limit] Waiting 3553 seconds...
[Primary Limit] Waiting 3553 seconds...
[Primary Limit] Waiting 3550 seconds...
[Primary Limit] Waiting 3552 seconds...


In [18]:
# Preview the first rows of the metrics fetched for the accepted PRs.
display(pr_readability_df_accept.head())

Unnamed: 0,type,pr_number,pr_title,pr_body,pr_readability,issue_number,issue_title,issue_body,issue_readability
0,issue,24542,[fix][broker]Fix thread safety issues in Bucke...,### Motivation\r\n\r\nFixes #23190\r\n\r\nBuck...,20.666222,23190,[Bug] BucketDelayedDeliveryTracker.containsMes...,### Search before asking\n\n- [X] I searched i...,37.379912
1,issue,3481,Add excludeTrailingComments option to LineLeng...,This PR adds a new configuration parameter `ex...,8.134711,3390,Не учитывать инлайн комментарии в длине строки,"Было бы не плохо, возможно опционально, по умо...",198.715
2,issue,45797,Refactor SDK test dependencies from TestBase t...,This PR completes the migration of SDK test cl...,43.31729,42765,Refactor SDKs dependency on TestBase,Update all files taking a dependency on TestBa...,69.141364
3,issue,45595,Remove unnecessary Maven plugins from azure-op...,This PR removes 4 unnecessary Maven plugins fr...,27.222647,45594,Copilot: Remove Unused OpenRewrite plugins,This task is a prompt for GitHub Copilot to co...,49.202272
4,issue,45795,Deprecate SharedTokenCacheCredential and remov...,This PR deprecates the `SharedTokenCacheCreden...,9.421703,45762,Deprecate SharedTokenCacheCredential,**Ask**: \n- [ ] Deprecate `SharedTokenCacheCr...,24.581275


In [19]:
# ============================================================
# Helper: Finalize the dataframe (fill missing values with 0 and save it to CSV)
# ============================================================
def finalize_dataframe(metrics_df, output_filename):
    """
    Replace missing values with 0, write the result to CSV, and return it.

    Args:
        metrics_df: Metrics DataFrame; it is not modified.
        output_filename: Path of the CSV file to write (without the index).

    Returns:
        A new DataFrame equal to metrics_df with every missing value set to 0.

    NOTE(review): a missing readability score also becomes 0 here, so it can no
    longer be told apart from a real score of 0 in the saved file.
    """
    # fillna returns a new frame, so the caller's DataFrame stays untouched.
    filled_df = metrics_df.fillna(value=0)

    # Persist the cleaned frame as CSV.
    filled_df.to_csv(output_filename, index=False)

    return filled_df

# ============================================================
# MAIN PROGRAM - Separate Processing
# ============================================================

# --- Accepted PRs: fill gaps, save to their own CSV file, report the row count ---
print("\n--- Processing Accepted Repositories ---")
final_df_accept = finalize_dataframe(
    metrics_df=pr_readability_df_accept,
    output_filename="issue_pr_readability_accepted.csv",
)

print("\nAccepted Repository Metrics DataFrame Created:")
print(f"Total rows in Accepted DataFrame: {len(final_df_accept)}")

# --- Rejected PRs: same steps, written to a separate CSV file ---
print("\n--- Processing Rejected Repositories ---")
final_df_reject = finalize_dataframe(
    metrics_df=pr_readability_df_reject,
    output_filename="issue_pr_readability_rejected.csv",
)

print("\nRejected Repository Metrics DataFrame Created:")
print(f"Total rows in Rejected DataFrame: {len(final_df_reject)}")


--- Processing Accepted Repositories ---

Accepted Repository Metrics DataFrame Created:
Total rows in Accepted DataFrame: 69

--- Processing Rejected Repositories ---

Rejected Repository Metrics DataFrame Created:
Total rows in Rejected DataFrame: 113
