In [35]:
import requests
from datetime import datetime, timezone
from dateutil import parser 
import re
import numpy as np
import pandas as pd
from typing import Optional, Dict, List, Tuple, Any
import subprocess
from IPython.display import display
import xml.etree.ElementTree as ET
from collections import Counter
from dotenv import load_dotenv
import os

load_dotenv("./api_key.env")
GITHUB_TOKEN = os.getenv("GITHUB_API_KEY")

# Import the Hao-Li AIDev datasets

In [36]:
# Repositories
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")

# Pull Request
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")

# 1. Prepare the Dataset

In [37]:
# Filter the repository data for 'Java' language
java_repo_df = repo_df[repo_df['language'] == 'Java'].copy()
java_repo_select_df = java_repo_df[['id', 'full_name']]

# Join Repo and PR table based on repo id
merged_pr_df = pr_df.merge(
    java_repo_select_df,
    left_on='repo_id',
    right_on='id',
    how='inner'
)

# clean up extra attribute
merged_pr_df = merged_pr_df.drop(columns=['id_y'])
merged_pr_df = merged_pr_df.rename(columns={'id_x':'id'})

# Filter PRs that were rejected (not merged) and create a new attribute
accepted_prs = merged_pr_df[merged_pr_df['merged_at'].notnull()]
rejected_prs = merged_pr_df[merged_pr_df['merged_at'].isnull()]

# Prepare for Merge: Rename the key column
accepted_prs = accepted_prs[['full_name', 'number']]
rejected_prs = rejected_prs[['full_name', 'number']]

# print to csv for checking
accepted_prs.to_csv("accepted_PR.csv", index=False)
rejected_prs.to_csv("rejected_PR.csv", index=False)

## 1.1. Split the full_name of repo into owner and repo name

In [38]:
# ============================================================
# Helper: Split the name and put it in a List of Dict (not needed but ehh accidentally made the method like that)
# ============================================================
def process_repositories(pr_df):
    """
    Filters the DataFrame by status, splits the full_name, and creates a 
    list of (owner, repo) tuples for processing.
    """
    
    # 1. Split the 'full_name' column into 'owner' and 'repo' columns
    split_df = pr_df['full_name'].str.split('/', n=1, expand=True)
    split_df.columns = ['owner', 'repo']
    
    # 2. Combine the split columns and the 'number' column into a list of tuples
    # We use axis=1 to apply the tuple creation row-wise across the three columns
    repositories = pd.concat([split_df, pr_df['number']], axis=1).apply(tuple, axis=1).tolist()
    
    # Print the first 5 results for verification
    print(repositories[:5])
    
    return repositories


ACCEPTED_PULL_REQUEST = process_repositories(accepted_prs)
REJECTED_PULL_REQUEST = process_repositories(rejected_prs)

[('dotCMS', 'core', 32609), ('apache', 'pulsar', 24542), ('dotCMS', 'core', 32771), ('dotCMS', 'core', 32561), ('microsoft', 'ApplicationInsights-Java', 4293)]
[('dotCMS', 'core', 32656), ('dotCMS', 'core', 32657), ('dotCMS', 'core', 32658), ('dotCMS', 'core', 32659), ('dotCMS', 'core', 32660)]


# 2. Helper code block to limit the API rate request

In [39]:
import time
import requests

def safe_request(method, url, headers=None, params=None, timeout=10, sleep_between=0.4):
    """
    A rate-limit-safe GitHub request wrapper that handles:
    - Primary rate limits (5000/hour)
    - Secondary abuse limits (burst protection)
    - GET and HEAD requests
    """
    while True:
        response = requests.request(method, url, headers=headers, params=params, timeout=timeout)

        # Primary rate limit
        remaining = int(response.headers.get("X-RateLimit-Remaining", 1))
        reset_ts = int(response.headers.get("X-RateLimit-Reset", time.time()))

        if remaining == 0:
            wait = max(reset_ts - int(time.time()), 10)
            print(f"[Primary Limit] Waiting {wait} seconds...")
            time.sleep(wait)
            continue

        # Secondary rate limit (abuse detection)
        if response.status_code == 403:
            print("[Secondary Limit] Hit GitHub abuse limit. Backing off 60 seconds...")
            time.sleep(60)
            continue

        # Success or other errors handled normally
        if not response.ok:
            response.raise_for_status()

        # Small delay prevents triggering secondary limit
        time.sleep(sleep_between)

        return response

# 3. Git API to extract information

## 3.1. API to extract git metrics

In [40]:
# ============================================================
# Helper: Get the files name, patch code, addition, deletion, status, and RAW URL
# ============================================================
def get_pr_file_details(owner: str, repo: str, pr_number: int, github_token: Optional[str] = None) -> List[Dict]:
    """
    Fetches the details for all files changed in a Pull Request,
    INCLUDING the raw URL for the file content.
    """
    
    base_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files"
    all_file_details = []
    page = 1
    
    headers = {
        "Accept": "application/vnd.github.v3+json",
    }
    if github_token:
        headers["Authorization"] = f"Bearer {github_token}"

    print(f"Fetching file details for PR #{pr_number} (Paginating 100 files/page)...")

    while True:
        params = {"per_page": 100, "page": page}
        
        try:
            response = safe_request("GET", base_url, headers=headers, params=params,)
            response.raise_for_status()
            files_data = response.json()

            if not files_data:
                break

            for file in files_data:
                filename = file.get('filename')
                patch_content = file.get('patch')
                # final_patch = patch_content if patch_content else "NULL"
                raw_url = file.get('raw_url') 
                
                # All the file metrics here
                all_file_details.append({
                    "filename": filename,
                    #"patch": final_patch,
                    "status": file.get('status'),
                    #"additions": file.get('additions', 0),
                    #"deletions": file.get('deletions', 0),
                    "raw_url": raw_url 
                })
            
            # Check for the next page header
            if 'link' not in response.headers or 'rel="next"' not in response.headers['link']:
                break
                
            page += 1
            
        except requests.exceptions.RequestException as e:
            print(f"Error during API call on page {page}: {e}")
            break
            
    print(f"Finished fetching. Total files processed: {len(all_file_details)}")
    return all_file_details
        
# ============================================================
# Main Helper: Fetch the main metric functions 
# ============================================================
def fetch_metrics(repo_list, token):
    results = []
    # limit the number of repositories processed here for testing REPOSITORIES[:10]:
    for owner, repo, pr_number in repo_list[:50]: # Apply the test limit here
        metrics = get_pr_file_details(owner, repo, pr_number, token)
        if metrics:
            results.append(metrics)
    
    # Create the Metric DataFrame
    return results # Return a List[List[Dict]] to be process later

# ============================================================
# MAIN PROGRAM
# ============================================================
print("\nStarting data retrieval... (may take a moment due to multiple API calls)")
files_list_accepted = fetch_metrics(ACCEPTED_PULL_REQUEST, GITHUB_TOKEN)
files_list_rejected = fetch_metrics(REJECTED_PULL_REQUEST, GITHUB_TOKEN)


Starting data retrieval... (may take a moment due to multiple API calls)
Fetching file details for PR #32609 (Paginating 100 files/page)...
Finished fetching. Total files processed: 9
Fetching file details for PR #24542 (Paginating 100 files/page)...
Finished fetching. Total files processed: 4
Fetching file details for PR #32771 (Paginating 100 files/page)...
Finished fetching. Total files processed: 2
Fetching file details for PR #32561 (Paginating 100 files/page)...
Finished fetching. Total files processed: 126
Fetching file details for PR #4293 (Paginating 100 files/page)...
Finished fetching. Total files processed: 1
Fetching file details for PR #7783 (Paginating 100 files/page)...
Finished fetching. Total files processed: 2
Fetching file details for PR #7739 (Paginating 100 files/page)...
[Primary Limit] Waiting 1246 seconds...
Finished fetching. Total files processed: 2
Fetching file details for PR #4262 (Paginating 100 files/page)...
Finished fetching. Total files processed: 1


## 3.2. Restructure the File List and Filter the Java files

In [41]:
# ============================================================
# Helper: Filter and Aggregate PR Data
# ============================================================
def filter_and_aggregate_pr_data(pr_files_list: List[List[Dict]], repo_list: List[Tuple[str, str, int]]) -> List[Dict]:
    """
    Filters file details for Java files that are not deleted and aggregates the 
    relevant data (like raw_urls) at the Pull Request level.

    Args:
        pr_files_list: The nested list of file details from the GitHub API.
        repo_list: The original list of (owner, repo, pr_number) tuples 
                   used to fetch the data.

    Returns:
        A list of dictionaries, one for each PR, containing aggregated metrics.
    """
    aggregated_pr_data = []
    
    # Iterate through the results for each PR
    for pr_index, pr_files in enumerate(pr_files_list):
        
        # Safely retrieve metadata for the current PR
        if pr_index >= len(repo_list):
            print(f"Warning: Missing metadata for PR at index {pr_index}. Skipping.")
            continue
            
        owner, repo, pr_number = repo_list[pr_index]
        
        java_files_to_analyze = []
        
        # --- File-level filtering ---
        for file in pr_files:
            filename = file.get('filename', '')
            status = file.get('status', '')
            
            # 1. Detect .java file in the file name (case-insensitive)
            is_java = filename.lower().endswith('.java')
            
            # 2. Exclude status deleted (we only analyze added or modified code)
            is_not_deleted = status != 'deleted'
            
            # Store the file name and raw URL for the non-deleted Java file
            if is_java and is_not_deleted:
                java_files_to_analyze.append({
                    "file_name": filename,
                    "raw_url": file.get('raw_url')
                })
        
        # --- PR-level aggregation ---
        aggregated_pr_data.append({
            'owner': owner,
            'repo': repo,
            'pr_number': pr_number,
            'java_files_analyzed_count': len(java_files_to_analyze),
            'files_to_analyze': java_files_to_analyze, # List[Dict]
            # add the PMD violation counts later
            'pmd_violations': {} 
        })
        
    return aggregated_pr_data

# ============================================================
# MAIN PROGRAM: 
# ============================================================
print("\nStarting filtering and sorting")
pr_code_metrics_filtered_accepted = filter_and_aggregate_pr_data(files_list_accepted, ACCEPTED_PULL_REQUEST)
pr_code_metrics_filtered_rejected = filter_and_aggregate_pr_data(files_list_accepted, ACCEPTED_PULL_REQUEST)


Starting filtering and sorting


## 3.3. Download the changed Java files and do PMD static code analysis

In [None]:
# ============================================================
# Helper: Download the java files
# ============================================================
def download_file(raw_url, local_path, token=None):
    """Downloads a single file from GitHub's raw URL."""
    headers = {}
    if token:
        headers['Authorization'] = f"token {token}"
        
    response = requests.get(raw_url, headers=headers)
    
    if response.status_code == 200:
        # Create directories if they don't exist
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        
        with open(local_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        return True
    else:
        print(f"Failed to download {raw_url}. Status: {response.status_code}")
        return False
    
# ============================================================
# Helper: Run PMD static code analyzer
# ============================================================
PMD_EXECUTABLE = "pmd.bat"
RULESET_PATH = "D:/Program Files/PMD/pmd-bin-7.18.0/rulesets/java/quickstart.xml"
BASE_STAGING_DIR = "./pr_analysis_staging"

def run_pmd_for_pr(owner: str, repo: str, pr_number: int, ruleset_path: str = RULESET_PATH) -> Optional[str]:
    """
    Runs PMD on the staging directory for a specific Pull Request.

    Args:
        owner: The repository owner.
        repo: The repository name.
        pr_number: The Pull Request number.
        ruleset_path: Path to the PMD ruleset XML file.

    Returns:
        The path to the generated XML report file, or None if the analysis failed.
    """
    
    # Dynamically define the source directory based on the download structure
    source_dir = os.path.join(BASE_STAGING_DIR, owner, repo, str(pr_number))
    
    # Define a unique output file path inside the PR's staging folder
    output_file_name = f"pmd_report_{pr_number}.xml"
    output_file_path = os.path.join(source_dir, output_file_name)

    # 1. Construct the PMD command
    pmd_command = [
        PMD_EXECUTABLE,
        "check",
        "-d", source_dir,
        "-R", ruleset_path,
        "-f", "xml",
        "-r", output_file_path
    ]
    
    print(f"Executing PMD for PR #{pr_number}: {' '.join(pmd_command)}")

    try:
        # 2. Execute the command
        result = subprocess.run(
            pmd_command, 
            capture_output=True, 
            text=True, 
            check=False # PMD returns exit code 4 on violations
        )
        
        # Check success (0 or 4)
        if result.returncode == 0 or result.returncode == 4:
            print(f"PMD analysis completed. Report saved to: {output_file_path}")
            return output_file_path
        else:
            print(f"PMD command failed (code {result.returncode}). Error:\n{result.stderr}")
            return None

    except FileNotFoundError:
        print(f"Error: PMD executable '{PMD_EXECUTABLE}' not found.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during PMD execution: {e}")
        return None

# ============================================================
# Helper: Extract violation rules from the xml file and count them
# ============================================================
def parse_pmd_report_to_counts(xml_file_path: str, pr_number: int) -> Optional[Dict[str, Any]]:
    """
    Parses a PMD XML report, counts the violations for each unique rule, 
    and formats the result as a dictionary suitable for a DataFrame row.

    Args:
        xml_file_path: The full path to the generated PMD XML report.
        pr_number: The Pull Request number, added as an identifier in the output.

    Returns:
        A dictionary of rule counts (e.g., {'pr_number': 123, 'ruleid_avoidusingvolatile': 1}), 
        or None if the file cannot be parsed.
    """
    
    if not os.path.exists(xml_file_path):
        print(f"Error: PMD report not found at {xml_file_path}")
        return None

    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        # Handle the XML namespace (necessary for PMD reports)
        # Extracts the namespace URL from the root tag
        namespace_match = root.tag.split('}')
        namespace = namespace_match[0] + '}' if len(namespace_match) > 1 else ''

        # XPath to find all 'violation' elements within 'file' elements
        violation_elements = root.findall(f"{namespace}file/{namespace}violation")

        rule_ids = []
        for violation in violation_elements:
            # The rule ID is stored in the 'rule' attribute
            rule_id = violation.get('rule')
            if rule_id:
                rule_ids.append(rule_id)

        # 1. Count the Rules
        rule_counts = Counter(rule_ids)

        # 2. Format the Output Dictionary
        formatted_data = {}
        
        # Add PR number for merging with other datasets
        formatted_data['pr_number'] = pr_number
        
        #
        for rule, count in rule_counts.items():
            # Standardize and prefix the rule name
            # Note: This simple standardization handles the core formatting.
            sanitized_rule_name = rule.lower().replace(' ', '_').replace('-', '_')
            formatted_data[f"ruleid_{sanitized_rule_name}"] = count

        return formatted_data

    except ET.ParseError as e:
        print(f"Error parsing XML file {xml_file_path}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during parsing: {e}")
        return None
    
# ============================================================
# Main: Download the changed files of the PR
# ============================================================
for pr_data in pr_code_metrics_filtered_accepted:
    owner = pr_data['owner']
    repo = pr_data['repo']
    pr_number = pr_data['pr_number']
    files_to_analyze = pr_data['files_to_analyze'] 
    
    # ------------------------------------------------------------------
    # 1. Checkout the PR branch and download the right changed files
    # ------------------------------------------------------------------
    # Create a local staging directory structure. Prevents file path clashes and organizes your PMD reports
    base_dir = "./pr_analysis_staging"
    local_staging_dir = os.path.join(base_dir, owner, repo, str(pr_number))
    
    print(f"\n--- Processing PR: {owner}/{repo} #{pr_number} ({len(files_to_analyze)} files) ---")
    
    # Inner loop: Iterate over the list of file dictionaries
    for file_data in files_to_analyze:
        # Get the two keys you need from the file dictionary
        file_name = file_data['file_name']
        raw_url = file_data['raw_url']
        
        # Determine the full local path for this file
        local_path = os.path.join(local_staging_dir, file_name)
        download_file(raw_url, local_path, GITHUB_TOKEN)
        
for pr_data in pr_code_metrics_filtered_rejected:
    owner = pr_data['owner']
    repo = pr_data['repo']
    pr_number = pr_data['pr_number']
    files_to_analyze = pr_data['files_to_analyze'] 
    
    # ------------------------------------------------------------------
    # 1. Checkout the PR branch and download the right changed files
    # ------------------------------------------------------------------
    # Create a local staging directory structure. Prevents file path clashes and organizes your PMD reports
    base_dir = "./pr_analysis_staging"
    local_staging_dir = os.path.join(base_dir, owner, repo, str(pr_number))
    
    print(f"\n--- Processing PR: {owner}/{repo} #{pr_number} ({len(files_to_analyze)} files) ---")
    
    # Inner loop: Iterate over the list of file dictionaries
    for file_data in files_to_analyze:
        # Get the two keys you need from the file dictionary
        file_name = file_data['file_name']
        raw_url = file_data['raw_url']
        
        # Determine the full local path for this file
        local_path = os.path.join(local_staging_dir, file_name)
        download_file(raw_url, local_path, GITHUB_TOKEN)
    


--- Processing PR: dotCMS/core #32609 (0 files) ---

--- Processing PR: apache/pulsar #24542 (4 files) ---

--- Processing PR: dotCMS/core #32771 (0 files) ---

--- Processing PR: dotCMS/core #32561 (118 files) ---

--- Processing PR: microsoft/ApplicationInsights-Java #4293 (1 files) ---

--- Processing PR: microsoft/typespec #7783 (0 files) ---

--- Processing PR: microsoft/typespec #7739 (0 files) ---

--- Processing PR: valkey-io/valkey-glide #4262 (0 files) ---

--- Processing PR: microsoft/typespec #7661 (0 files) ---

--- Processing PR: microsoft/typespec #7667 (0 files) ---

--- Processing PR: microsoft/typespec #7779 (0 files) ---

--- Processing PR: microsoft/typespec #7439 (0 files) ---

--- Processing PR: valkey-io/valkey-glide #4456 (0 files) ---

--- Processing PR: microsoft/ApplicationInsights-Java #4326 (6 files) ---

--- Processing PR: microsoft/ApplicationInsights-Java #4252 (2 files) ---

--- Processing PR: microsoft/ApplicationInsights-Java #4257 (5 files) ---

---

### 3.3.1. Run PMD and count the violations rules

In [None]:
all_pmd_metrics = []
for pr_data in pr_code_metrics_filtered_accepted:
    owner = pr_data['owner']
    repo = pr_data['repo']
    pr_number = pr_data['pr_number']
    files_to_analyze = pr_data['files_to_analyze'] 
    
    # Create a local staging directory structure. Prevents file path clashes and organizes your PMD reports
    base_dir = "./pr_analysis_staging"
    local_staging_dir = os.path.join(base_dir, owner, repo, str(pr_number))
    
    print(f"\n--- Processing PR: {owner}/{repo} #{pr_number} ({len(files_to_analyze)} files) ---")
    
    # ------------------------------------------------------------------
    # 2. Run PMD Analysis on the downloaded files
    # ------------------------------------------------------------------
    pmd_report_path = run_pmd_for_pr(owner, repo, pr_number)
    
    # ------------------------------------------------------------------
    # 3. Next step: Parse the XML report and filter by the patch/diff data
    # ------------------------------------------------------------------
    if pmd_report_path:
        # Store the report path in your PR data dictionary for later parsing
        pr_data['pmd_report_path'] = pmd_report_path
        rule_counts_dict = parse_pmd_report_to_counts(pmd_report_path, pr_number)
        
        if rule_counts_dict:
            # Add the dictionary to your collection list
            all_pmd_metrics.append(rule_counts_dict)
            
            # Optionally, update the original PR data structure
            pr_data['pmd_violations'] = rule_counts_dict

        
    else:
        print(f"Skipping filtering for PR #{pr_number} due to PMD failure.")
        
# ============================================================
# Final Step: Convert the list of dictionaries to a single DataFrame
# ============================================================
if all_pmd_metrics:
    # Use fillna(0) to ensure PRs that didn't violate a rule get a count of 0
    df_pmd_counts = pd.DataFrame(all_pmd_metrics).fillna(0)
    
    # 2. Reorder the columns this way since there are too many weird columns name
    # Get all column names
    all_cols = df_pmd_counts.columns.tolist()
    
    # Filter out the 'pr_number' column 
    rule_id_cols = [col for col in all_cols if col != 'pr_number']
    
    # Define the new column order: ['pr_number', then all the rest]
    new_col_order = ['pr_number'] + sorted(rule_id_cols)
    
    # Apply the new order to the DataFrame
    df_pmd_counts = df_pmd_counts[new_col_order]
    
    # Save the final PMD features to a CSV
    df_pmd_counts.to_csv("all_pr_pmd_features_accepted.csv", index=False)


--- Processing PR: dotCMS/core #32609 (0 files) ---
Executing PMD for PR #32609: pmd.bat check -d ./pr_analysis_staging\dotCMS\core\32609 -R D:/Program Files/PMD/pmd-bin-7.18.0/rulesets/java/quickstart.xml -f xml -r ./pr_analysis_staging\dotCMS\core\32609\pmd_report_32609.xml
PMD command failed (code 1). Error:
[ERROR] No such file .\pr_analysis_staging\dotCMS\core\32609
[ERROR] Could not initialize analysis: java.nio.file.NoSuchFileException: .\pr_analysis_staging\dotCMS\core\32609\pmd_report_32609.xml

Skipping filtering for PR #32609 due to PMD failure.

--- Processing PR: apache/pulsar #24542 (4 files) ---
Executing PMD for PR #24542: pmd.bat check -d ./pr_analysis_staging\apache\pulsar\24542 -R D:/Program Files/PMD/pmd-bin-7.18.0/rulesets/java/quickstart.xml -f xml -r ./pr_analysis_staging\apache\pulsar\24542\pmd_report_24542.xml
PMD analysis completed. Report saved to: ./pr_analysis_staging\apache\pulsar\24542\pmd_report_24542.xml

--- Processing PR: dotCMS/core #32771 (0 files)

In [None]:
all_pmd_metrics = []
for pr_data in pr_code_metrics_filtered_rejected:
    owner = pr_data['owner']
    repo = pr_data['repo']
    pr_number = pr_data['pr_number']
    files_to_analyze = pr_data['files_to_analyze'] 
    
    # Create a local staging directory structure. Prevents file path clashes and organizes your PMD reports
    base_dir = "./pr_analysis_staging"
    local_staging_dir = os.path.join(base_dir, owner, repo, str(pr_number))
    
    print(f"\n--- Processing PR: {owner}/{repo} #{pr_number} ({len(files_to_analyze)} files) ---")
    
    # ------------------------------------------------------------------
    # 2. Run PMD Analysis on the downloaded files
    # ------------------------------------------------------------------
    pmd_report_path = run_pmd_for_pr(owner, repo, pr_number)
    
    # ------------------------------------------------------------------
    # 3. Next step: Parse the XML report and filter by the patch/diff data
    # ------------------------------------------------------------------
    if pmd_report_path:
        # Store the report path in your PR data dictionary for later parsing
        pr_data['pmd_report_path'] = pmd_report_path
        rule_counts_dict = parse_pmd_report_to_counts(pmd_report_path, pr_number)
        
        if rule_counts_dict:
            # Add the dictionary to your collection list
            all_pmd_metrics.append(rule_counts_dict)
            
            # Optionally, update the original PR data structure
            pr_data['pmd_violations'] = rule_counts_dict

        
    else:
        print(f"Skipping filtering for PR #{pr_number} due to PMD failure.")
        
# ============================================================
# Final Step: Convert the list of dictionaries to a single DataFrame
# ============================================================
if all_pmd_metrics:
    # Use fillna(0) to ensure PRs that didn't violate a rule get a count of 0
    df_pmd_counts = pd.DataFrame(all_pmd_metrics).fillna(0)
    
    # 2. Reorder the columns this way since there are too many weird columns name
    # Get all column names
    all_cols = df_pmd_counts.columns.tolist()
    
    # Filter out the 'pr_number' column 
    rule_id_cols = [col for col in all_cols if col != 'pr_number']
    
    # Define the new column order: ['pr_number', then all the rest]
    new_col_order = ['pr_number'] + sorted(rule_id_cols)
    
    # Apply the new order to the DataFrame
    df_pmd_counts = df_pmd_counts[new_col_order]
    
    # Save the final PMD features to a CSV
    df_pmd_counts.to_csv("all_pr_pmd_features_rejected.csv", index=False)