# Data Preparation Notebook (Annotating + Filtering)

## Imports and Loading Dataset

In [1]:
import pandas as pd
import os

# Basic
pr_df = pd.read_parquet("hf://datasets/hao-li/AIDev/pull_request.parquet")
repo_df = pd.read_parquet("hf://datasets/hao-li/AIDev/repository.parquet")
user_df = pd.read_parquet("hf://datasets/hao-li/AIDev/user.parquet")

# Comments and reviews
pr_comments_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pr_comments.parquet")
pr_reviews_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pr_reviews.parquet")
pr_review_comments_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pr_review_comments_v2.parquet")

# Commits
pr_commits_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pr_commits.parquet")
pr_commit_details_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pr_commit_details.parquet")

# Related issues
related_issue_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/related_issue.parquet")
issue_df = pd.read_parquet("hf://datasets/hao-li/AIDev/issue.parquet")

# Events
pr_timeline_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pr_timeline.parquet")

# Task type
pr_task_type_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/pr_task_type.parquet")

# Human-PR
human_pr_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/human_pull_request.parquet")
human_pr_task_type_df = pd.read_parquet(
    "hf://datasets/hao-li/AIDev/human_pr_task_type.parquet")

print(f"len(pr_df): {len(pr_df)}")
print(f"len(repo_df): {len(repo_df)}")
print(f"len(user_df): {len(user_df)}")

print(f"\nlen(pr_comments_df): {len(pr_comments_df)}")
print(f"len(pr_reviews_df): {len(pr_reviews_df)}")
print(f"len(pr_review_comments_df): {len(pr_review_comments_df)}")

print(f"\nlen(pr_commits_df): {len(pr_commits_df)}")
print(f"len(pr_commit_details_df): {len(pr_commit_details_df)}")


print(f"\nlen(related_issue_df): {len(related_issue_df)}")
print(f"len(issue_df): {len(issue_df)}")

print(f"\nlen(pr_timeline_df): {len(pr_timeline_df)}")

print(f"\nlen(pr_task_type_df): {len(pr_task_type_df)}")

print(f"\nlen(human_pr_df): {len(human_pr_df)}")
print(f"len(human_pr_task_type_df): {len(human_pr_task_type_df)}")

len(pr_df): 33596
len(repo_df): 2807
len(user_df): 1796

len(pr_comments_df): 39122
len(pr_reviews_df): 28875
len(pr_review_comments_df): 26868

len(pr_commits_df): 88576
len(pr_commit_details_df): 711923

len(related_issue_df): 4923
len(issue_df): 4614

len(pr_timeline_df): 325500

len(pr_task_type_df): 33596

len(human_pr_df): 6618
len(human_pr_task_type_df): 6618


## Save Initial Dataset to CSV

In [2]:
def save_dataframes_to_csv(subdirectory):
    # Check if ./data/{subdirectory} directory exists
    data_dir = "./data"
    sub_dir = os.path.join(data_dir, subdirectory)

    # Create the directory if it doesn't exist
    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)

    # Save all dataframes to CSV (will overwrite existing files)
    pr_df.to_csv(os.path.join(sub_dir, "pull_request.csv"), index=False)
    repo_df.to_csv(os.path.join(sub_dir, "repository.csv"), index=False)
    user_df.to_csv(os.path.join(sub_dir, "user.csv"), index=False)
    pr_comments_df.to_csv(os.path.join(
        sub_dir, "pr_comments.csv"), index=False)
    pr_reviews_df.to_csv(os.path.join(
        sub_dir, "pr_reviews.csv"), index=False)
    pr_review_comments_df.to_csv(os.path.join(
        sub_dir, "pr_review_comments.csv"), index=False)
    pr_commits_df.to_csv(os.path.join(
        sub_dir, "pr_commits.csv"), index=False)
    pr_commit_details_df.to_csv(os.path.join(
        sub_dir, "pr_commit_details.csv"), index=False)
    related_issue_df.to_csv(os.path.join(
        sub_dir, "related_issue.csv"), index=False)
    issue_df.to_csv(os.path.join(sub_dir, "issue.csv"), index=False)
    pr_timeline_df.to_csv(os.path.join(
        sub_dir, "pr_timeline.csv"), index=False)
    pr_task_type_df.to_csv(os.path.join(
        sub_dir, "pr_task_type.csv"), index=False)
    human_pr_df.to_csv(os.path.join(
        sub_dir, "human_pull_request.csv"), index=False)
    human_pr_task_type_df.to_csv(os.path.join(
        sub_dir, "human_pr_task_type.csv"), index=False)

save_dataframes_to_csv("original")

## Annotating Features

New columns added to both human and agentic PR dataframes:

- accepted (bool)
- rejected (bool)
- turnaround_time (seconds) - number of seconds between creation and decision (merged/closed)

New columns added to only agentic PR dataframes:

- related_issue (bool)
- touches_test_file (bool) - whether "test" appears in any of the filenames modified by the PR
- lines_added (int) - number of lines added throughout all PR commits
- lines_deleted (int) - number of lines deleted throughout all PR commits
- total_churn (int) - total churn across all PR commits (lines added + lines deleted)
- net_churn (int) - net churn across all PR commits (lines added - lines deleted)
- num_bot_users (int) - how many bots were involved in the PR (either in comments or reviews)
- num_human_users (int) - how many humans were involved in the PR (either in comments or reviews)
- num_total_users (int) - how many users (bots & humans) were involved in the PR (either in comments or reviews)
- num_comments (int) - total number of comments (bot & human)
- num_human_comments (int) - number of human comments
- num_bot_comments (int) - number of bot comments
- num_reviews (int) - total number of reviews (bot & human)
- num_human_reviews (int) - number of human reviews
- num_bot_reviews (int) - number of bot reviews

TODO:

- num_prior_prs - number of PRs submitted to the same repository by the author before the current PR
- fix error: current implementation of related issue times fail when PRs have multiple related issues
    - issue_created_at (time) - (if related issue exists) issue creation time
    - issue_closed_at (time) - (if related issue exists) issue closure time
    - issue_turnaround_time (seconds) - (if related issue exists) time between issue creation and closure
    - time_to_close_issue_after_pr (seconds) - (if related issue exists) time to close the issue after the PR was merged.

In [3]:
def check_duplicates(pr_dataframe):
    duplicate_count = pr_dataframe['id'].duplicated().sum()
    print(f"Number of duplicate PR IDs: {duplicate_count}")
    if duplicate_count > 0:
        print("Duplicate PR IDs:", pr_dataframe[pr_dataframe['id'].duplicated(keep=False)]['id'].unique())

In [4]:
def annotate_pr_features(pr_dataframe, human=False):
    """
    Annotate PR dataframe with additional features.

    Parameters:
    - pr_dataframe: DataFrame containing pull request data

    Returns:
    - Annotated PR DataFrame
    """
    # Create a copy to avoid modifying the original
    df = pr_dataframe.copy()

    # Drop columns if they already exist
    columns_to_drop = ['accepted', 'rejected', 'pending', 'turnaround_time', 'related_issue', 'issue_created_at', 'issue_closed_at', 'issue_turnaround_time', 'time_to_close_issue_after_pr', 'num_files_changed', 'touches_test_file',
                       'lines_added', 'lines_deleted', 'net_churn', 'total_churn', 'num_bot_users', 'num_human_users', 'num_total_users', 'num_comments', 'num_human_comments', 'num_bot_comments', 'num_reviews', 'num_human_reviews', 'num_bot_reviews']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    df['accepted'] = df['merged_at'].notnull()
    df['rejected'] = (df['state'] == 'closed') & df['merged_at'].isnull()
    df['turnaround_time'] = (pd.to_datetime(
        df['closed_at']) - pd.to_datetime(df['created_at'])).dt.total_seconds()

    if not human:
        # Related Issue
        df['related_issue'] = df['id'].isin(
            related_issue_df['pr_id'].unique())
        
        print("After annotating related_issue:")
        check_duplicates(df)

        # TODO: Fix handling multiple related issues per PR
        # # Issue Turnaround Times (diff between issue creation and closure AND time to close issue after PR closure)
        # related_issues_with_times = related_issue_df.merge(
        #     issue_df[['id', 'created_at', 'closed_at']], left_on='issue_id', right_on='id', how='left')

        # related_issues_with_times.drop(columns=['id', 'source'], inplace=True)

        # related_issues_with_times['issue_turnaround_time'] = (pd.to_datetime(
        #     related_issues_with_times['closed_at']) - pd.to_datetime(related_issues_with_times['created_at'])).dt.total_seconds()

        # related_issues_with_times.columns = [
        #     'pr_id', 'issue_id', 'issue_created_at', 'issue_closed_at', 'issue_turnaround_time']

        # df = df.merge(
        #     related_issues_with_times[['pr_id', 'issue_created_at', 'issue_closed_at', 'issue_turnaround_time']], left_on='id', right_on='pr_id', how='left')

        # df['time_to_close_issue_after_pr'] = (pd.to_datetime(
        #     df['issue_closed_at']) - pd.to_datetime(df['closed_at'])).dt.total_seconds()

        # df.drop(columns=['pr_id'], inplace=True)

        # print("After annotating issue turnaround times:")
        # check_duplicates(df)

        # Number of files changed per PR
        pr_files_changed = pr_commit_details_df.groupby(
            'pr_id')['filename'].nunique().reset_index()
        pr_files_changed.columns = ['id', 'num_files_changed']
        df = df.merge(pr_files_changed, on='id', how='left')

        def touches_test_file(filenames):
            for filename in filenames:
                if filename is not None and 'test' in filename.lower():
                    return True
            return False

        pr_files = pr_commit_details_df.groupby(
            'pr_id')['filename'].apply(list).reset_index()
        pr_files['touches_test_file'] = pr_files['filename'].apply(
            touches_test_file)
        df = df.merge(pr_files[['pr_id', 'touches_test_file']],
                      left_on='id', right_on='pr_id', how='left')
        df = df.drop('pr_id', axis=1)
        df['touches_test_file'] = df['touches_test_file'].fillna(False)

        print("After annotating file change features:")
        check_duplicates(df)

        # PR Size (number of lines added and deleted across all commits)
        pr_size = pr_commit_details_df.groupby('pr_id').agg(
            {'additions': 'sum', 'deletions': 'sum'}).reset_index()
        pr_size.columns = ['id', 'lines_added', 'lines_deleted']
        df = df.merge(pr_size, on='id', how='left')

        # Net Churn & Total Churn
        df['net_churn'] = df['lines_added'] - df['lines_deleted']
        df['total_churn'] = df['lines_added'] + df['lines_deleted']

        ### Number of Human/Bot/Total Reviews/Comments/Involvement ###

        # Get unique users from comments
        comment_users = pr_comments_df.groupby('pr_id').apply(
            lambda x: list(zip(x['user'], x['user_type']))
        ).reset_index()
        comment_users.columns = ['pr_id', 'comment_users']

        # Get comment counts by type
        comment_counts = pr_comments_df.groupby('pr_id').apply(
            lambda x: pd.Series({
                'num_comments': len(x),
                'num_human_comments': sum(x['user_type'] == 'User'),
                'num_bot_comments': sum(x['user_type'] == 'Bot')
            })
        ).reset_index()

        # Get unique users from reviews
        review_users = pr_reviews_df.groupby('pr_id').apply(
            lambda x: list(zip(x['user'], x['user_type']))
        ).reset_index()
        review_users.columns = ['pr_id', 'review_users']

        # Get review counts by type
        review_counts = pr_reviews_df.groupby('pr_id').apply(
            lambda x: pd.Series({
                'num_reviews': len(x),
                'num_human_reviews': sum(x['user_type'] == 'User'),
                'num_bot_reviews': sum(x['user_type'] == 'Bot')
            })
        ).reset_index()

        # Merge and combine users
        user_interactions = comment_users.merge(
            review_users, on='pr_id', how='outer')
        user_interactions['comment_users'] = user_interactions['comment_users'].apply(
            lambda x: x if isinstance(x, list) else [])
        user_interactions['review_users'] = user_interactions['review_users'].apply(
            lambda x: x if isinstance(x, list) else [])

        # Combine all users and count unique bots and humans
        def count_user_types(row):
            all_users = row['comment_users'] + row['review_users']
            unique_users = list(set(all_users))

            num_bots = sum(
                1 for user, user_type in unique_users if user_type == 'Bot')
            num_humans = sum(
                1 for user, user_type in unique_users if user_type == 'User')
            num_total = len(unique_users)

            return pd.Series({
                'num_bot_users': num_bots,
                'num_human_users': num_humans,
                'num_total_users': num_total
            })

        user_counts = user_interactions.apply(count_user_types, axis=1)
        user_interactions = pd.concat(
            [user_interactions[['pr_id']], user_counts], axis=1)

        # Merge all counts together
        user_interactions = user_interactions.merge(
            comment_counts, on='pr_id', how='outer')
        user_interactions = user_interactions.merge(
            review_counts, on='pr_id', how='outer')

        # Merge with PR dataframe
        df = df.merge(user_interactions, left_on='id',
                      right_on='pr_id', how='left')
        df = df.drop('pr_id', axis=1)

        # Fill NaN values with 0
        for col in ['num_bot_users', 'num_human_users', 'num_total_users',
                    'num_comments', 'num_human_comments', 'num_bot_comments',
                    'num_reviews', 'num_human_reviews', 'num_bot_reviews']:
            df[col] = df[col].fillna(0).astype(int)
        
        print("After annotating interaction features:")
        check_duplicates(df)

    return df


# Apply annotation to both dataframes
pr_df = annotate_pr_features(pr_df)
human_pr_df = annotate_pr_features(human_pr_df, human=True)

After annotating related_issue:
Number of duplicate PR IDs: 0


  df['touches_test_file'] = df['touches_test_file'].fillna(False)


After annotating file change features:
Number of duplicate PR IDs: 0


  comment_users = pr_comments_df.groupby('pr_id').apply(
  comment_counts = pr_comments_df.groupby('pr_id').apply(
  review_users = pr_reviews_df.groupby('pr_id').apply(
  review_counts = pr_reviews_df.groupby('pr_id').apply(


After annotating interaction features:
Number of duplicate PR IDs: 0


### Interesting Example: Bot Labelled as Human

In [5]:
pr_df.head(1)

Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,...,total_churn,num_bot_users,num_human_users,num_total_users,num_comments,num_human_comments,num_bot_comments,num_reviews,num_human_reviews,num_bot_reviews
0,3264933329,2911,Fix: Wait for all partitions in load_collectio...,## Summary\n\nFixes an issue where `load_colle...,Claude_Code,108661493,weiliu1031,closed,2025-07-26T02:59:01Z,2025-07-29T07:01:20Z,...,396.0,0,2,2,2,2,0,0,0,0


### Interesting Example: Missing Human Review

In [6]:
pr_df[pr_df['id'] == 3231949586]

Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,...,total_churn,num_bot_users,num_human_users,num_total_users,num_comments,num_human_comments,num_bot_comments,num_reviews,num_human_reviews,num_bot_reviews
5,3231949586,32656,feat(swagger): Add Swagger annotations to Batc...,## Summary\nProgressive implementation of Swag...,Claude_Code,1236198,spbolton,open,2025-07-15T11:46:41Z,,...,3247.0,1,1,2,0,0,0,4,1,3


In [7]:
pr_comments_df[pr_comments_df['pr_id'] == 3231949586]

Unnamed: 0,id,pr_id,user,user_id,user_type,created_at,body


In [8]:
pr_reviews_df[pr_reviews_df['pr_id'] == 3231949586]

Unnamed: 0,id,pr_id,user,user_type,state,submitted_at,body
114,3019966388,3231949586,cursor[bot],Bot,COMMENTED,2025-07-15T11:51:49Z,<details open>\n<summary><h3>Bug: Generic Type...
115,3078120806,3231949586,cursor[bot],Bot,COMMENTED,2025-08-01T07:47:50Z,<details open>\n<summary><h3>Bug: GET Methods ...
116,3083336640,3231949586,cursor[bot],Bot,COMMENTED,2025-08-04T09:10:12Z,
117,3084119459,3231949586,wezell,User,APPROVED,2025-08-04T13:06:55Z,"If it builds and passes tests, lgtm"


In [9]:
print("Agentic PRs:")
pr_df.head()

Agentic PRs:


Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,...,total_churn,num_bot_users,num_human_users,num_total_users,num_comments,num_human_comments,num_bot_comments,num_reviews,num_human_reviews,num_bot_reviews
0,3264933329,2911,Fix: Wait for all partitions in load_collectio...,## Summary\n\nFixes an issue where `load_colle...,Claude_Code,108661493,weiliu1031,closed,2025-07-26T02:59:01Z,2025-07-29T07:01:20Z,...,396.0,0,2,2,2,2,0,0,0,0
1,3265118634,2,„Éï„Ç°„Ç§„É´„Éë„ÇπÂèÇÁÖß„ÇíÁõ∏ÂØæ„Éë„Çπ„Å´Áµ±‰∏Ä„Åó„ÄÅdoc/„Åã„Çâdocs/„Å´Áµ±‰∏Ä,## ËÉåÊôØ\n\nÁèæÂú®„ÄÅÊú¨„Éó„É≠„Ç∏„Çß„ÇØ„Éà„Å´„Åä„ÅÑ„Å¶‰ª•‰∏ã„ÅÆ„Éë„ÇπÊßãÊàê„ÅÆ‰∏çÊï¥Âêà„ÅåÁîü„Åò„Å¶„ÅÑ„Åæ„ÅôÔºö\n\n...,Claude_Code,61827001,cm-kojimat,closed,2025-07-26T04:56:55Z,2025-07-26T22:12:24Z,...,76.0,0,0,0,0,0,0,0,0,0
2,3265640341,30,Add build staleness detection for debug CLI,## Summary\r\n\r\n Implements comprehensive b...,Claude_Code,7475,MSch,closed,2025-07-26T13:31:19Z,2025-07-26T13:37:22Z,...,407.0,1,1,2,0,0,0,2,1,1
3,3265709660,205,feat: add comprehensive README screenshots wit...,## Type of Change\n\n- [ ] üêõ `bug` - Bug fix (...,Claude_Code,80381,sugyan,closed,2025-07-26T14:07:22Z,2025-07-26T14:45:30Z,...,300.0,1,0,1,0,0,0,1,0,1
4,3265782173,17625,chore: remove HashedPostStateProvider trait,## Summary\r\n\r\n#17545 \r\n\r\nRemove the un...,Claude_Code,47593288,adust09,open,2025-07-26T15:02:48Z,,...,221.0,0,0,0,0,0,0,0,0,0


In [10]:
print("Human PRs:")
human_pr_df.head()

Human PRs:


Unnamed: 0,id,number,title,user,user_id,state,created_at,closed_at,merged_at,repo_url,html_url,body,agent,accepted,rejected,turnaround_time
0,2336888723,85268,feat(aci): add automations index page,ameliahsu,55610339,closed,2025-02-14T19:04:59Z,2025-02-18T22:42:20Z,2025-02-18T22:42:19Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/85268,https://sentry-j41gpomr5.sentry.dev/automation...,Human,True,False,358641.0
1,2447123365,89131,ref(insights): Make use of `<FeatureBadge>` fo...,ryan953,187460,closed,2025-04-08T23:29:50Z,2025-04-09T15:56:55Z,2025-04-09T15:56:54Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/89131,Using the premade component reduces an import ...,Human,True,False,59225.0
2,2438086945,88748,:bug: fix: update how we fetch workflow_id and...,iamrajjoshi,33237075,closed,2025-04-03T21:36:59Z,2025-04-04T15:10:57Z,2025-04-04T15:10:57Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/88748,i realized i made a mistake for how i fetch th...,Human,True,False,63238.0
3,2265431531,83085,fix(org-stats): Require project membership,ArthurKnaus,7033940,closed,2025-01-08T07:47:13Z,2025-01-08T08:49:40Z,2025-01-08T08:49:40Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/83085,### Problem\r\n\r\nIf the user is not member o...,Human,True,False,3747.0
4,2332333882,85102,ref(consumers): Rename parallel -> batched-par...,evanpurkhiser,1421724,closed,2025-02-12T21:24:17Z,2025-02-12T22:20:33Z,2025-02-12T22:20:33Z,https://api.github.com/repos/getsentry/sentry,https://github.com/getsentry/sentry/pull/85102,Both crons and uptime consumers have a paralle...,Human,True,False,3376.0


## Filtering

We apply the following filters to the dataset:

- Closed (merged/rejected) PRs only
- At least one interaction that isn't by the author before PR decision

In [11]:
def df_filtering(pr_dataframe):
    """
    Filter PR dataframe.

    Parameters:
    - pr_dataframe: DataFrame containing pull request data

    Returns:
    - Filtered PR DataFrame
    """

    df = pr_dataframe.copy()
    
    # Get PR IDs that exist in this dataframe
    existing_pr_ids = df['id'].unique()

    # PRs with non-author reviews (filter to only PRs in this dataframe)
    relevant_reviews = pr_reviews_df[pr_reviews_df['pr_id'].isin(existing_pr_ids)]
    df_indexed = df.set_index('id')
    non_author_reviews = relevant_reviews[
        relevant_reviews['user'] != df_indexed.loc[relevant_reviews['pr_id'], 'user'].values
    ]['pr_id'].unique()

    # PRs with non-author comments (filter to only PRs in this dataframe)
    relevant_comments = pr_comments_df[pr_comments_df['pr_id'].isin(existing_pr_ids)]
    non_author_comments = relevant_comments[
        relevant_comments['user'] != df_indexed.loc[relevant_comments['pr_id'], 'user'].values
    ]['pr_id'].unique()

    # Union of PRs with non-author reviews and comments
    prs_with_non_author_interaction = set(
        non_author_reviews) | set(non_author_comments)

    # Filter df
    df = df[df['id'].isin(prs_with_non_author_interaction)]
    df = df[df['state'] == 'closed'] # Only closed PRs

    return df

# Filter dataframe
pr_df = df_filtering(pr_df)

In [12]:
print("Num. Agentic PRs:", pr_df.shape)

Num. Agentic PRs: (11962, 33)


In [13]:
pr_reviews_df.columns

Index(['id', 'pr_id', 'user', 'user_type', 'state', 'submitted_at', 'body'], dtype='object')

In [14]:
# Get the list of PR IDs from pr_df
filtered_pr_ids = pr_df['id'].unique()

# Filter all PR-related dataframes
pr_comments_df = pr_comments_df[pr_comments_df['pr_id'].isin(filtered_pr_ids)]
pr_reviews_df = pr_reviews_df[pr_reviews_df['pr_id'].isin(filtered_pr_ids)]
# pr_review_comments_df = pr_review_comments_df[pr_review_comments_df['pull_request_url'].str.extract(r'/(\d+)$')[0].astype(int).isin(filtered_pr_ids)]

filtered_review_ids = pr_reviews_df[pr_reviews_df['pr_id'].isin(filtered_pr_ids)]['id'].unique()
pr_review_comments_df = pr_review_comments_df[pr_review_comments_df['pull_request_review_id'].isin(filtered_review_ids)]


pr_commits_df = pr_commits_df[pr_commits_df['pr_id'].isin(filtered_pr_ids)]
pr_commit_details_df = pr_commit_details_df[pr_commit_details_df['pr_id'].isin(filtered_pr_ids)]
pr_timeline_df = pr_timeline_df[pr_timeline_df['pr_id'].isin(filtered_pr_ids)]
pr_task_type_df = pr_task_type_df[pr_task_type_df['id'].isin(filtered_pr_ids)]
related_issue_df = related_issue_df[related_issue_df['pr_id'].isin(filtered_pr_ids)]

# Get unique repo IDs and user IDs from filtered PRs
filtered_repo_ids = pr_df['repo_id'].unique()
filtered_user_ids = pr_df['user_id'].unique()

# Filter repo and user dataframes
repo_df = repo_df[repo_df['id'].isin(filtered_repo_ids)]
user_df = user_df[user_df['id'].isin(filtered_user_ids)]

# Get unique issue IDs from related_issue_df
filtered_issue_ids = related_issue_df['issue_id'].unique()
issue_df = issue_df[issue_df['id'].isin(filtered_issue_ids)]

print(f"Filtered PR count: {len(pr_df)}")
print(f"Filtered pr_comments_df: {len(pr_comments_df)}")
print(f"Filtered pr_reviews_df: {len(pr_reviews_df)}")
print(f"Filtered pr_review_comments_df: {len(pr_review_comments_df)}")
print(f"Filtered pr_commits_df: {len(pr_commits_df)}")
print(f"Filtered pr_commit_details_df: {len(pr_commit_details_df)}")
print(f"Filtered pr_timeline_df: {len(pr_timeline_df)}")
print(f"Filtered pr_task_type_df: {len(pr_task_type_df)}")
print(f"Filtered related_issue_df: {len(related_issue_df)}")
print(f"Filtered human_pr_df: {len(human_pr_df)}")
print(f"Filtered human_pr_task_type_df: {len(human_pr_task_type_df)}")
print(f"Filtered repo_df: {len(repo_df)}")
print(f"Filtered user_df: {len(user_df)}")
print(f"Filtered issue_df: {len(issue_df)}")

Filtered PR count: 11962
Filtered pr_comments_df: 32706
Filtered pr_reviews_df: 24853
Filtered pr_review_comments_df: 22332
Filtered pr_commits_df: 51430
Filtered pr_commit_details_df: 377263
Filtered pr_timeline_df: 193105
Filtered pr_task_type_df: 11962
Filtered related_issue_df: 3220
Filtered human_pr_df: 6618
Filtered human_pr_task_type_df: 6618
Filtered repo_df: 1623
Filtered user_df: 940
Filtered issue_df: 3084


## Save Filtered Dataset to CSV

In [15]:
save_dataframes_to_csv("filtered")

## Check Filtered Data

In [16]:
test_pr_df = pd.read_csv("./data/filtered/pull_request.csv")

print("Number of rows in test_pr_df:", len(test_pr_df))

test_pr_df.head()

Number of rows in test_pr_df: 11962


Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,...,total_churn,num_bot_users,num_human_users,num_total_users,num_comments,num_human_comments,num_bot_comments,num_reviews,num_human_reviews,num_bot_reviews
0,3264933329,2911,Fix: Wait for all partitions in load_collectio...,## Summary\n\nFixes an issue where `load_colle...,Claude_Code,108661493,weiliu1031,closed,2025-07-26T02:59:01Z,2025-07-29T07:01:20Z,...,396.0,0,2,2,2,2,0,0,0,0
1,3265640341,30,Add build staleness detection for debug CLI,## Summary\r\n\r\n Implements comprehensive b...,Claude_Code,7475,MSch,closed,2025-07-26T13:31:19Z,2025-07-26T13:37:22Z,...,407.0,1,1,2,0,0,0,2,1,1
2,3265709660,205,feat: add comprehensive README screenshots wit...,## Type of Change\n\n- [ ] üêõ `bug` - Bug fix (...,Claude_Code,80381,sugyan,closed,2025-07-26T14:07:22Z,2025-07-26T14:45:30Z,...,300.0,1,0,1,0,0,0,1,0,1
3,3214555104,16658,Add function signature breaking change detector,<details><summary>&#x1F6E0 DevTools &#x1F6E0</...,Claude_Code,17039389,harupy,closed,2025-07-09T05:35:26Z,2025-07-11T05:13:35Z,...,620.0,0,2,2,3,3,0,8,8,0
4,3214724259,5489,feat: add comprehensive test coverage for form...,## Summary\n\nThis PR enhances the forms plugi...,Claude_Code,82053242,wtfsayo,closed,2025-07-09T06:43:46Z,2025-07-09T06:44:02Z,...,1353.0,3,0,3,2,0,2,1,0,1


In [17]:
test_pr_review_comments_df = pd.read_csv("./data/filtered/pr_review_comments.csv")

print("Number of rows in test_pr_review_comments_df:", len(test_pr_review_comments_df))

test_pr_df.head()

Number of rows in test_pr_review_comments_df: 22332


Unnamed: 0,id,number,title,body,agent,user_id,user,state,created_at,closed_at,...,total_churn,num_bot_users,num_human_users,num_total_users,num_comments,num_human_comments,num_bot_comments,num_reviews,num_human_reviews,num_bot_reviews
0,3264933329,2911,Fix: Wait for all partitions in load_collectio...,## Summary\n\nFixes an issue where `load_colle...,Claude_Code,108661493,weiliu1031,closed,2025-07-26T02:59:01Z,2025-07-29T07:01:20Z,...,396.0,0,2,2,2,2,0,0,0,0
1,3265640341,30,Add build staleness detection for debug CLI,## Summary\r\n\r\n Implements comprehensive b...,Claude_Code,7475,MSch,closed,2025-07-26T13:31:19Z,2025-07-26T13:37:22Z,...,407.0,1,1,2,0,0,0,2,1,1
2,3265709660,205,feat: add comprehensive README screenshots wit...,## Type of Change\n\n- [ ] üêõ `bug` - Bug fix (...,Claude_Code,80381,sugyan,closed,2025-07-26T14:07:22Z,2025-07-26T14:45:30Z,...,300.0,1,0,1,0,0,0,1,0,1
3,3214555104,16658,Add function signature breaking change detector,<details><summary>&#x1F6E0 DevTools &#x1F6E0</...,Claude_Code,17039389,harupy,closed,2025-07-09T05:35:26Z,2025-07-11T05:13:35Z,...,620.0,0,2,2,3,3,0,8,8,0
4,3214724259,5489,feat: add comprehensive test coverage for form...,## Summary\n\nThis PR enhances the forms plugi...,Claude_Code,82053242,wtfsayo,closed,2025-07-09T06:43:46Z,2025-07-09T06:44:02Z,...,1353.0,3,0,3,2,0,2,1,0,1


In [18]:
test_pr_df.columns

Index(['id', 'number', 'title', 'body', 'agent', 'user_id', 'user', 'state',
       'created_at', 'closed_at', 'merged_at', 'repo_id', 'repo_url',
       'html_url', 'accepted', 'rejected', 'turnaround_time', 'related_issue',
       'num_files_changed', 'touches_test_file', 'lines_added',
       'lines_deleted', 'net_churn', 'total_churn', 'num_bot_users',
       'num_human_users', 'num_total_users', 'num_comments',
       'num_human_comments', 'num_bot_comments', 'num_reviews',
       'num_human_reviews', 'num_bot_reviews'],
      dtype='object')

## Test Top N% Filtering Function

In [19]:
from utils import filter_top_n_for_cols

exclusion_cols = ['num_total_users', 'num_comments',
                  'num_reviews', 'num_files_changed']
test_cleaned = filter_top_n_for_cols(
    test_pr_df, exclusion_cols, filter_percent=10)
print(f"Num. Rows in Cleaned DF = {len(test_cleaned)}")

PRs to exclude for num_total_users: 1573
PRs to exclude for num_comments: 1318
PRs to exclude for num_reviews: 1437
PRs to exclude for num_files_changed: 1206
Total Rows to Filter: 3442
Num. Rows in Cleaned DF = 8520


In [20]:
test_cleaned['num_total_users'].describe()

count    8520.000000
mean        2.032042
std         0.971011
min         1.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         4.000000
Name: num_total_users, dtype: float64

In [21]:
test_cleaned['num_reviews'].describe()

count    8520.000000
mean        0.792488
std         1.010168
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         4.000000
Name: num_reviews, dtype: float64

In [22]:
test_cleaned['num_comments'].describe()

count    8520.000000
mean        1.760211
std         1.250853
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         5.000000
Name: num_comments, dtype: float64