In [1]:
%load_ext autoreload
%autoreload 2

import sys
from tqdm.notebook import tqdm
from time import sleep
sys.path.append('../')
from src.combined import *

In [2]:
# Name of the project to analyse; it is interpolated into the input CSV paths below.
project = "sahara"

Load commits and reviews

In [3]:
# Commit-level table for the selected project.
df_commits = pd.read_csv(
    f'../data/commits/commits_{project}.csv'
)

In [4]:
# Review-level table for the selected project.
df_reviews = pd.read_csv(
    f'../data/reviews/reviews_{project}.csv'
)

Keep only commits with reviews

In [5]:
# Hashes present in the commit table but absent from the review table.
commits_without_reviews = (
    df_commits['hash']
    .loc[~df_commits['hash'].isin(df_reviews['hash'])]
    .values
)

In [6]:
# Number of commits that have no review at all.
commits_without_reviews.shape[0]

202

In [7]:
# Keep only the commits that have at least one row in the review table.
# The explicit .copy() detaches the filtered frame from the original, so the
# column assignment performed on it afterwards does not trigger pandas'
# SettingWithCopyWarning.
df_commits = df_commits.loc[df_commits['hash'].isin(df_reviews['hash'])].copy()

In [8]:
# Number of commits left after removing the unreviewed ones.
df_commits.shape[0]

3164

Re-format `prior_commits`

In [9]:
# Parse each `prior_commits` cell into a list of hashes and drop the hashes
# that belong to commits without reviews.
# The exclusion list is turned into a set once: testing membership against the
# NumPy array would rescan the whole array for every single hash.
unreviewed_hashes = set(commits_without_reviews)
# `assign` returns a new frame, which avoids writing a column into a frame that
# was itself obtained by filtering (SettingWithCopyWarning).
df_commits = df_commits.assign(
    prior_commits=df_commits['prior_commits'].apply(
        lambda x: [c for c in split_prior_commits(x) if c not in unreviewed_hashes]
    )
)

Join dataframes

In [10]:
# Left-join the reviews onto the commits by hash (one row per commit/review pair),
# then turn the hash index back into a regular column.
df = (
    df_commits.set_index('hash')
    .join(df_reviews.set_index('hash'), how='left')
    .reset_index()
)

In [11]:
# Number of rows in the joined table.
df.shape[0]

13934

In [12]:
# Sanity check: commits without reviews were removed before the join, so no row
# should be missing its `change_id`; the expected count is 0.
int(df['change_id'].isnull().sum())

0

Compute expertise for author and reviewers

In [13]:
# Default every expertise ratio to 0.0 (rows without prior commits keep this value).
# The columns are initialised as floats on purpose: the next cell writes fractional
# values into them one cell at a time, and storing a float in an integer column
# relies on implicit upcasting, which pandas deprecated in 2.1.
df['pct_prior_commits_author_authored'] = 0.0
df['pct_prior_commits_author_reviewed'] = 0.0
df['pct_prior_commits_reviewer_authored'] = 0.0
df['pct_prior_commits_reviewer_reviewed'] = 0.0

In [14]:
# For every (commit, reviewer) row, compute which share of the commit's prior commits
# was authored / reviewed by the row's author and by the row's reviewer.

# The four result columns receive fractional values below; make sure they are float
# so that no float is ever written into an integer column.
pct_cols = [
    'pct_prior_commits_author_authored',
    'pct_prior_commits_author_reviewed',
    'pct_prior_commits_reviewer_authored',
    'pct_prior_commits_reviewer_reviewed',
]
df = df.astype(dict.fromkeys(pct_cols, float))

# Build the lookups once instead of filtering both frames for every prior commit.
# hash -> author of the first matching commit row.
author_by_hash = {}
for commit_hash, commit_author in zip(df_commits['hash'], df_commits['author_name']):
    author_by_hash.setdefault(commit_hash, commit_author)
# hash -> reviewer of every review row for that commit, in file order.
reviewers_by_hash = {}
for commit_hash, review_author in zip(df_reviews['hash'], df_reviews['reviewer_name']):
    reviewers_by_hash.setdefault(commit_hash, []).append(review_author)

with tqdm(total = len(df)) as pbar:
    for idx, row in df.iterrows():
        pbar.set_postfix({'hash' : row['hash'], 'reviewer' : row['reviewer_name']})
        prior_commits = row['prior_commits']
        # Rows without prior commits keep the default of 0 in all four columns.
        if len(prior_commits) > 0:
            n_prior = len(prior_commits)
            # How many prior commits each person authored.
            # A prior hash missing from df_commits raises KeyError here (the hash is
            # part of the message), so a broken dataset still fails loudly.
            prior_authors_count = Counter(
                author_by_hash[prior_commit] for prior_commit in prior_commits
            )
            # How many review rows each person has on the prior commits.
            prior_reviewers_count = Counter(
                prior_reviewer
                for prior_commit in prior_commits
                for prior_reviewer in reviewers_by_hash.get(prior_commit, ())
            )
            # Calculate percentages for the author
            author = row['author_name']
            pct_prior_commits_author_authored = prior_authors_count[author] / n_prior
            assert 0 <= pct_prior_commits_author_authored <= 1
            pct_prior_commits_author_reviewed = prior_reviewers_count[author] / n_prior
            assert 0 <= pct_prior_commits_author_reviewed <= 1
            # Calculate percentages for the reviewer
            reviewer = row['reviewer_name']
            pct_prior_commits_reviewer_authored = prior_authors_count[reviewer] / n_prior
            assert 0 <= pct_prior_commits_reviewer_authored <= 1
            pct_prior_commits_reviewer_reviewed = prior_reviewers_count[reviewer] / n_prior
            assert 0 <= pct_prior_commits_reviewer_reviewed <= 1
            # Update values
            df.at[idx, 'pct_prior_commits_author_authored'] = pct_prior_commits_author_authored
            df.at[idx, 'pct_prior_commits_author_reviewed'] = pct_prior_commits_author_reviewed
            df.at[idx, 'pct_prior_commits_reviewer_authored'] = pct_prior_commits_reviewer_authored
            df.at[idx, 'pct_prior_commits_reviewer_reviewed'] = pct_prior_commits_reviewer_reviewed
        # Progress bar (no artificial sleep: it only added idle time to every row)
        pbar.update(1)

  0%|          | 0/13934 [00:00<?, ?it/s]

In [15]:
# The list of prior commits was only needed to compute the expertise ratios.
df = df.drop(columns='prior_commits')

## Patches

In [16]:
# Tag every row with the project being analysed. Uses the notebook-level `project`
# parameter rather than a literal, so changing the project at the top of the
# notebook cannot leave rows labelled with the wrong name.
df['project'] = project

In [17]:
# Flag the author as experienced when more than 5% of the prior commits were
# authored (resp. reviewed) by them.
# NOTE(review): the reviewer flags computed in the next cell use a threshold of 0
# instead of 0.05 - confirm the difference is intended.
df['author_is_exp_author'] = df['pct_prior_commits_author_authored'].gt(0.05)
df['author_is_exp_reviewer'] = df['pct_prior_commits_author_reviewed'].gt(0.05)

In [19]:
# Flag the reviewer as experienced as soon as they authored (resp. reviewed)
# any of the prior commits, i.e. the ratio is strictly positive.
df['reviewer_is_exp_author'] = df['pct_prior_commits_reviewer_authored'].gt(0)
df['reviewer_is_exp_reviewer'] = df['pct_prior_commits_reviewer_reviewed'].gt(0)

In [20]:
# The raw ratios have been replaced by the boolean expertise flags; remove them.
pct_columns = [
    f'pct_prior_commits_{who}'
    for who in (
        'author_authored',
        'author_reviewed',
        'reviewer_authored',
        'reviewer_reviewed',
    )
]
df = df.drop(columns=pct_columns)

In [22]:
import json

In [23]:
# Load the per-project core developer lists (indexed by project name further down).
# The encoding is given explicitly because open() otherwise falls back to a
# platform-dependent default; json.load parses straight from the file object.
with open('../data/core_devs.json', 'r', encoding='utf-8') as j:
    CORE_DEVS = json.load(j)

In [24]:
# Mark each row whose author is listed among the core developers of the row's project.
# The column is built in a single pass and assigned at once, which
#  - avoids creating the column implicitly through per-cell `.at` writes,
#  - yields a proper boolean column, and
#  - no longer overwrites the notebook-level `project` variable as a loop side effect.
# An unknown project name still raises KeyError, as it did before.
df['author_is_core'] = [
    row_author in CORE_DEVS[row_project]
    for row_project, row_author in zip(df['project'], df['author_name'])
]

In [25]:
# Distribution of the core-author flag.
df.loc[:, 'author_is_core'].value_counts()

False    13595
True       339
Name: author_is_core, dtype: int64

In [26]:
# Distribution of the raw review votes.
df.loc[:, 'reviewer_vote'].value_counts()

 2    6807
 1    6345
-1     741
-2      41
Name: reviewer_vote, dtype: int64

In [27]:
# A vote counts as positive when it is strictly greater than zero.
df['pos_vote'] = df['reviewer_vote'].gt(0)

In [28]:
# Distribution of the positive-vote flag.
df.loc[:, 'pos_vote'].value_counts()

True     13152
False      782
Name: pos_vote, dtype: int64

In [29]:
# Columns holding True/False flags; they are inspected and then cast to 0/1 below.
binary_cols = (
    ['bug_fixing', 'fix_inducing']
    + ['author_is_core', 'reviewer_is_core']
    + ['pos_vote']
    + ['author_is_exp_author', 'author_is_exp_reviewer']
    + ['reviewer_is_exp_author', 'reviewer_is_exp_reviewer']
)

In [30]:
# Show the value distribution of every flag column before the cast to int.
for flag_col in binary_cols:
    flag_counts = df[flag_col].value_counts()
    print(flag_counts)

False    7317
True     6617
Name: bug_fixing, dtype: int64
True     7515
False    6419
Name: fix_inducing, dtype: int64
False    13595
True       339
Name: author_is_core, dtype: int64
False    12866
True      1068
Name: reviewer_is_core, dtype: int64
True     13152
False      782
Name: pos_vote, dtype: int64
False    12266
True      1668
Name: author_is_exp_author, dtype: int64
False    12190
True      1744
Name: author_is_exp_reviewer, dtype: int64
False    13520
True       414
Name: reviewer_is_exp_author, dtype: int64
False    10616
True      3318
Name: reviewer_is_exp_reviewer, dtype: int64


In [31]:
# Convert every flag column from True/False to 1/0 in a single astype call.
df = df.astype(dict.fromkeys(binary_cols, int))

In [32]:
# Show the value distribution of every flag column again, now encoded as 0/1.
for flag_col in binary_cols:
    flag_counts = df[flag_col].value_counts()
    print(flag_counts)

0    7317
1    6617
Name: bug_fixing, dtype: int64
1    7515
0    6419
Name: fix_inducing, dtype: int64
0    13595
1      339
Name: author_is_core, dtype: int64
0    12866
1     1068
Name: reviewer_is_core, dtype: int64
1    13152
0      782
Name: pos_vote, dtype: int64
0    12266
1     1668
Name: author_is_exp_author, dtype: int64
0    12190
1     1744
Name: author_is_exp_reviewer, dtype: int64
0    13520
1      414
Name: reviewer_is_exp_author, dtype: int64
0    10616
1     3318
Name: reviewer_is_exp_reviewer, dtype: int64


In [33]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_lines_added,13934.0,190.649778,1224.478776,0.0,4.0,19.0,82.0,29847.0
num_lines_deleted,13934.0,143.617052,2684.783651,0.0,1.0,4.0,19.75,204063.0
num_lines_of_code,13934.0,830.763026,1797.340454,0.0,121.0,328.0,825.0,30286.0
num_file_impacted,13934.0,5.55849,20.527672,0.0,1.0,2.0,5.0,796.0
num_dirs_impacted,13934.0,3.091144,5.201366,0.0,1.0,2.0,3.0,172.0
min_complexity,13934.0,14.612459,25.260812,0.0,0.0,5.0,18.0,262.0
mean_complexity,13934.0,25.17868,28.481969,0.0,0.0,18.75,37.5,262.0
max_complexity,13934.0,43.627673,49.969417,0.0,0.0,29.0,71.0,263.0
entropy,13934.0,0.461953,0.403031,0.0,0.0,0.588527,0.840954,1.0
bug_fixing,13934.0,0.474882,0.499387,0.0,0.0,0.0,1.0,1.0


In [34]:
# Save the un-normalised dataset. The file name is derived from the notebook-level
# `project` parameter, so running the notebook for another project cannot overwrite
# the sahara output.
df.to_csv(f'../data/rq1/rq1_{project}.csv', index=False)

In [35]:
# Numeric columns that are min-max scaled below.
num_cols = (
    ['min_complexity', 'mean_complexity', 'max_complexity']
    + ['num_prior_votes']
    + ['num_lines_added', 'num_lines_deleted', 'num_lines_of_code']
    + ['num_dirs_impacted', 'num_file_impacted']
    + ['description_length']
    + ['num_prior_commits', 'num_prior_comments']
    + ['num_prior_commits_bug_fixing', 'num_future_commits_bug_fixing']
    + ['avg_prior_age']
)

In [36]:
# Min-max scale every numeric column to the [0, 1] range.
for c in num_cols:
    col_min = df[c].min()
    col_range = df[c].max() - col_min
    shifted = df[c] - col_min
    # A constant column has a zero range: dividing would turn every value into
    # NaN (0/0). Keep the shifted values (all 0) instead; missing values stay missing.
    df[c] = shifted / col_range if col_range != 0 else shifted.astype(float)

In [37]:
# Summary statistics after scaling, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_lines_added,13934.0,0.006388,0.041025,0.0,0.000134,0.000637,0.002747,1.0
num_lines_deleted,13934.0,0.000704,0.013157,0.0,5e-06,2e-05,9.7e-05,1.0
num_lines_of_code,13934.0,0.027431,0.059346,0.0,0.003995,0.01083,0.02724,1.0
num_file_impacted,13934.0,0.006983,0.025789,0.0,0.001256,0.002513,0.006281,1.0
num_dirs_impacted,13934.0,0.017972,0.03024,0.0,0.005814,0.011628,0.017442,1.0
min_complexity,13934.0,0.055773,0.096415,0.0,0.0,0.019084,0.068702,1.0
mean_complexity,13934.0,0.096102,0.10871,0.0,0.0,0.071565,0.14313,1.0
max_complexity,13934.0,0.165885,0.189998,0.0,0.0,0.110266,0.269962,1.0
entropy,13934.0,0.461953,0.403031,0.0,0.0,0.588527,0.840954,1.0
bug_fixing,13934.0,0.474882,0.499387,0.0,0.0,0.0,1.0,1.0


In [38]:
# Save the normalised dataset. The file name is derived from the notebook-level
# `project` parameter, so running the notebook for another project cannot overwrite
# the sahara output.
df.to_csv(f'../data/rq1/rq1_{project}_norm.csv', index=False)