In [25]:
from tqdm.notebook import tqdm
from time import sleep
import numpy as np
import pandas as pd
import requests
import json

In [26]:
# Load the review-level table: one row per (patch, reviewer) pair.
df_reviews = pd.read_csv(filepath_or_buffer='../data/rq1/rq1_all.csv')

In [27]:
# Show which columns the loaded review table provides.
df_reviews.columns

Index(['hash', 'author_name', 'committer_name', 'author_date', 'commit_date',
       'num_lines_added', 'num_lines_deleted', 'num_lines_of_code',
       'num_file_impacted', 'num_dirs_impacted', 'min_complexity',
       'mean_complexity', 'max_complexity', 'entropy', 'bug_fixing',
       'description_length', 'num_prior_commits', 'avg_prior_age',
       'num_prior_commits_bug_fixing', 'num_future_commits_bug_fixing',
       'fix_inducing', 'reviewer_id', 'reviewer_name', 'reviewer_vote',
       'reviewer_is_core', 'num_prior_comments',
       'pct_prior_comments_by_reviewer', 'num_prior_votes',
       'pct_prior_pos_votes', 'pct_prior_neg_votes',
       'pct_prior_pos_votes_core', 'pct_prior_neg_votes_core', 'reviewer_freq',
       'change_id', 'project', 'author_is_exp_author',
       'author_is_exp_reviewer', 'reviewer_is_exp_author',
       'reviewer_is_exp_reviewer', 'author_is_core', 'pos_vote'],
      dtype='object')

In [28]:
# Number of review rows that were loaded.
df_reviews.shape[0]

107300

Columns

In [49]:
# Columns that identify a patch (commit hash, Gerrit change id, project name).
id_vars = ['hash', 'change_id', 'project']

In [50]:
# Name of the column used as the prediction target.
target_var = "fix_inducing"

In [51]:
# Patch-level explanatory variables, in the order they appear in the output file.
patch_vars = [
    # --- size and spread of the change ---
    'num_lines_added',
    'num_lines_deleted',
    'num_lines_of_code',             # number of lines in the files before the change
    'num_file_impacted',
    'num_dirs_impacted',
    'mean_complexity',               # average Cyclomatic Complexity
    'entropy',                       # dispersion of modified lines across files
    # --- purpose and description ---
    'bug_fixing',                    # whether or not the patch is for fixing bugs
    'description_length',
    # --- history of the touched lines ---
    'num_prior_commits',             # number of prior patches that impact the same modified lines
    'num_prior_commits_bug_fixing',
    'avg_prior_age',                 # average of time intervals (in days) between prior patches and this patch
    # --- author attributes ---
    'author_is_core',                # is author a core developer
    'author_is_exp_author',          # % of prior patches that were made by the same author > 0.05
    'author_is_exp_reviewer',
]

Select only the `id` and `patch` variables (plus the target), since their values are the same for every review row of a given patch

In [70]:
# Keep only the identifier, patch-level and target columns; copy so that later
# edits do not touch df_reviews.
df = df_reviews[[*id_vars, *patch_vars, target_var]].copy()

In [71]:
# Collapse identical rows (first occurrence wins) and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

In [72]:
# Number of distinct patch rows left after de-duplication.
df.shape[0]

22047

Set the threshold on `reviewer_freq` above which a review counts as a strong (high-frequency) relationship

In [37]:
# Deciles (0th, 10th, ..., 100th percentile) of reviewer_freq over all review rows.
decile_points = np.arange(0, 101, 10)
reviewer_freq_percentiles = np.percentile(df_reviews['reviewer_freq'], q=decile_points)

In [38]:
# Display the deciles so a threshold can be picked by eye.
reviewer_freq_percentiles

array([0.        , 0.        , 0.        , 0.        , 0.02222222,
       0.09090909, 0.17241379, 0.27118644, 0.41176471, 0.63636364,
       1.        ])

In [39]:
# A review counts as "high frequency" when reviewer_freq >= this value.
# 0.40 sits just below the 80th percentile (about 0.41) shown above.
high_freq_threshold = 0.4

In [40]:
def get_metrics(reviews, freq_threshold=None):
    """Aggregate the per-reviewer rows of one patch into patch-level metrics.

    Parameters
    ----------
    reviews : pandas.DataFrame
        One row per reviewer. Must contain the columns 'pos_vote',
        'num_prior_comments', 'pct_prior_pos_votes', 'pct_prior_neg_votes',
        'pct_prior_pos_votes_core', 'author_is_core' and 'reviewer_freq'.
    freq_threshold : float, optional
        Minimum 'reviewer_freq' for a positive vote to count as "high
        frequency". When omitted, the notebook-level ``high_freq_threshold``
        is used.

    Returns
    -------
    dict
        'num_reviewers' : number of rows in ``reviews``.
        'num_comments'  : largest 'num_prior_comments' over all rows
                          (missing values are ignored).
        'pct_pos'       : share of rows with a truthy 'pos_vote'.
        'pct_pos_*'     : share of rows that have a truthy 'pos_vote' AND meet
                          the named condition. The denominator is always the
                          total number of rows, not the number of positive
                          votes.

    Raises
    ------
    ValueError
        If ``reviews`` has no rows (the shares would be undefined).
    """
    if freq_threshold is None:
        # Fall back to the notebook-wide setting only when the caller did not
        # choose a threshold explicitly.
        freq_threshold = high_freq_threshold

    num_reviewers = len(reviews)
    if num_reviewers == 0:
        raise ValueError('get_metrics() needs at least one review row')

    # Truthiness of pos_vote decides whether a row is a positive vote.
    voted_pos = reviews['pos_vote'].astype(bool).to_numpy()

    def share(condition):
        # Fraction of ALL rows that are positive votes and satisfy `condition`.
        return int((voted_pos & condition.to_numpy()).sum()) / num_reviewers

    return {
        'num_reviewers' : num_reviewers,
        # Taken over every row, including the non-positive votes.
        'num_comments' : reviews['num_prior_comments'].max(),
        'pct_pos' : int(voted_pos.sum()) / num_reviewers,
        'pct_pos_prior_pos' : share(reviews['pct_prior_pos_votes'] > 0),
        'pct_pos_prior_neg' : share(reviews['pct_prior_neg_votes'] > 0),
        'pct_pos_prior_no_comment' : share(reviews['num_prior_comments'] == 0),
        'pct_pos_prior_pos_core' : share(reviews['pct_prior_pos_votes_core'] > 0),
        'pct_pos_author_core' : share(reviews['author_is_core'] == 1),
        'pct_pos_high_freq' : share(reviews['reviewer_freq'] >= freq_threshold)
    }

In [41]:
# NOTE(review): this import sits mid-notebook; it would be easier to find in the
# import cell at the top.
from collections import Counter
# Tally of the 'status' values returned by the server; updated by get_merged().
all_statuses = Counter()

In [42]:
def get_merged(project, change_id, timeout=30):
    """Ask the OpenDev Gerrit server whether a change on ``master`` was merged.

    Parameters
    ----------
    project : str
        Project name; it is looked up as ``openstack/<project>``.
    change_id : str
        Gerrit Change-Id of the patch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    bool
        True only when the server reports status 'MERGED'. False when the
        change is unknown (HTTP 404), when the reply carries no 'status'
        field, or when the status is anything else.

    Raises
    ------
    requests.HTTPError
        For HTTP error responses other than 404 (for example rate limiting or
        a server error), which say nothing about the change itself.

    Side effects
    ------------
    Increments the notebook-level ``all_statuses`` counter for every status
    that was successfully read, and prints a note for the two False-by-default
    cases.
    """
    get_prefix = f'https://review.opendev.org/changes/openstack%2F{project}~master~'
    get_addr = get_prefix + change_id
    # Without a timeout a single stalled connection would block the whole crawl.
    response = requests.get(get_addr, timeout=timeout)
    if response.status_code == 404:
        print("---change id {} had invalid response".format(change_id))
        return False
    # Any other error status is a transport problem, not a verdict on the
    # change: fail loudly instead of trying to parse an error page as JSON.
    response.raise_for_status()

    payload = response.text
    # Gerrit prepends the magic line )]}' to JSON bodies; drop it when present.
    xssi_prefix = ")]}'"
    if payload.startswith(xssi_prefix):
        payload = payload[len(xssi_prefix):]
    response_json = json.loads(payload)

    if 'status' not in response_json:
        print("---change id {} had no status in response".format(change_id))
        return False
    status = response_json['status']
    all_statuses[status] += 1
    return status == 'MERGED'

In [43]:
# One metrics dict per merged patch is appended here by the crawl loop.
all_metrics = list()

In [44]:
# Counts the patches for which get_merged() returned False.
num_not_merged = 0

In [45]:
# Start from clean accumulators so that re-running this cell does not append
# to, or double-count, the results of a previous run.
all_metrics = []
num_not_merged = 0
all_statuses.clear()

# Group the reviewer rows by patch once, instead of scanning the whole of
# df_reviews again for every patch inside the loop.
reviews_by_hash = df_reviews.groupby('hash', sort=False)

with tqdm(total = len(df)) as pbar:
    for i, row in df.iterrows():
        hash_id = row['hash']
        pbar.set_postfix({'hash' : hash_id})
        # Only patches the server reports as merged contribute metrics.
        merged = get_merged(row['project'], row['change_id'])
        if not merged:
            num_not_merged += 1
        else:
            reviews = reviews_by_hash.get_group(hash_id)
            metrics = get_metrics(reviews)
            metrics['hash'] = hash_id
            all_metrics.append(metrics)
        # Progress bar
        pbar.update(1)
        sleep(0.001)

  0%|          | 0/22047 [00:00<?, ?it/s]

In [46]:
# How many patches were skipped because get_merged() returned False.
print(num_not_merged)

18


In [47]:
# Breakdown of the statuses the server returned during the crawl.
all_statuses

Counter({'MERGED': 22029, 'ABANDONED': 18})

In [48]:
# Turn the list of per-patch metric dicts into a table (one row per merged patch).
df_metrics = pd.DataFrame(data=all_metrics)

Merge the review metrics with the patch-level variables

In [73]:
# Attach the patch-level columns to the review metrics, matching on commit hash.
# The inner join keeps only hashes present in both tables.
# NOTE(review): this overwrites df, so the cell cannot be re-run without first
# re-running the cells that build df.
df = (
    df_metrics
    .set_index('hash')
    .join(other=df.set_index('hash'), how='inner')
)

In [74]:
# Move 'hash' back from the index into an ordinary column.
df = df.reset_index(drop=False)

In [75]:
# Row count after the join.
df.shape[0]

22029

In [76]:
# Review-activity metrics produced by get_metrics().
review_vars = ['num_reviewers', 'num_comments', 'pct_pos']

In [77]:
# Metrics from get_metrics() that qualify positive votes by reviewer/author context.
social_vars = [
    'pct_pos_prior_pos', 'pct_pos_prior_neg', 'pct_pos_prior_no_comment',
    'pct_pos_prior_pos_core', 'pct_pos_author_core', 'pct_pos_high_freq',
]

In [79]:
# Fix the column order of the final table: ids, patch, review, social, target.
df = df[[*id_vars, *patch_vars, *review_vars, *social_vars, target_var]]

In [80]:
# Confirm the final column set and order before writing the file.
df.columns

Index(['hash', 'change_id', 'project', 'num_lines_added', 'num_lines_deleted',
       'num_lines_of_code', 'num_file_impacted', 'num_dirs_impacted',
       'mean_complexity', 'entropy', 'bug_fixing', 'description_length',
       'num_prior_commits', 'num_prior_commits_bug_fixing', 'avg_prior_age',
       'author_is_core', 'author_is_exp_author', 'author_is_exp_reviewer',
       'num_reviewers', 'num_comments', 'pct_pos', 'pct_pos_prior_pos',
       'pct_pos_prior_neg', 'pct_pos_prior_no_comment',
       'pct_pos_prior_pos_core', 'pct_pos_author_core', 'pct_pos_high_freq',
       'fix_inducing'],
      dtype='object')

In [81]:
# Save the un-normalised table, without the row index.
df.to_csv('../data/rq2/rq2_all.csv', index=False)

In [82]:
# Columns that are rescaled to [0, 1] before the normalised file is written.
num_cols = [
    'num_lines_added', 'num_lines_deleted', 'num_lines_of_code',
    'num_file_impacted', 'num_dirs_impacted',
    'mean_complexity', 'entropy', 'description_length',
    'num_prior_commits', 'num_prior_commits_bug_fixing', 'avg_prior_age',
    'num_reviewers', 'num_comments',
]

In [83]:
# Min-max scale every numeric column to the range [0, 1].
for c in num_cols:
    col_min = df[c].min()
    col_range = df[c].max() - col_min
    if col_range == 0:
        # A constant column would otherwise become 0/0 = NaN in every row;
        # map it to 0.0 instead.
        df[c] = 0.0
    else:
        df[c] = (df[c] - col_min) / col_range

In [84]:
# Save the normalised table, without the row index.
df.to_csv(path_or_buf='../data/rq2/rq2_all_norm.csv', index=False)