In [1]:
import json
from collections import Counter
from time import sleep

import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm

In [2]:
# Load the RQ1 review table; later cells select its rows by `hash` to build per-patch metrics.
df_reviews = pd.read_csv('../data/rq1/rq1_sahara.csv')

In [3]:
# List the columns available in the review table.
df_reviews.columns

Index(['hash', 'author_name', 'committer_name', 'author_date', 'commit_date',
       'num_lines_added', 'num_lines_deleted', 'num_lines_of_code',
       'num_file_impacted', 'num_dirs_impacted', 'min_complexity',
       'mean_complexity', 'max_complexity', 'entropy', 'bug_fixing',
       'description_length', 'num_prior_commits', 'avg_prior_age',
       'num_prior_commits_bug_fixing', 'num_future_commits_bug_fixing',
       'fix_inducing', 'reviewer_id', 'reviewer_name', 'reviewer_vote',
       'reviewer_is_core', 'num_prior_comments',
       'pct_prior_comments_by_reviewer', 'num_prior_votes',
       'pct_prior_pos_votes', 'pct_prior_neg_votes',
       'pct_prior_pos_votes_core', 'pct_prior_neg_votes_core', 'reviewer_freq',
       'change_id', 'project', 'author_is_exp_author',
       'author_is_exp_reviewer', 'reviewer_is_exp_author',
       'reviewer_is_exp_reviewer', 'author_is_core', 'pos_vote'],
      dtype='object')

In [4]:
# Number of rows in the review table.
len(df_reviews)

13934

Columns

In [5]:
# Identifier columns carried through to the exported dataset.
id_vars = ['hash', 'change_id', 'project']

In [6]:
# Label column; it is selected together with the id and patch columns and placed last in the export.
target_var = 'fix_inducing'

In [7]:
# Patch-level feature columns, in the order they appear in the exported dataset.
patch_vars = [
    'num_lines_added',
    'num_lines_deleted',
    # number of lines in the files before the change
    'num_lines_of_code',
    'num_file_impacted',
    'num_dirs_impacted',
    # average Cyclomatic Complexity
    'mean_complexity',
    # dispersion of modified lines across files
    'entropy',
    # whether or not the patch is for fixing bugs
    'bug_fixing',
    'description_length',
    # number of prior patches that impact the same modified lines
    'num_prior_commits',
    'num_prior_commits_bug_fixing',
    # average of time intervals (in days) between prior patches and this patch
    'avg_prior_age',
    # is author a core developer
    'author_is_core',
    # % of prior patches that were made by the same author > 0.05
    'author_is_exp_author',
    'author_is_exp_reviewer',
]

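Select only the `id` and `patch` variables (plus the target): their values are the same on every review row of a patch, so dropping duplicate rows afterwards leaves one row per patch.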

In [8]:
# Keep the identifier, patch-feature and label columns as an independent copy.
patch_level_columns = [*id_vars, *patch_vars, target_var]
df = df_reviews.loc[:, patch_level_columns].copy()

In [9]:
# Collapse repeated rows (the first occurrence is kept by default) and renumber the index from 0.
df = df.drop_duplicates(ignore_index=True)

In [10]:
# Number of rows left after dropping duplicates.
len(df)

3164

Set strong relationship threshold

In [11]:
# 0th, 10th, ..., 100th percentile of reviewer_freq over all review rows.
decile_points = np.arange(0, 101, 10)
reviewer_freq_percentiles = np.percentile(df_reviews['reviewer_freq'], q=decile_points)

In [12]:
# Show the computed percentile values.
reviewer_freq_percentiles

array([0.        , 0.        , 0.02543891, 0.11426654, 0.22222222,
       0.33333333, 0.48717949, 0.66666667, 0.83333333, 0.97727273,
       1.        ])

In [13]:
# Presumably a cut-off on reviewer_freq; 0.40 lies between the 50th and 60th percentile values computed above.
# NOTE(review): this name is not referenced anywhere else in the visible notebook - confirm it is still needed.
high_freq_threshold = 0.40 # from training

In [14]:
def get_metrics(reviews):
    """Aggregate the review rows of one patch into patch-level review metrics.

    Parameters
    ----------
    reviews : pandas.DataFrame
        One row per vote. When non-empty it must provide the columns
        ``pos_vote``, ``num_prior_comments``, ``num_prior_votes``,
        ``pct_prior_pos_votes``, ``pct_prior_neg_votes``,
        ``pct_prior_pos_votes_core`` and ``pct_prior_neg_votes_core``.

    Returns
    -------
    dict
        ``num_votes``
            Number of rows in ``reviews``.
        ``num_comments``
            Largest ``num_prior_comments`` over all rows; 0 when there is no
            value to take the maximum of (no rows, or only missing values).
        ``pct_pos``
            Share of rows whose ``pos_vote`` is truthy.
        ``pct_pos_prior_pos``, ``pct_pos_prior_neg``,
        ``pct_pos_prior_pos_core``, ``pct_pos_prior_neg_core``
            Among positive-vote rows, the share whose corresponding
            ``pct_prior_*`` column is greater than 0.
        ``avg_pos_prior_num_votes``, ``avg_pos_prior_num_comments``
            Sum of the strictly positive ``num_prior_votes`` /
            ``num_prior_comments`` values of positive-vote rows, divided by
            the number of positive-vote rows.

        Every key after ``num_comments`` is 0 when there are no positive votes.
    """
    num_votes = len(reviews)
    metrics = {
        'num_votes' : num_votes,
        'num_comments' : 0,
        'pct_pos' : 0,
        'pct_pos_prior_pos' : 0,
        'pct_pos_prior_neg' : 0,
        'avg_pos_prior_num_votes' : 0,
        'avg_pos_prior_num_comments' : 0,
        'pct_pos_prior_pos_core' : 0,
        'pct_pos_prior_neg_core' : 0
    }
    if num_votes == 0:
        # Nothing to aggregate; report 0 comments rather than a -inf sentinel,
        # which would corrupt any later scaling of this column.
        return metrics

    max_comments = reviews['num_prior_comments'].max()  # skips missing values
    if pd.notna(max_comments):
        metrics['num_comments'] = max_comments

    # Same truthiness test as `if not row['pos_vote']: continue`.
    positive = reviews[reviews['pos_vote'].astype(bool)]
    num_pos_votes = len(positive)
    if num_pos_votes == 0:
        return metrics

    def share_with_positive(column):
        """Fraction of positive-vote rows where `column` is > 0."""
        return int((positive[column] > 0).sum()) / num_pos_votes

    def mean_of_positive(column):
        """Sum of the > 0 values of `column`, averaged over all positive-vote rows."""
        values = positive[column]
        return values[values > 0].sum() / num_pos_votes

    metrics['pct_pos'] = num_pos_votes / num_votes
    metrics['pct_pos_prior_pos'] = share_with_positive('pct_prior_pos_votes')
    metrics['pct_pos_prior_neg'] = share_with_positive('pct_prior_neg_votes')
    metrics['avg_pos_prior_num_votes'] = mean_of_positive('num_prior_votes')
    metrics['avg_pos_prior_num_comments'] = mean_of_positive('num_prior_comments')
    metrics['pct_pos_prior_pos_core'] = share_with_positive('pct_prior_pos_votes_core')
    metrics['pct_pos_prior_neg_core'] = share_with_positive('pct_prior_neg_votes_core')
    return metrics

In [15]:
# Tally of the change statuses seen so far; get_merged() increments it on every successful lookup.
all_statuses = Counter()

In [16]:
def get_merged(project, change_id):
    """Return True when review.opendev.org reports the change as MERGED.

    Requests ``/changes/openstack%2F<project>~master~<change_id>`` and, when a
    status is returned, records it in the module-level ``all_statuses``
    counter.

    Parameters
    ----------
    project : str
        Project name, appended after the ``openstack%2F`` prefix.
    change_id : str
        Change identifier, appended after ``~master~``.

    Returns
    -------
    bool
        True only when the returned status equals ``'MERGED'``. False when
        the status is anything else or when the change could not be looked up
        (error HTTP status, unparsable body, or no ``status`` field); a
        message is printed in each of the lookup-failure cases.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, or when the server does not answer within the
        timeout.
    """
    get_prefix = f'https://review.opendev.org/changes/openstack%2F{project}~master~'
    get_addr = get_prefix + change_id
    # Without a timeout a single stalled connection would hang the whole crawl.
    response = requests.get(get_addr, timeout=30)
    if response.status_code == 404:
        print("---change id {} had invalid response".format(change_id))
        return False
    if not response.ok:
        # Other error statuses carry no change JSON; report them instead of
        # failing inside json.loads with an unrelated-looking error.
        print("---change id {} had HTTP status {}".format(change_id, response.status_code))
        return False

    # The body is prefixed with ")]}'" (plus a newline); drop the guard only
    # when it is actually present. json.loads ignores the leading newline.
    body = response.text
    xssi_guard = ")]}'"
    if body.startswith(xssi_guard):
        body = body[len(xssi_guard):]
    try:
        response_json = json.loads(body)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print("---change id {} had unparsable response".format(change_id))
        return False

    if not isinstance(response_json, dict) or 'status' not in response_json:
        print("---change id {} had no status in response".format(change_id))
        return False
    status = response_json['status']
    all_statuses[status] += 1
    return status == 'MERGED'

In [17]:
# Accumulates one metrics dict (plus its 'hash') per merged patch.
all_metrics = []

In [18]:
# Counts the patches for which get_merged() returns False.
num_not_merged = 0

In [19]:
# Start from a clean slate: the loop always walks every row of df, so without
# this reset a second run of the cell would append duplicate metric rows and
# double-count the skipped patches and status tallies.
all_metrics = []
num_not_merged = 0
all_statuses.clear()

with tqdm(total = len(df)) as pbar:
    for _, row in df.iterrows():
        hash_id = row['hash']
        pbar.set_postfix({'hash' : hash_id})
        if get_merged(row['project'], row['change_id']):
            # All review rows that belong to this patch.
            reviews = df_reviews.loc[df_reviews['hash'] == hash_id]
            metrics = get_metrics(reviews)
            metrics['hash'] = hash_id
            all_metrics.append(metrics)
        else:
            num_not_merged += 1
        # Progress bar
        pbar.update(1)
        sleep(0.001)

  0%|          | 0/3164 [00:00<?, ?it/s]

In [34]:
# Number of patches skipped because get_merged() returned False.
print(num_not_merged)

0


In [21]:
# Status tally accumulated by get_merged().
all_statuses

Counter({'MERGED': 3164})

In [22]:
# One row per collected metrics dict; the dict keys (including 'hash') become the columns.
df_metrics = pd.DataFrame(all_metrics)

Merge the per-patch review metrics with the patch-level features (joined on `hash`)

In [23]:
# Inner join on 'hash': only patches that have both review metrics and patch features survive.
metrics_by_hash = df_metrics.set_index('hash')
patches_by_hash = df.set_index('hash')
df = metrics_by_hash.join(patches_by_hash, how='inner')

In [24]:
# Turn the 'hash' index left by the join back into a regular column.
df = df.reset_index()

In [25]:
# Row count after the join.
len(df)

3164

In [26]:
# Review-level metrics produced by get_metrics() that describe the votes themselves.
review_vars = ['num_votes', 'num_comments', 'pct_pos']

In [27]:
# Metrics produced by get_metrics() that are computed over positive-vote rows only.
social_vars = [
    'pct_pos_prior_pos', 'pct_pos_prior_neg',
    'avg_pos_prior_num_votes', 'avg_pos_prior_num_comments',
    'pct_pos_prior_pos_core', 'pct_pos_prior_neg_core',
]

In [28]:
# Fix the export layout: ids, patch features, review metrics, social metrics, then the label.
export_columns = [*id_vars, *patch_vars, *review_vars, *social_vars, target_var]
df = df[export_columns]

In [29]:
# Confirm the final column set and order.
df.columns

Index(['hash', 'change_id', 'project', 'num_lines_added', 'num_lines_deleted',
       'num_lines_of_code', 'num_file_impacted', 'num_dirs_impacted',
       'mean_complexity', 'entropy', 'bug_fixing', 'description_length',
       'num_prior_commits', 'num_prior_commits_bug_fixing', 'avg_prior_age',
       'author_is_core', 'author_is_exp_author', 'author_is_exp_reviewer',
       'num_votes', 'num_comments', 'pct_pos', 'pct_pos_prior_pos',
       'pct_pos_prior_neg', 'avg_pos_prior_num_votes',
       'avg_pos_prior_num_comments', 'pct_pos_prior_pos_core',
       'pct_pos_prior_neg_core', 'fix_inducing'],
      dtype='object')

In [30]:
# Export the unscaled dataset. The path has no placeholders, so a plain string is used instead of an f-string.
df.to_csv('../data/rq2/rq2_sahara.csv', index = False)

In [31]:
# Columns that get min-max scaled; the flag columns (bug_fixing, author_is_*)
# and the pct_* columns are not listed here.
num_cols = [
    'num_lines_added', 'num_lines_deleted', 'num_lines_of_code',
    'num_file_impacted', 'num_dirs_impacted',
    'mean_complexity', 'entropy', 'description_length',
    'num_prior_commits', 'num_prior_commits_bug_fixing', 'avg_prior_age',
    'num_votes', 'num_comments',
    'avg_pos_prior_num_votes', 'avg_pos_prior_num_comments',
]

In [32]:
# Min-max scale every column listed in num_cols to the range [0, 1].
# Work on an explicit copy: df was produced by column selection, and assigning
# into such a selection can raise pandas' SettingWithCopyWarning.
df = df.copy()
for c in num_cols:
    col_min = df[c].min()
    col_range = df[c].max() - col_min
    if col_range == 0:
        # Constant column: the plain formula would compute 0 / 0 and fill the
        # column with NaN, so map it to 0.0 instead.
        df[c] = 0.0
    else:
        df[c] = (df[c] - col_min) / col_range

In [33]:
# Save the frame after the min-max scaling step, in the same directory as the unscaled export.
df.to_csv('../data/rq2/rq2_sahara_norm.csv', index=False)