In [1]:
import json
import pandas as pd

In [2]:
# Accumulator for the per-project DataFrames loaded in the next cell.
dfs = list()

In [3]:
# Load each project's RQ1 commit data and tag every row with its project name.
# `dfs` is rebound here (rather than appended to) so the cell is idempotent:
# re-running it no longer stacks duplicate copies of each project into `dfs`.
dfs = [
    pd.read_csv(f'../data/rq1/rq1_{project}.csv').assign(project=project)
    for project in ['cinder', 'glance', 'neutron']
]

In [4]:
# Stack the per-project frames row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [5]:
# Number of rows in the combined frame.
df.shape[0]

107213

## Patch - fill in NaNs

In [6]:
# Per-column count of missing values, before any filling is applied.
df.isna().sum(axis=0)

hash                                       0
author_name                                0
committer_name                             0
author_date                                0
commit_date                                0
num_lines_added                            0
num_lines_deleted                          0
num_lines_of_code                          0
num_file_impacted                          0
num_dirs_impacted                          0
min_complexity                             0
mean_complexity                            0
max_complexity                             0
entropy                                    0
bug_fixing                                 0
description_length                         0
num_prior_commits                          0
avg_prior_age                              0
num_prior_commits_bug_fixing               0
num_future_commits_bug_fixing              0
fix_inducing                               0
reviewer_id                                0
reviewer_n

In [7]:
# Replace every missing value in the frame with 0.
df = df.fillna(value=0)

## Patch - derive binary experience flags

In [8]:
# Author experience flags: True when the author's share of prior commits
# (authored / reviewed) is strictly above 0.05.
# NOTE(review): the reviewer flags in the following cell use a threshold of 0
# rather than 0.05 — confirm the asymmetry is intentional.
df['author_is_exp_author'] = df['pct_prior_commits_author_authored'].gt(0.05)
df['author_is_exp_reviewer'] = df['pct_prior_commits_author_reviewed'].gt(0.05)

In [9]:
# Reviewer experience flags: True when the reviewer's share of prior commits
# (authored / reviewed) is strictly positive.
df['reviewer_is_exp_author'] = df['pct_prior_commits_reviewer_authored'].gt(0)
df['reviewer_is_exp_reviewer'] = df['pct_prior_commits_reviewer_reviewed'].gt(0)

In [10]:
# The raw percentage columns have been converted into boolean experience
# flags above, so they are removed from the frame.
experience_pct_cols = [
    'pct_prior_commits_author_authored',
    'pct_prior_commits_author_reviewed',
    'pct_prior_commits_reviewer_authored',
    'pct_prior_commits_reviewer_reviewed',
]
df = df.drop(columns=experience_pct_cols)

## Patch - fill in `author_is_core`

In [11]:
# Read the core-developer roster from disk and parse it as JSON.
with open('../data/core_devs.json', 'r') as core_devs_file:
    CORE_DEVS = json.load(core_devs_file)

In [12]:
# Display the loaded roster: a dict mapping project name -> list of core developer names.
CORE_DEVS

{'cinder': ['Brian Rosmaita',
  'Eric Harney',
  'Gorka Eguileor',
  'Ivan Kolodyazhny',
  'Jay Bryant',
  'Lucio Seki',
  'Rajat Dhasmana',
  'Sean McGinnis',
  'Hemna'],
 'glance': ['Abhishek Kekane',
  'Brian Rosmaita',
  'Dan Smith',
  'Glance Bot',
  'Nikhil Komawar',
  'Sean McGinnis'],
 'neutron': ['Akihiro Motoki',
  'Brian Haley',
  'Hongbin Lu',
  'Jakub Libosvar',
  'Lajos Katona',
  'Miguel Lavalle',
  'Nate Johnston',
  'Oleg Bondarev',
  'Rodolfo Alonso',
  'Slawek Kaplonski',
  'YAMAMOTO Takashi',
  'LIU Yulong'],
 'nova': ['Alex Xu',
  'Balazs Gibizer',
  'Dan Smith',
  'Eric Fried',
  'Ghanshyam',
  'John Garbutt',
  'Lee Yarwood',
  'stephenfin',
  'Sylvain Bauza',
  'melwitt']}

In [13]:
# Flag each commit whose author is a core developer of that commit's project.
#
# This is computed in a single pass over the two relevant columns instead of
# row by row with `iterrows()` + `df.at`, which is very slow on a frame of this
# size. Assigning the whole column at once also yields a proper boolean column
# (and creates it even when the frame is empty).
core_devs_by_project = {name: set(devs) for name, devs in CORE_DEVS.items()}
df['author_is_core'] = [
    # Raises KeyError if a row's project has no entry in CORE_DEVS.
    author in core_devs_by_project[project]
    for project, author in zip(df['project'], df['author_name'])
]

In [14]:
# Distribution of the core-author flag.
df.loc[:, 'author_is_core'].value_counts()

False    94843
True     12370
Name: author_is_core, dtype: int64

## Target variable

In [15]:
# Distribution of the raw reviewer vote values.
df.loc[:, 'reviewer_vote'].value_counts()

 2    47888
 1    47795
-1    10801
-2      729
Name: reviewer_vote, dtype: int64

In [16]:
# Binary target: True for a strictly positive reviewer vote, False otherwise.
df['pos_vote'] = df['reviewer_vote'].gt(0)

In [17]:
# Class balance of the binary target.
df.loc[:, 'pos_vote'].value_counts()

True     95683
False    11530
Name: pos_vote, dtype: int64

## Make binary variables 1/0

In [18]:
# Columns holding True/False flags that are converted to 1/0 below.
binary_cols = [
    'bug_fixing',
    'fix_inducing',
    'author_is_core',
    'reviewer_is_core',
    'pos_vote',
    'author_is_exp_author',
    'author_is_exp_reviewer',
    'reviewer_is_exp_author',
    'reviewer_is_exp_reviewer',
]

In [19]:
# Value distribution of every flag column before the 1/0 conversion.
for flag_col in binary_cols:
    flag_counts = df[flag_col].value_counts()
    print(flag_counts)

True     71377
False    35836
Name: bug_fixing, dtype: int64
True     65950
False    41263
Name: fix_inducing, dtype: int64
False    94843
True     12370
Name: author_is_core, dtype: int64
False    84248
True     22965
Name: reviewer_is_core, dtype: int64
True     95683
False    11530
Name: pos_vote, dtype: int64
False    81080
True     26133
Name: author_is_exp_author, dtype: int64
False    85834
True     21379
Name: author_is_exp_reviewer, dtype: int64
False    100414
True       6799
Name: reviewer_is_exp_author, dtype: int64
False    67853
True     39360
Name: reviewer_is_exp_reviewer, dtype: int64


In [20]:
# Convert every flag column to an integer dtype (True -> 1, False -> 0) in one call.
df = df.astype({flag_col: int for flag_col in binary_cols})

In [21]:
# Value distribution of every flag column after the 1/0 conversion;
# the counts should match those printed before the cast.
for flag_col in binary_cols:
    flag_counts = df[flag_col].value_counts()
    print(flag_counts)

1    71377
0    35836
Name: bug_fixing, dtype: int64
1    65950
0    41263
Name: fix_inducing, dtype: int64
0    94843
1    12370
Name: author_is_core, dtype: int64
0    84248
1    22965
Name: reviewer_is_core, dtype: int64
1    95683
0    11530
Name: pos_vote, dtype: int64
0    81080
1    26133
Name: author_is_exp_author, dtype: int64
0    85834
1    21379
Name: author_is_exp_reviewer, dtype: int64
0    100414
1      6799
Name: reviewer_is_exp_author, dtype: int64
0    67853
1    39360
Name: reviewer_is_exp_reviewer, dtype: int64


## Export

In [22]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_lines_added,107213.0,134.443808,885.006429,0.0,5.0,25.0,88.0,110133.0
num_lines_deleted,107213.0,127.581375,7030.815267,0.0,1.0,6.0,27.0,1059841.0
num_lines_of_code,107213.0,2464.355367,7445.052297,0.0,321.0,1044.0,2767.0,840667.0
num_file_impacted,107213.0,5.174186,16.671576,0.0,1.0,2.0,5.0,1361.0
num_dirs_impacted,107213.0,3.25163,5.724486,0.0,1.0,2.0,3.0,319.0
min_complexity,107213.0,48.61403,78.599338,0.0,3.0,19.0,61.0,974.0
mean_complexity,107213.0,81.679091,91.216767,0.0,19.333333,56.818182,112.875,987.0
max_complexity,107213.0,136.948905,148.12181,0.0,26.0,94.0,197.0,1000.0
entropy,107213.0,0.542821,0.386003,0.0,0.0,0.694446,0.875624,1.0
bug_fixing,107213.0,0.665749,0.47173,0.0,0.0,1.0,1.0,1.0


In [23]:
# Write the combined, un-normalized dataset to disk without the index column.
df.to_csv(path_or_buf='../data/rq1/rq1_all.csv', index=False)

## Normalize numerical columns

In [24]:
# Numeric columns that are min-max normalized below.
num_cols = [
    'min_complexity', 'mean_complexity', 'max_complexity',
    'num_prior_votes',
    'num_lines_added', 'num_lines_deleted', 'num_lines_of_code',
    'num_dirs_impacted', 'num_file_impacted',
    'description_length',
    'num_prior_commits',
    'num_prior_commits_bug_fixing', 'num_future_commits_bug_fixing',
    'avg_prior_age',
]

In [25]:
# Min-max normalize each numeric column into the [0, 1] range.
for c in num_cols:
    col_min = df[c].min()
    col_range = df[c].max() - col_min
    if col_range == 0:
        # A constant column has max == min; dividing would produce 0/0 and
        # fill the whole column with NaN, so it is mapped to 0.0 instead.
        df[c] = 0.0
    else:
        df[c] = (df[c] - col_min) / col_range

## Export Normalized

In [26]:
# Summary statistics after normalization, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_lines_added,107213.0,0.001221,0.008036,0.0,4.539965e-05,0.000227,0.000799,1.0
num_lines_deleted,107213.0,0.00012,0.006634,0.0,9.435378e-07,6e-06,2.5e-05,1.0
num_lines_of_code,107213.0,0.002931,0.008856,0.0,0.0003818397,0.001242,0.003291,1.0
num_file_impacted,107213.0,0.003802,0.01225,0.0,0.0007347539,0.00147,0.003674,1.0
num_dirs_impacted,107213.0,0.010193,0.017945,0.0,0.003134796,0.00627,0.009404,1.0
min_complexity,107213.0,0.049912,0.080697,0.0,0.003080082,0.019507,0.062628,1.0
mean_complexity,107213.0,0.082755,0.092418,0.0,0.01958798,0.057567,0.114362,1.0
max_complexity,107213.0,0.136949,0.148122,0.0,0.026,0.094,0.197,1.0
entropy,107213.0,0.542821,0.386003,0.0,0.0,0.694446,0.875624,1.0
bug_fixing,107213.0,0.665749,0.47173,0.0,0.0,1.0,1.0,1.0


In [27]:
# Write the normalized dataset to disk without the index column.
df.to_csv(path_or_buf='../data/rq1/rq1_all_norm.csv', index=False)