In [1]:
import json
import pandas as pd

In [2]:
# Accumulator for the per-project frames loaded in the next cell.
dfs = list()

In [3]:
# Read each project's RQ1 table and tag every row with its project name.
# The list is rebuilt from scratch here (instead of appending to the existing
# `dfs`) so that re-running this cell cannot load the same project twice and
# silently duplicate rows in the concatenated frame.
dfs = [
    pd.read_csv(f'../data/rq1/rq1_{project}.csv').assign(project=project)
    for project in ['cinder', 'glance', 'neutron']
]

In [4]:
# Stack the per-project frames row-wise and renumber the index from 0.
df = pd.concat(dfs, axis=0, ignore_index=True)

In [5]:
# Number of rows in the combined frame.
df.shape[0]

107300

## Patch - make experience variables

In [6]:
# Author experience flags: True where the corresponding pct_prior_commits_*
# column is strictly greater than 0.05.
df['author_is_exp_author'] = df['pct_prior_commits_author_authored'].gt(0.05)
df['author_is_exp_reviewer'] = df['pct_prior_commits_author_reviewed'].gt(0.05)

In [7]:
# Reviewer experience flags: True where the corresponding pct_prior_commits_*
# column is strictly positive.
# NOTE(review): the author flags are built with a 0.05 cutoff while these use
# 0 -- confirm the asymmetry is intended.
df['reviewer_is_exp_author'] = df['pct_prior_commits_reviewer_authored'].gt(0)
df['reviewer_is_exp_reviewer'] = df['pct_prior_commits_reviewer_reviewed'].gt(0)

In [8]:
# The raw pct_prior_commits_* columns have been turned into boolean flags
# above, so remove all four (author/reviewer x authored/reviewed).
pct_columns = [
    f'pct_prior_commits_{who}_{action}'
    for who in ('author', 'reviewer')
    for action in ('authored', 'reviewed')
]
df = df.drop(columns=pct_columns)

## Patch - fill in `author_is_core`

In [9]:
# Parse the core-developer JSON file straight from the open file handle.
with open('../data/core_devs.json', 'r') as core_devs_file:
    CORE_DEVS = json.load(core_devs_file)

In [10]:
# Display the loaded mapping (project name -> list of developer names).
CORE_DEVS

{'cinder': ['Brian Rosmaita',
  'Eric Harney',
  'Gorka Eguileor',
  'Ivan Kolodyazhny',
  'Jay Bryant',
  'Lucio Seki',
  'Rajat Dhasmana',
  'Sean McGinnis',
  'Hemna'],
 'glance': ['Abhishek Kekane',
  'Brian Rosmaita',
  'Dan Smith',
  'Glance Bot',
  'Nikhil Komawar',
  'Sean McGinnis'],
 'neutron': ['Akihiro Motoki',
  'Brian Haley',
  'Hongbin Lu',
  'Jakub Libosvar',
  'Lajos Katona',
  'Miguel Lavalle',
  'Nate Johnston',
  'Oleg Bondarev',
  'Rodolfo Alonso',
  'Slawek Kaplonski',
  'YAMAMOTO Takashi',
  'LIU Yulong'],
 'nova': ['Alex Xu',
  'Balazs Gibizer',
  'Dan Smith',
  'Eric Fried',
  'Ghanshyam',
  'John Garbutt',
  'Lee Yarwood',
  'stephenfin',
  'Sylvain Bauza',
  'melwitt']}

In [11]:
# Flag every row whose author appears in the core-developer list of that
# row's project.
#
# The flags are computed in one pass over the `project` and `author_name`
# columns and assigned as a whole column. This replaces a row-by-row
# iterrows()/df.at loop, which builds a Series per row and is very slow on a
# frame of this size. Names are looked up in per-project sets rather than
# lists so each membership test is constant-time.
#
# A project missing from CORE_DEVS still raises KeyError, as before.
core_dev_sets = {project: set(names) for project, names in CORE_DEVS.items()}
df['author_is_core'] = [
    author in core_dev_sets[project]
    for project, author in zip(df['project'], df['author_name'])
]

In [12]:
# How many rows were written by a core developer vs. not.
df.loc[:, 'author_is_core'].value_counts()

False    94930
True     12370
Name: author_is_core, dtype: int64

## Make target variable

In [13]:
# Distribution of the raw reviewer votes.
df.loc[:, 'reviewer_vote'].value_counts()

 2    47922
 1    47843
-1    10804
-2      731
Name: reviewer_vote, dtype: int64

In [14]:
# Target variable: True for any strictly positive reviewer vote.
df['pos_vote'] = df['reviewer_vote'].gt(0)

In [15]:
# Class balance of the target variable.
df.loc[:, 'pos_vote'].value_counts()

True     95765
False    11535
Name: pos_vote, dtype: int64

## Make binary variables 1/0

In [16]:
# Names of the boolean columns that get converted to 0/1 below.
# Order matters: it is the order in which the columns are printed.
binary_cols = """
    bug_fixing
    fix_inducing
    author_is_core
    reviewer_is_core
    pos_vote
    author_is_exp_author
    author_is_exp_reviewer
    reviewer_is_exp_author
    reviewer_is_exp_reviewer
""".split()

In [17]:
# Print the value counts of each flag column before the 0/1 conversion.
for flag in binary_cols:
    flag_counts = df[flag].value_counts()
    print(flag_counts)

True     71432
False    35868
Name: bug_fixing, dtype: int64
True     66018
False    41282
Name: fix_inducing, dtype: int64
False    94930
True     12370
Name: author_is_core, dtype: int64
False    84335
True     22965
Name: reviewer_is_core, dtype: int64
True     95765
False    11535
Name: pos_vote, dtype: int64
False    98903
True      8397
Name: author_is_exp_author, dtype: int64
False    101638
True       5662
Name: author_is_exp_reviewer, dtype: int64
False    106032
True       1268
Name: reviewer_is_exp_author, dtype: int64
False    95801
True     11499
Name: reviewer_is_exp_reviewer, dtype: int64


In [18]:
# Cast every flag column to integer (True -> 1, False -> 0) in one call.
df = df.astype({flag: int for flag in binary_cols})

In [19]:
# Print the value counts again to confirm the totals survived the cast.
for flag in binary_cols:
    flag_counts = df[flag].value_counts()
    print(flag_counts)

1    71432
0    35868
Name: bug_fixing, dtype: int64
1    66018
0    41282
Name: fix_inducing, dtype: int64
0    94930
1    12370
Name: author_is_core, dtype: int64
0    84335
1    22965
Name: reviewer_is_core, dtype: int64
1    95765
0    11535
Name: pos_vote, dtype: int64
0    98903
1     8397
Name: author_is_exp_author, dtype: int64
0    101638
1      5662
Name: author_is_exp_reviewer, dtype: int64
0    106032
1      1268
Name: reviewer_is_exp_author, dtype: int64
0    95801
1    11499
Name: reviewer_is_exp_reviewer, dtype: int64


## Export

In [20]:
# Summary statistics, one row per numeric column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_lines_added,107300.0,134.446347,884.733974,0.0,5.0,25.0,88.0,110133.0
num_lines_deleted,107300.0,127.558052,7027.968578,0.0,1.0,6.0,27.0,1059841.0
num_lines_of_code,107300.0,2462.982889,7442.339279,0.0,321.0,1042.0,2767.0,840667.0
num_file_impacted,107300.0,5.179776,16.725542,0.0,1.0,2.0,5.0,1361.0
num_dirs_impacted,107300.0,3.252721,5.734251,0.0,1.0,2.0,3.0,319.0
min_complexity,107300.0,48.586999,78.575332,0.0,3.0,19.0,61.0,974.0
mean_complexity,107300.0,81.629744,91.198439,0.0,19.272727,56.666667,112.666667,987.0
max_complexity,107300.0,136.870587,148.096808,0.0,26.0,94.0,197.0,1000.0
entropy,107300.0,0.542703,0.38603,0.0,0.0,0.69422,0.875624,1.0
bug_fixing,107300.0,0.665722,0.47174,0.0,0.0,1.0,1.0,1.0


In [21]:
# Write the combined, un-normalized frame without the index column.
df.to_csv(path_or_buf='../data/rq1/rq1_all.csv', index=False)

## Normalize numerical columns

In [22]:
# Names of the numeric columns that are min-max scaled in the next cell.
num_cols = """
    min_complexity
    mean_complexity
    max_complexity
    num_prior_votes
    num_lines_added
    num_lines_deleted
    num_lines_of_code
    num_dirs_impacted
    num_file_impacted
    description_length
    num_prior_commits
    num_prior_comments
    num_prior_commits_bug_fixing
    num_future_commits_bug_fixing
    avg_prior_age
""".split()

In [23]:
# Min-max scale each numeric column to the [0, 1] range.
for col in num_cols:
    col_min = df[col].min()
    col_range = df[col].max() - col_min
    shifted = df[col] - col_min
    if col_range == 0:
        # A constant column has zero range; dividing would compute 0/0 and
        # silently turn every value into NaN. Map it to 0.0 instead (rows
        # that were already missing stay missing).
        df[col] = shifted.astype(float)
    else:
        df[col] = shifted / col_range

## Export normalized

In [24]:
# Summary statistics after scaling, one row per numeric column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_lines_added,107300.0,0.001221,0.008033,0.0,4.539965e-05,0.000227,0.000799,1.0
num_lines_deleted,107300.0,0.00012,0.006631,0.0,9.435378e-07,6e-06,2.5e-05,1.0
num_lines_of_code,107300.0,0.00293,0.008853,0.0,0.0003818397,0.001239,0.003291,1.0
num_file_impacted,107300.0,0.003806,0.012289,0.0,0.0007347539,0.00147,0.003674,1.0
num_dirs_impacted,107300.0,0.010197,0.017976,0.0,0.003134796,0.00627,0.009404,1.0
min_complexity,107300.0,0.049884,0.080673,0.0,0.003080082,0.019507,0.062628,1.0
mean_complexity,107300.0,0.082705,0.0924,0.0,0.01952657,0.057413,0.114151,1.0
max_complexity,107300.0,0.136871,0.148097,0.0,0.026,0.094,0.197,1.0
entropy,107300.0,0.542703,0.38603,0.0,0.0,0.69422,0.875624,1.0
bug_fixing,107300.0,0.665722,0.47174,0.0,0.0,1.0,1.0,1.0


In [25]:
# Write the normalized frame without the index column.
df.to_csv(path_or_buf='../data/rq1/rq1_all_norm.csv', index=False)