In [2]:
import json
import pandas as pd

In [3]:
# Accumulator for the per-project frames read from disk.
dfs = list()

In [4]:
# Read one CSV per project and tag every row with the project it came from.
# `dfs` is rebound here instead of being appended to, so re-running this cell
# cannot accumulate duplicate copies of the same project's rows.
dfs = [
    pd.read_csv(f'../data/rq1/rq1_{project}.csv').assign(project=project)
    for project in ['cinder', 'glance', 'neutron']
]

In [5]:
# Stack the per-project frames row-wise and renumber the index from 0.
df = pd.concat(dfs, axis=0, ignore_index=True)

In [6]:
# Number of rows in the combined frame.
df.shape[0]

107213

## Patch - fill in NaNs

In [7]:
# Per-column count of missing values.
df.isnull().sum()

hash                                       0
author_name                                0
committer_name                             0
author_date                                0
commit_date                                0
num_lines_added                            0
num_lines_deleted                          0
num_lines_of_code                          0
num_file_impacted                          0
num_dirs_impacted                          0
min_complexity                             0
mean_complexity                            0
max_complexity                             0
entropy                                    0
bug_fixing                                 0
description_length                         0
num_prior_commits                          0
avg_prior_age                              0
num_prior_commits_bug_fixing               0
num_future_commits_bug_fixing              0
fix_inducing                               0
reviewer_id                                0
reviewer_n

In [8]:
# Replace every missing value, in every column, with 0.
df = df.fillna(value=0)

## Patch - derive experience flags

In [9]:
# Boolean flags: True where the matching `pct_prior_commits_author_*` value
# is strictly greater than 0.05.
df['author_is_exp_author'] = df['pct_prior_commits_author_authored'].gt(0.05)
df['author_is_exp_reviewer'] = df['pct_prior_commits_author_reviewed'].gt(0.05)

In [10]:
# Boolean flags: True where the matching `pct_prior_commits_reviewer_*` value
# is strictly greater than 0.
# NOTE(review): the author flags use a 0.05 cutoff while these use 0 — confirm
# the difference is intentional.
df['reviewer_is_exp_author'] = df['pct_prior_commits_reviewer_authored'].gt(0)
df['reviewer_is_exp_reviewer'] = df['pct_prior_commits_reviewer_reviewed'].gt(0)

In [11]:
# Remove the four raw `pct_prior_commits_*` columns from the frame.
df = df.drop(
    labels=[
        'pct_prior_commits_author_authored',
        'pct_prior_commits_author_reviewed',
        'pct_prior_commits_reviewer_authored',
        'pct_prior_commits_reviewer_reviewed',
    ],
    axis='columns',
)

## Patch - fill in `author_is_core`

In [12]:
# Load the mapping of project name -> list of core developer names.
# The encoding is pinned to UTF-8 so that developer names decode the same way
# on every platform instead of depending on the locale's default encoding.
with open('../data/core_devs.json', 'r', encoding='utf-8') as core_devs_file:
    CORE_DEVS = json.load(core_devs_file)

In [13]:
# Display the loaded project -> core developer names mapping for inspection.
CORE_DEVS

{'cinder': ['Brian Rosmaita',
  'Eric Harney',
  'Gorka Eguileor',
  'Ivan Kolodyazhny',
  'Jay Bryant',
  'Lucio Seki',
  'Rajat Dhasmana',
  'Sean McGinnis',
  'Hemna'],
 'glance': ['Abhishek Kekane',
  'Brian Rosmaita',
  'Dan Smith',
  'Glance Bot',
  'Nikhil Komawar',
  'Sean McGinnis'],
 'neutron': ['Akihiro Motoki',
  'Brian Haley',
  'Hongbin Lu',
  'Jakub Libosvar',
  'Lajos Katona',
  'Miguel Lavalle',
  'Nate Johnston',
  'Oleg Bondarev',
  'Rodolfo Alonso',
  'Slawek Kaplonski',
  'YAMAMOTO Takashi',
  'LIU Yulong'],
 'nova': ['Alex Xu',
  'Balazs Gibizer',
  'Dan Smith',
  'Eric Fried',
  'Ghanshyam',
  'John Garbutt',
  'Lee Yarwood',
  'stephenfin',
  'Sylvain Bauza',
  'melwitt']}

In [14]:
# Flag each row whose author appears in the core-developer list of that row's
# own project (membership is checked per project, not across all projects).
# The column is built in one pass and assigned once, instead of iterating with
# `iterrows` and writing cell by cell through `df.at`, which is slow on a frame
# of this size and builds the column one scalar at a time.
# As before, a project missing from CORE_DEVS raises KeyError.
core_dev_sets = {name: set(devs) for name, devs in CORE_DEVS.items()}
df['author_is_core'] = [
    author in core_dev_sets[project]
    for project, author in zip(df['project'], df['author_name'])
]

In [15]:
# Sanity check: how many rows are flagged as authored by a core developer.
df['author_is_core'].value_counts()

False    94843
True     12370
Name: author_is_core, dtype: int64

## Target variable

In [16]:
# Distribution of the raw reviewer vote values.
df['reviewer_vote'].value_counts()

 2    47888
 1    47795
-1    10801
-2      729
Name: reviewer_vote, dtype: int64

In [17]:
# Binary target: True where the reviewer's vote is strictly greater than 0.
df['pos_vote'] = df['reviewer_vote'].gt(0)

In [18]:
# Class balance of the binary target.
df['pos_vote'].value_counts()

True     95683
False    11530
Name: pos_vote, dtype: int64

## Make binary variables 1/0

In [19]:
# Columns holding True/False values that are converted to 1/0 further down.
binary_cols = [
    'bug_fixing',
    'author_is_core',
    'reviewer_is_core',
    'pos_vote',
    'author_is_exp_author',
    'author_is_exp_reviewer',
    'reviewer_is_exp_author',
    'reviewer_is_exp_reviewer',
]

In [20]:
# Print the value distribution of every binary column.
for col in binary_cols:
    counts = df[col].value_counts()
    print(counts)

True     71377
False    35836
Name: bug_fixing, dtype: int64
False    94843
True     12370
Name: author_is_core, dtype: int64
False    84248
True     22965
Name: reviewer_is_core, dtype: int64
True     95683
False    11530
Name: pos_vote, dtype: int64
False    81080
True     26133
Name: author_is_exp_author, dtype: int64
False    85834
True     21379
Name: author_is_exp_reviewer, dtype: int64
False    100414
True       6799
Name: reviewer_is_exp_author, dtype: int64
False    67853
True     39360
Name: reviewer_is_exp_reviewer, dtype: int64


In [21]:
# Cast every binary column to int with a single astype mapping.
df = df.astype({col: int for col in binary_cols})

In [22]:
# Print the value distribution of every binary column.
for col in binary_cols:
    counts = df[col].value_counts()
    print(counts)

1    71377
0    35836
Name: bug_fixing, dtype: int64
0    94843
1    12370
Name: author_is_core, dtype: int64
0    84248
1    22965
Name: reviewer_is_core, dtype: int64
1    95683
0    11530
Name: pos_vote, dtype: int64
0    81080
1    26133
Name: author_is_exp_author, dtype: int64
0    85834
1    21379
Name: author_is_exp_reviewer, dtype: int64
0    100414
1      6799
Name: reviewer_is_exp_author, dtype: int64
0    67853
1    39360
Name: reviewer_is_exp_reviewer, dtype: int64


## Export

In [23]:
# Write the combined, cleaned frame to disk without the index column.
df.to_csv(path_or_buf='../data/rq1/rq1_all.csv', index=False)