In [1]:
%load_ext autoreload
%autoreload 2

import sys
from tqdm.notebook import tqdm
from time import sleep
sys.path.append('../')
from src.combined import *

In [2]:
# Project whose data is processed; interpolated into the CSV file names read and written below.
project = "cinder"

Load commits and reviews

In [3]:
# Load the commits table for the selected project.
# Forward slashes work on Windows, Linux and macOS alike; the previous
# backslash path only resolved on Windows and relied on '\d' and '\c' being
# unrecognised escape sequences (a SyntaxWarning on recent Python versions).
df_commits = pd.read_csv(f'../data/commits/commits_{project}.csv')

In [4]:
# Load the reviews table for the selected project.
# Forward slashes are portable and avoid the escape pitfalls of the previous
# backslash path ('\r' had to be doubled to avoid a carriage return, while
# '\d' was an unrecognised escape sequence).
df_reviews = pd.read_csv(f'../data/reviews/reviews_{project}.csv')

Keep only commits with reviews

In [5]:
# True for every commit whose hash appears in the reviews table
has_review = df_commits['hash'].isin(df_reviews['hash'])
# Hashes of the commits that have no review row at all, as a NumPy array
commits_without_reviews = df_commits.loc[~has_review, 'hash'].values

In [6]:
# Number of commits that have no matching review
commits_without_reviews.shape[0]

15

In [7]:
# Keep only the commits that have at least one review.
# The explicit .copy() makes the result an independent frame, so assigning
# the 'prior_commits' column on it afterwards does not raise
# SettingWithCopyWarning or risk writing to a temporary slice.
df_commits = df_commits.loc[df_commits['hash'].isin(df_reviews['hash'])].copy()

In [8]:
# Number of commits remaining after the filter
df_commits.shape[0]

8520

Re-format `prior_commits` into lists, dropping commits without reviews

In [9]:
# Hash-based membership: `c in <numpy array>` compares c against every element
# on each test, whereas a set lookup is constant time.
unreviewed_hashes = set(commits_without_reviews)

# Turn each raw `prior_commits` value into a list (via split_prior_commits)
# and drop the entries that refer to commits without any review.
df_commits['prior_commits'] = df_commits['prior_commits'].apply(
    lambda raw: [c for c in split_prior_commits(raw) if c not in unreviewed_hashes]
)

Join dataframes

In [10]:
# Index both tables by commit hash so they can be joined on it
commits_by_hash = df_commits.set_index('hash')
reviews_by_hash = df_reviews.set_index('hash')

# Left join keeps every commit (a commit with several review rows yields
# several output rows), then 'hash' is restored as a regular column
df = commits_by_hash.join(reviews_by_hash, how = 'left').reset_index()

In [11]:
# Number of rows in the joined frame
df.shape[0]

38545

In [12]:
# Sanity check: expected to be 0, because commits without reviews were
# already removed before the join
int(df['change_id'].isnull().sum())

0

Compute expertise for author and reviewers

In [13]:
# Create the four expertise columns up front, filled with NaN until a value
# is written for the row
for expertise_column in (
    'pct_prior_commits_author_authored',
    'pct_prior_commits_author_reviewed',
    'pct_prior_commits_reviewer_authored',
    'pct_prior_commits_reviewer_reviewed',
):
    df[expertise_column] = np.nan

In [14]:
# For every (commit, reviewer) row, compute which fraction of the commit's
# prior commits were authored / reviewed by the row's author and by the row's
# reviewer, and store the four fractions in the pct_prior_commits_* columns.

# Build the hash -> value lookups once, so the loop does dictionary lookups
# instead of re-scanning df_commits and df_reviews with a boolean mask for
# every prior commit of every row.
# drop_duplicates keeps the first row per hash, matching the former `.values[0]`.
author_by_hash = (
    df_commits.drop_duplicates(subset = 'hash')
    .set_index('hash')['author_name']
    .to_dict()
)
# Every reviewer name recorded for a hash (repeated names are kept).
reviewers_by_hash = df_reviews.groupby('hash')['reviewer_name'].agg(list).to_dict()

with tqdm(total = len(df)) as pbar:
    for idx, row in df.iterrows():
        pbar.set_postfix({'hash' : row['hash'], 'reviewer' : row['reviewer_name']})
        prior_commits = row['prior_commits']
        # Rows without any prior commit keep NaN in all four columns
        if len(prior_commits) > 0:
            n_prior_commits = len(prior_commits)
            # How many prior commits each person authored.
            # A prior commit missing from df_commits raises KeyError here
            # (the mask-based lookup used to raise IndexError in that case).
            prior_authors_count = Counter(author_by_hash[c] for c in prior_commits)
            # How many times each person shows up as reviewer of a prior commit;
            # a prior commit with no review rows contributes nothing.
            prior_reviewers_count = Counter()
            for prior_commit in prior_commits:
                prior_reviewers_count.update(reviewers_by_hash.get(prior_commit, ()))
            author = row['author_name']
            reviewer = row['reviewer_name']
            # Counter returns 0 for people who never appear, so no key checks are needed
            shares = {
                'pct_prior_commits_author_authored' : prior_authors_count[author] / n_prior_commits,
                'pct_prior_commits_author_reviewed' : prior_reviewers_count[author] / n_prior_commits,
                'pct_prior_commits_reviewer_authored' : prior_authors_count[reviewer] / n_prior_commits,
                'pct_prior_commits_reviewer_reviewed' : prior_reviewers_count[reviewer] / n_prior_commits,
            }
            # Validate every fraction before writing anything for this row
            for column, share in shares.items():
                assert 0 <= share <= 1, f'{column} out of range for row {idx}: {share}'
            # Update values
            for column, share in shares.items():
                df.at[idx, column] = share
        # Progress bar: one tick per row. The former sleep(0.001) per row only
        # added idle time (~1 ms x len(df)) and is not needed by tqdm.
        pbar.update(1)

  0%|          | 0/38545 [00:00<?, ?it/s]

Save output

In [15]:
# Write the result without the list-valued 'prior_commits' column.
# Forward slashes keep the path portable; the previous backslash path only
# resolved on Windows and mixed an unrecognised '\d' escape with doubled
# backslashes.
df.drop(columns = ['prior_commits']).to_csv(f'../data/rq1/rq1_{project}.csv', index = False)