In [1]:
from tqdm.notebook import tqdm
from time import sleep
import numpy as np
import pandas as pd
import requests
import json

%load_ext autoreload
%autoreload 2
sys.path.append('../')
from src.reviews import *

In [2]:
# Project name; interpolated into the data file names and the Gerrit URLs below.
project = "nova"

In [3]:
# Load the commit list for the selected project.
# Forward slashes are used because the previous backslash path contained
# invalid escape sequences ("\d", "\c") and only resolved on Windows;
# "/" is accepted as a separator on Windows as well as POSIX systems.
df_commits = pd.read_csv(f'../data/commits/commits_{project}.csv')

In [4]:
# Number of commits (rows) that were loaded.
df_commits.shape[0]

21796

Change IDs

In [5]:
# Base URL of the commit endpoint; a commit hash is appended to it per request.
get_prefix = "https://review.opendev.org/projects/openstack%2F{}/commits/".format(project)

In [6]:
# Start every commit with an empty change id; the lookup loop fills it in.
df_commits.loc[:, 'change_id'] = ''

In [7]:
# Look up each commit through the REST endpoint and store the change id
# extracted from its commit message in df_commits['change_id'].
with tqdm(total=len(df_commits)) as pbar:
    for i, row in df_commits.iterrows():
        pbar.set_postfix({'hash': row['hash']})
        get_addr = get_prefix + row['hash']
        # A timeout keeps a single stalled connection from hanging the whole run.
        response = requests.get(get_addr, timeout=30)
        # Surface HTTP failures with their status code instead of letting an
        # error page reach json.loads and fail with an opaque decoding error.
        response.raise_for_status()
        # Gerrit prepends the 5-character XSSI guard line ")]}'" + newline
        # to JSON bodies; strip it before parsing.
        response_json = json.loads(response.text[5:])
        message = response_json['message']
        change_id = get_change_id(message)
        if change_id == '':
            print("---hash {} does not contain a change id in message:\n{}".format(
                row['hash'], message))
        df_commits.at[i, 'change_id'] = change_id
        # Progress bar
        pbar.update(1)
        # Short pause between consecutive requests.
        sleep(0.001)

  0%|          | 0/21796 [00:00<?, ?it/s]

---hash 4180e0f147608aebf4d74033553f526b21329946 does not contain a change id in message:
Merge "Revert "Fix migration and instance resize update order""


In [8]:
# How many commits ended up without a change id.
(df_commits['change_id'] == '').sum()

1

In [9]:
# Keep only the commits for which a change id was found.
has_change_id = df_commits['change_id'].ne('')
df_commits = df_commits[has_change_id]

Change Messages

In [10]:
# Base URL of the change endpoint on the master branch; the change id and
# "/messages" are appended to it per request.
get_prefix = 'https://review.opendev.org/changes/openstack%2F{}~master~'.format(project)

In [11]:
# Accumulator for the review records of all changes.
reviews = list()

In [15]:
# Fetch the review messages of every change and collect them in `reviews`.
#
# Commits whose hash is already present in `reviews` are skipped, so re-running
# this cell after an interruption resumes where it stopped. This replaces the
# hand-edited `i > 6150` cutoff, which silently dropped the first part of the
# data set whenever the notebook was run from a fresh kernel.
fetched_hashes = {review['hash'] for review in reviews}
with tqdm(total=len(df_commits)) as pbar:
    for _, row in df_commits.iterrows():
        if row['hash'] not in fetched_hashes:
            pbar.set_postfix({'hash': row['hash']})
            get_addr = get_prefix + row['change_id'] + "/messages"
            # A timeout keeps a single stalled connection from hanging the run.
            response = requests.get(get_addr, timeout=30)
            if response.status_code == 404:
                # Change not found under this prefix: report it and move on.
                print("hash {} with change id {} had invalid response".format(
                    row['hash'], row['change_id']))
            else:
                # Any other HTTP failure stops the loop with a clear status
                # error rather than an opaque JSON decoding error; the cell
                # can then simply be re-run to resume.
                response.raise_for_status()
                # Strip the 5-character XSSI guard line Gerrit puts before JSON.
                response_json = json.loads(response.text[5:])
                reviews_for_single_change = get_review_info(project, response_json)
                # Tag each review with the commit it belongs to
                # (loop variable is not called `re`, to avoid shadowing the regex module name).
                for review in reviews_for_single_change:
                    review['hash'] = row['hash']
                    review['change_id'] = row['change_id']
                reviews.extend(reviews_for_single_change)
            # Short pause between consecutive requests.
            sleep(0.001)
        # Progress bar
        pbar.update(1)

  0%|          | 0/21795 [00:00<?, ?it/s]

hash 1fb1b058211f08c0b993372e734ed62cd9267193 with change id I91289cc4a60f5dab89bca852e6f52b4b83831e47 had invalid response
hash a88d9d5936aabc52046ed3ae566741422bfaed78 with change id Ife909bdf3277ef33c2fb1eae16ae261fa6374c63 had invalid response
hash 10a5eecd0973096b57efd31f8b27d7295a44ab89 with change id I64b2b468f7edd44dbb445b5b4e68b65c3fa53d9e had invalid response
hash 22d7547c6b62fb9dabd861e4941edd34eedabfc6 with change id I6356513ac42b79402dbde8ee5e75cbbd1aee7eef had invalid response
hash bd6a40fecde943a3ded0124481a12c27dbb167de with change id I0e9ef00182a2229602d23b8a67a02f0be62ee239 had invalid response
hash 51d74bdf18df39695ad5e500db6db830f21da36a with change id I9690bc4f6f1dca8f2bf8c5a83f16af6200015506 had invalid response
hash 6df6ad3ff32f2b1fe2978df1032002548ad8eb66 with change id I1e06e77308a7dd23209124f0807d61fb52470188 had invalid response
hash a970127bf7d0de2328f8bdfe1f8201c3938b6d77 with change id Ie60313913fdb70d6fbbdcceced621e14d2cf54eb had invalid response


Save to output

In [16]:
# Turn the collected review records into a table, one row per record.
df_reviews = pd.DataFrame(data=reviews)

In [17]:
# Write the reviews table to disk without the index column.
# Forward slashes replace the previous mix of "\d" (an invalid escape
# sequence) and "\\" separators, which only resolved on Windows.
df_reviews.to_csv(f"../data/reviews/reviews_{project}.csv", index=False)