In [1]:
from tqdm.notebook import tqdm
from time import sleep
import numpy as np
import pandas as pd
import requests
import json

%load_ext autoreload
%autoreload 2
sys.path.append('../')
from src.reviews import *

In [2]:
# Name of the project; it is interpolated into file paths and Gerrit URLs.
project = "neutron"

In [3]:
# Read the per-project commit table from the data directory.
df_commits = pd.read_csv(
    filepath_or_buffer=f'../data/commits/commits_{project}.csv',
)

In [4]:
# Number of commits (rows) that were loaded.
df_commits.shape[0]

10751

Change IDs

In [5]:
# Base URL of the Gerrit commit endpoint; a commit hash is appended per request.
get_prefix = (
    "https://review.opendev.org/projects/"
    f"openstack%2F{project}/commits/"
)

In [6]:
# Add (or overwrite) a change_id column holding an empty string for every commit.
df_commits = df_commits.assign(change_id='')

In [7]:
# Gerrit prefixes every JSON body with the XSSI guard ")]}'"; it has to be
# removed before the payload can be parsed.
XSSI_PREFIX = ")]}'"
# Seconds to wait on a single request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# Look up each commit's message on Gerrit and store the change id extracted
# from it. A commit that cannot be fetched or parsed is reported and keeps an
# empty change_id, so one bad response does not abort the whole crawl.
with requests.Session() as session, tqdm(total=len(df_commits)) as pbar:
    for i, row in df_commits.iterrows():
        pbar.set_postfix({'hash': row['hash']})
        get_addr = get_prefix + row['hash']
        change_id = ''
        try:
            # The session reuses one connection across the many requests.
            response = session.get(get_addr, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            payload = response.text
            if payload.startswith(XSSI_PREFIX):
                payload = payload[len(XSSI_PREFIX):]
            message = json.loads(payload)['message']
        except (requests.RequestException, ValueError, KeyError) as err:
            # ValueError covers a body that is not valid JSON.
            print("---hash {} could not be retrieved: {}".format(row['hash'], err))
        else:
            change_id = get_change_id(message)
            if change_id == '':
                print("---hash {} does not contain a change id in message:\n{}".format(
                    row['hash'], message))
        df_commits.at[i, 'change_id'] = change_id
        # Progress bar
        pbar.update(1)
        # Brief pause between consecutive requests.
        sleep(0.001)

  0%|          | 0/10751 [00:00<?, ?it/s]

In [8]:
# How many commits still have an empty change_id.
df_commits['change_id'].eq('').sum()

0

In [9]:
# Keep only the commits for which a change id was found.
df_commits = df_commits[df_commits['change_id'].ne('')]

Change Messages

In [10]:
# Base URL of the Gerrit change endpoint on master; the change id and
# "/messages" are appended per request.
get_prefix = (
    'https://review.opendev.org/changes/'
    f'openstack%2F{project}~master~'
)

In [11]:
# Accumulator for the review records collected across all changes.
reviews = list()

In [12]:
# Gerrit prefixes every JSON body with the XSSI guard ")]}'"; it has to be
# removed before the payload can be parsed.
XSSI_PREFIX = ")]}'"
# Seconds to wait on a single request before giving up instead of hanging.
REQUEST_TIMEOUT = 30

# Download the message list of every change and collect the review records
# that get_review_info derives from it, tagging each record with the commit
# hash and change id it belongs to. Any failed request (not only a 404) or
# unparsable body is reported and skipped so the crawl keeps going.
with requests.Session() as session, tqdm(total=len(df_commits)) as pbar:
    for _idx, row in df_commits.iterrows():
        pbar.set_postfix({'hash': row['hash']})
        get_addr = get_prefix + row['change_id'] + "/messages"
        try:
            # The session reuses one connection across the many requests.
            response = session.get(get_addr, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            payload = response.text
            if payload.startswith(XSSI_PREFIX):
                payload = payload[len(XSSI_PREFIX):]
            response_json = json.loads(payload)
        except (requests.RequestException, ValueError) as err:
            # ValueError covers a body that is not valid JSON.
            print("---hash {} with change id {} had invalid response: {}".format(
                row['hash'], row['change_id'], err))
        else:
            reviews_for_single_change = get_review_info(project, response_json, row['author_name'])
            # Named `review` rather than `re` to avoid shadowing the standard module name.
            for review in reviews_for_single_change:
                review['hash'] = row['hash']
                review['change_id'] = row['change_id']
            reviews.extend(reviews_for_single_change)
        # Progress bar
        pbar.update(1)
        # Brief pause between consecutive requests.
        sleep(0.001)

  0%|          | 0/10751 [00:00<?, ?it/s]

7a0eec64c0e0513df3f3e9f1f3489086f6316f9 had invalid response
---hash 6e3da8a952cc9a52f375cb70e01e27e8a71c1fff with change id Ifb5cac5b1529fef7862f5a63a0d1592f5bcc01d0 had invalid response
---hash 2add4e5ad4d12c817737d04ddb973b3aeeb25af3 with change id Ie61a9f0c0b0b4896da33a201e42b1c4bc4bae49b had invalid response
---hash f7ae3a04b541767c638fc4c8ff1e0db78ab94996 with change id I1b7bd1773bcd12ab282e77ee0dc41c27846fb66b had invalid response
---hash 4d5ae8852a79eb4ba041122e65052abf8c196efb with change id I2d35d0659bd3f06c570ba99e8b8a41b620253e75 had invalid response
---hash 148bf97ef562241e9d5bb67a55497f8b82c5a829 with change id I3be4ae2a62e92e758b2719161ab1674d0f8bb6af had invalid response
---hash 1cfed745d54a6ce9cb3dd4e6f454666d9e6676c2 with change id I373e1bf2d9b0efc9b1aff01695405f7a70ca6bef had invalid response
---hash 96d1cb1ae2f0188988102a56c2886870af94d88e with change id I86e8048e2d9b84690dbede9a94cfc884985069c5 had invalid response
---hash 7f759c077f8f860c13db92d2ea6b353ef6b70900 w

Save to output

In [13]:
# Turn the collected review records into a table.
df_reviews = pd.DataFrame(data=reviews)

In [14]:
# Write the review table to the per-project CSV, leaving out the index column.
df_reviews.to_csv(
    path_or_buf=f"../data/reviews/reviews_{project}.csv",
    index=False,
)