In [1]:
from tqdm.notebook import tqdm
from time import sleep
import numpy as np
import pandas as pd
import requests
import json

%load_ext autoreload
%autoreload 2
sys.path.append('../')
from src.reviews import *

In [2]:
# Project name to process; it is interpolated into the input/output CSV
# paths and into the review.opendev.org request URLs in later cells.
project: str = 'glance'

In [3]:
# Read the commit table for the selected project; later cells use its
# 'hash' and 'author_name' columns.
commits_csv_path = f'../data/commits/commits_{project}.csv'
df_commits = pd.read_csv(commits_csv_path)

In [4]:
# Row count of the commit table (one row per commit).
df_commits.shape[0]

2942

Change IDs

In [5]:
# Base URL of the per-project commits endpoint on review.opendev.org.
# "%2F" is the URL-encoded "/" in "openstack/<project>"; the commit hash is
# appended to this prefix for each request.
get_prefix = f"https://review.opendev.org/projects/openstack%2F{project}/commits/"

In [6]:
# Start every commit with an empty change id; '' doubles as the
# "no change id found" sentinel that later cells count and filter on.
df_commits['change_id'] = ''

In [7]:
# Look up every commit on the review server and extract the change id from
# its commit message, storing it in df_commits['change_id'].
#
# A commit whose request fails, or whose message carries no change id, keeps
# the '' sentinel and is reported on stdout so it can be filtered out later.
# Network-level errors (including timeouts) are deliberately left to raise.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() can block forever

with tqdm(total=len(df_commits)) as pbar:
    for i, row in df_commits.iterrows():
        pbar.set_postfix({'hash': row['hash']})
        get_addr = get_prefix + row['hash']
        response = requests.get(get_addr, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            # An error body is not the expected JSON payload; skip the commit
            # instead of crashing inside json.loads / the 'message' lookup.
            print("---hash {} had invalid response (HTTP {})".format(
                row['hash'], response.status_code))
            change_id = ''
        else:
            # Gerrit prepends the 5-character XSSI guard ")]}'\n" to JSON
            # bodies; drop it before parsing.
            response_json = json.loads(response.text[5:])
            message = response_json['message']
            change_id = get_change_id(message)
            if change_id == '':
                print("---hash {} does not contain a change id in message:\n{}".format(
                    row['hash'], message))
        df_commits.at[i, 'change_id'] = change_id
        # Progress bar
        pbar.update(1)
        # Tiny pause between consecutive requests.
        sleep(0.001)

  0%|          | 0/2942 [00:00<?, ?it/s]

---hash 014bd35767dcaef5244f6a0a5f3236e140a5d37b does not contain a change id in message:
Merge "Replace openstack.org git:// URLs with https://"


In [8]:
# Number of commits left with the '' sentinel, i.e. without a change id.
(df_commits['change_id'] == '').sum()

1

In [9]:
# Keep only the commits for which a change id was found.
has_change_id = df_commits['change_id'] != ''
df_commits = df_commits.loc[has_change_id]

Change Messages

In [10]:
# Base URL for a change of this project on the master branch; the change id
# and "/messages" are appended to it for each request.
# NOTE(review): this rebinds `get_prefix`, which earlier held the commits
# endpoint, so re-running the change-id lookup cell after this one would
# request the wrong URLs.
get_prefix = f'https://review.opendev.org/changes/openstack%2F{project}~master~'

In [11]:
# Accumulator for the review records returned by get_review_info across
# all changes; converted to a DataFrame at the end of the notebook.
reviews: list = []

In [13]:
# Download the message history of every change and collect the review
# records extracted by get_review_info into `reviews`, tagging each record
# with the commit hash and change id it belongs to.
#
# Changes whose request does not succeed are reported on stdout and skipped.
# Network-level errors (including timeouts) are deliberately left to raise.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get() can block forever

# Reset the accumulator here so that re-running this cell (e.g. after an
# interrupted run) cannot append the same reviews twice.
reviews = []

with tqdm(total=len(df_commits)) as pbar:
    for _, row in df_commits.iterrows():
        pbar.set_postfix({'hash': row['hash']})
        get_addr = get_prefix + row['change_id'] + "/messages"
        response = requests.get(get_addr, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            # 404 keeps the original message; any other failure (429, 5xx, ...)
            # also reports its status code instead of crashing in json.loads.
            detail = '' if response.status_code == 404 else ' (HTTP {})'.format(
                response.status_code)
            print("---hash {} with change id {} had invalid response{}".format(
                row['hash'], row['change_id'], detail))
        else:
            # Gerrit prepends the 5-character XSSI guard ")]}'\n" to JSON
            # bodies; drop it before parsing.
            response_json = json.loads(response.text[5:])
            reviews_for_single_change = get_review_info(project, response_json, row['author_name'])
            # `review` rather than `re`, to avoid shadowing the stdlib module name.
            for review in reviews_for_single_change:
                review['hash'] = row['hash']
                review['change_id'] = row['change_id']
            reviews.extend(reviews_for_single_change)
        # Progress bar
        pbar.update(1)
        # Tiny pause between consecutive requests.
        sleep(0.001)

  0%|          | 0/2941 [00:00<?, ?it/s]

---hash 819f28a0b8863bd18f8a14491b5966c8b2723432 with change id I15686708fc9460948a58cfea3d18dae40ba1fda9 had invalid response
---hash 00f030e34608a7311fea9fe8383e55871c13b5e7 with change id If1bdff3f7330c3eb58bd56b08299472ae3d4b552 had invalid response
---hash e3bed85d5123764dfa25059bba31acbf5fa9c035 with change id Ia0ce51683a21b7d9acd465d6e43da0808ea258cc had invalid response
---hash 7387674374ed776f9b0df49e9176af2ca29c0c7f with change id I14c42886cbeef94ba8cfcaf5ec36d30e5cf22500 had invalid response
---hash 040d5fa69cdc60684f45afc6690a2f6226b7c007 with change id Ic3bdd0977e1ad891ef84af37693eccbed3ccbd5e had invalid response


Save to output

In [14]:
# Turn the collected review records into a table (one row per record).
df_reviews = pd.DataFrame(data=reviews)

In [15]:
# Write the review table for this project to CSV, without the index column.
reviews_csv_path = f"../data/reviews/reviews_{project}.csv"
df_reviews.to_csv(reviews_csv_path, index=False)