In [1]:
## numpy repository의 PR 카테고리 분석. 각 카테고리별 Merge 율

In [2]:
import requests
import json

In [3]:
 # https://docs.github.com/en/free-pro-team@latest/rest/reference/pulls#list-pull-requests
class PullRequests:
    def __init__(self):
        self.url = 'https://api.github.com/repos/numpy/numpy/pulls'
        self.page_size = 100
        self.result = list()
    
    def download(self, page):
        headers = {'Accept': 'application/vnd.github.v3+json'}
        params = {'state': 'closed', 'per_page': self.page_size, 'page': page}
        response = requests.get(self.url, params=params, headers=headers)
        if response.status_code != 200:
            print('Can not download pull requests')
            return None
        else:
            response.encoding = 'utf-8'
            return response.text
        
    def __parse_category__(self, labels):
        categories = list()
        if labels is not None and len(labels) >= 1:
            for label in labels:
                categories.append(label['name'])
        else:
            return None
        return categories
        
    def parse(self, response):
        data_list = json.loads(response)
        for data in data_list:
            parsed_item = dict()
            parsed_item['issue_number'] = data['number']
            parsed_item['title'] = data['title']
            parsed_item['category'] = self.__parse_category__(data['labels'])
            parsed_item['is_merged'] = data['merged_at'] is not None
            self.result.append(parsed_item)

    def scraping(self):
        self.result = list()
        for page in range(1, 11):
            response = self.download(page)
            self.parse(response)

In [4]:
numpy_pull_requests = PullRequests()

In [5]:
numpy_pull_requests.scraping()

In [6]:
numpy_pull_requests.result[:3]

[{'issue_number': 17775,
  'title': 'BUG: Fixed file handle leak in array_tofile.',
  'category': ['00 - Bug', '08 - Backport', 'component: numpy.core'],
  'is_merged': True},
 {'issue_number': 17774,
  'title': "BUG: fix np.timedelta64('nat').__format__ throwing an exception",
  'category': ['00 - Bug', '08 - Backport', 'component: numpy.core'],
  'is_merged': True},
 {'issue_number': 17773,
  'title': 'MAINT: Add BLD and STY to labeler prefixes.',
  'category': ['03 - Maintenance'],
  'is_merged': True}]

In [7]:
# "숫자 -"로 구성된 카테고리 다듬기
import re

for item in numpy_pull_requests.result:
    if item['category'] is None:
        continue
    new_categories = list()
    for category in item['category']:
        new_categories.append(re.sub(r'^[0-9]+\s-\s', '', category))
    item['category'] = new_categories

In [8]:
numpy_pull_requests.result[:3]

[{'issue_number': 17775,
  'title': 'BUG: Fixed file handle leak in array_tofile.',
  'category': ['Bug', 'Backport', 'component: numpy.core'],
  'is_merged': True},
 {'issue_number': 17774,
  'title': "BUG: fix np.timedelta64('nat').__format__ throwing an exception",
  'category': ['Bug', 'Backport', 'component: numpy.core'],
  'is_merged': True},
 {'issue_number': 17773,
  'title': 'MAINT: Add BLD and STY to labeler prefixes.',
  'category': ['Maintenance'],
  'is_merged': True}]

In [9]:
# component category, static typing, Py3K, Backport 제거
for item in numpy_pull_requests.result:
    if item['category'] is None:
        continue
    new_categories = list()
    for category in item['category']:
        is_component_label = re.compile('^component:').search(category)
        if is_component_label is not None:
            continue
        if category is 'static typing' or category is 'Py3K' or category is 'Backport':
            continue
        new_categories.append(category.strip())
    item['category'] = new_categories


In [10]:
numpy_pull_requests.result[:3]

[{'issue_number': 17775,
  'title': 'BUG: Fixed file handle leak in array_tofile.',
  'category': ['Bug', 'Backport'],
  'is_merged': True},
 {'issue_number': 17774,
  'title': "BUG: fix np.timedelta64('nat').__format__ throwing an exception",
  'category': ['Bug', 'Backport'],
  'is_merged': True},
 {'issue_number': 17773,
  'title': 'MAINT: Add BLD and STY to labeler prefixes.',
  'category': ['Maintenance'],
  'is_merged': True}]

In [11]:
# pull_requests_with_category.csv 로 저장
import os

file_name = 'pull_requests_with_category.csv'
file_path = './data/' + file_name
os.remove(file_path)

headers = ['issue_number', 'title', 'category', 'is_merged']

new_file = open(file_path, 'w', encoding='utf-8')
new_file.write(','.join(headers) + '\n')

for item in numpy_pull_requests.result:
    for header in item:
        line = str(item[header])
        if type(item[header]) == list:
            line = '' if item[header] is None else ','.join(item[header])
        line = '"' + line + '"'
        if header != headers[len(headers) - 1]:
            line += ','
        new_file.write(line)
    new_file.write('\n')

new_file.close()

In [12]:
# 정상적으로 csv 파일이 만들어졌는지 확인
import pandas as pd

csv_numpy_pull_requests = pd.read_csv(file_path)

csv_numpy_pull_requests.shape

(1000, 4)

In [14]:
csv_numpy_pull_requests.head()

Unnamed: 0,issue_number,title,category,is_merged
0,17775,BUG: Fixed file handle leak in array_tofile.,"Bug,Backport",True
1,17774,BUG: fix np.timedelta64('nat').__format__ thro...,"Bug,Backport",True
2,17773,MAINT: Add BLD and STY to labeler prefixes.,Maintenance,True
3,17768,MAINT: update link to website in FUNDING.yml,Maintenance,True
4,17763,"SIMD, BUG: fix reuses the previous values duri...",Bug,True
