In [1]:
import pandas as pd
import dotenv
import os
from ghapi.all import GhApi
dotenv.load_dotenv()
from tqdm import tqdm
from datetime import datetime, timedelta
import time

In [2]:

GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
GITHUB_USER_ID = os.getenv("GITHUB_USER_ID")
api = GhApi(owner=GITHUB_USER_ID, token=GITHUB_TOKEN)
repo_name = "TeamNewPipe/NewPipe"

In [3]:
query_string = f"is:issue is:open repo:{repo_name}"
#response = api.search.issues_and_pull_requests(q=query_string, sort="created", order="desc", per_page=100)
#print(response)

In [4]:
def save_csv(path, data):
    pd.DataFrame.from_records(data).to_csv(path, mode='a', header=not os.path.exists(path))

In [8]:
def request_to_github(month, next_month, page):
    query_string = f"is:issue is:closed created:{month}..{next_month} repo:{repo_name}"
    tries = 1
    while True:
        try:
            return api.search.issues_and_pull_requests(q=query_string, sort="created", order="desc", per_page=100, page=page)
        except Exception as e:
            print("Exception received, probably rate limiting")
            time.sleep(60 * 10) # 10 mins?
            print(f"Retry #{tries} for pull request from {next_month} to {month}")
            tries += 1

def perform_issue_ingestion(year: int, *, save_to="issues.csv"):
    start_date = datetime(year-1, 12, 31).isoformat().split('T')[0]
    end_date = datetime(year+1, 1, 1).isoformat().split('T')[0]
    months = [(datetime(year, i, 1)).isoformat().split('T')[0] for i in range(2, 13)]
    months = [start_date, *months]
    months.append(end_date)
    for idx, month in enumerate(months[:-1]):
        next_month = months[idx+1]
        page = 1
        response = request_to_github(month, next_month, page)
        ingested = len(response.get("items"))
        total_count = response.total_count
        if total_count < 1:
            print(f"no issues found from {month} to {next_month}")
            continue
        save_csv(save_to, response.get("items"))
        with tqdm(desc=f"Issues from {month} to {next_month}", total=total_count) as pbar:
            while(ingested < total_count):
                response = request_to_github(month, next_month, page)
                items = response.get("items")
                ingested += len(items)
                pbar.update(len(items))
                pbar.display()
                page += 1
                save_csv(save_to, response.get("items"))
    print("done")


In [9]:
for year in reversed(range(2010, 2020)):
    print(f"Ingesting closed issues from {year}")
    perform_issue_ingestion(year)

Ingesting closed issues from 2019


Issues from 2018-12-31 to 2019-02-01:   0%|          | 0/86 [00:00<?, ?it/s]
Issues from 2019-02-01 to 2019-03-01:   0%|          | 0/63 [00:00<?, ?it/s]
Issues from 2019-03-01 to 2019-04-01:   0%|          | 0/69 [00:00<?, ?it/s]
Issues from 2019-04-01 to 2019-05-01:   0%|          | 0/37 [00:00<?, ?it/s]
Issues from 2019-05-01 to 2019-06-01:   0%|          | 0/51 [00:00<?, ?it/s]
Issues from 2019-06-01 to 2019-07-01:   0%|          | 0/51 [00:00<?, ?it/s]
Issues from 2019-07-01 to 2019-08-01:   0%|          | 0/34 [00:00<?, ?it/s]
Issues from 2019-08-01 to 2019-09-01:   0%|          | 0/78 [00:00<?, ?it/s]
Issues from 2019-09-01 to 2019-10-01:   0%|          | 0/71 [00:00<?, ?it/s]
Issues from 2019-10-01 to 2019-11-01:   0%|          | 0/63 [00:00<?, ?it/s]
Issues from 2019-11-01 to 2019-12-01:   0%|          | 0/48 [00:00<?, ?it/s]
Issues from 2019-12-01 to 2020-01-01:   0%|          | 0/57 [00:00<?, ?it/s]


done
Ingesting closed issues from 2018


Issues from 2017-12-31 to 2018-02-01:   0%|          | 0/92 [00:00<?, ?it/s]
Issues from 2018-02-01 to 2018-03-01:   0%|          | 0/67 [00:00<?, ?it/s]
Issues from 2018-03-01 to 2018-04-01:   0%|          | 0/71 [00:00<?, ?it/s]
Issues from 2018-04-01 to 2018-05-01:   0%|          | 0/93 [00:00<?, ?it/s]
Issues from 2018-05-01 to 2018-06-01:   0%|          | 0/64 [00:00<?, ?it/s]
Issues from 2018-06-01 to 2018-07-01:   0%|          | 0/54 [00:00<?, ?it/s]
Issues from 2018-07-01 to 2018-08-01:   0%|          | 0/42 [00:00<?, ?it/s]
Issues from 2018-08-01 to 2018-09-01:   0%|          | 0/55 [00:00<?, ?it/s]
Issues from 2018-09-01 to 2018-10-01:  90%|█████████ | 100/111 [00:01<00:00, 54.60it/s]
Issues from 2018-10-01 to 2018-11-01:   0%|          | 0/55 [00:00<?, ?it/s]
Issues from 2018-11-01 to 2018-12-01:   0%|          | 0/41 [00:00<?, ?it/s]
Issues from 2018-12-01 to 2019-01-01:   0%|          | 0/46 [00:00<?, ?it/s]


done
Ingesting closed issues from 2017


Issues from 2016-12-31 to 2017-02-01:   0%|          | 0/33 [00:00<?, ?it/s]
Issues from 2017-02-01 to 2017-03-01:   0%|          | 0/25 [00:00<?, ?it/s]
Issues from 2017-03-01 to 2017-04-01:   0%|          | 0/16 [00:00<?, ?it/s]
Issues from 2017-04-01 to 2017-05-01:   0%|          | 0/27 [00:00<?, ?it/s]
Issues from 2017-05-01 to 2017-06-01:   0%|          | 0/38 [00:00<?, ?it/s]
Issues from 2017-06-01 to 2017-07-01:   0%|          | 0/15 [00:00<?, ?it/s]
Issues from 2017-07-01 to 2017-08-01:   0%|          | 0/25 [00:00<?, ?it/s]
Issues from 2017-08-01 to 2017-09-01:   0%|          | 0/17 [00:00<?, ?it/s]
Issues from 2017-09-01 to 2017-10-01:   0%|          | 0/58 [00:00<?, ?it/s]
Issues from 2017-10-01 to 2017-11-01:   0%|          | 0/40 [00:00<?, ?it/s]
Issues from 2017-11-01 to 2017-12-01:   0%|          | 0/61 [00:00<?, ?it/s]
Issues from 2017-12-01 to 2018-01-01:   0%|          | 0/51 [00:00<?, ?it/s]


done
Ingesting closed issues from 2016


Issues from 2015-12-31 to 2016-02-01:   0%|          | 0/20 [00:00<?, ?it/s]
Issues from 2016-02-01 to 2016-03-01:   0%|          | 0/53 [00:00<?, ?it/s]
Issues from 2016-03-01 to 2016-04-01:   0%|          | 0/30 [00:00<?, ?it/s]
Issues from 2016-04-01 to 2016-05-01:   0%|          | 0/24 [00:00<?, ?it/s]
Issues from 2016-05-01 to 2016-06-01:   0%|          | 0/15 [00:00<?, ?it/s]
Issues from 2016-06-01 to 2016-07-01:   0%|          | 0/13 [00:00<?, ?it/s]
Issues from 2016-07-01 to 2016-08-01:   0%|          | 0/9 [00:00<?, ?it/s]
Issues from 2016-08-01 to 2016-09-01:   0%|          | 0/13 [00:00<?, ?it/s]
Issues from 2016-09-01 to 2016-10-01:   0%|          | 0/6 [00:00<?, ?it/s]
Issues from 2016-10-01 to 2016-11-01:   0%|          | 0/8 [00:00<?, ?it/s]
Issues from 2016-11-01 to 2016-12-01:   0%|          | 0/8 [00:00<?, ?it/s]
Issues from 2016-12-01 to 2017-01-01:   0%|          | 0/7 [00:00<?, ?it/s]


done
Ingesting closed issues from 2015
no issues found from 2014-12-31 to 2015-02-01
Exception received, probably rate limiting
Retry #1 for pull request from 2015-03-01 to 2015-02-01
no issues found from 2015-02-01 to 2015-03-01
no issues found from 2015-03-01 to 2015-04-01
Exception received, probably rate limiting
Retry #1 for pull request from 2015-05-01 to 2015-04-01
no issues found from 2015-04-01 to 2015-05-01
no issues found from 2015-05-01 to 2015-06-01
no issues found from 2015-06-01 to 2015-07-01
no issues found from 2015-07-01 to 2015-08-01
no issues found from 2015-08-01 to 2015-09-01


Issues from 2015-09-01 to 2015-10-01:   0%|          | 0/34 [00:00<?, ?it/s]
Issues from 2015-10-01 to 2015-11-01:   0%|          | 0/13 [00:00<?, ?it/s]
Issues from 2015-11-01 to 2015-12-01:   0%|          | 0/22 [00:00<?, ?it/s]
Issues from 2015-12-01 to 2016-01-01:   0%|          | 0/14 [00:00<?, ?it/s]


done
Ingesting closed issues from 2014
no issues found from 2013-12-31 to 2014-02-01
no issues found from 2014-02-01 to 2014-03-01
no issues found from 2014-03-01 to 2014-04-01
no issues found from 2014-04-01 to 2014-05-01
no issues found from 2014-05-01 to 2014-06-01
no issues found from 2014-06-01 to 2014-07-01
no issues found from 2014-07-01 to 2014-08-01
no issues found from 2014-08-01 to 2014-09-01
no issues found from 2014-09-01 to 2014-10-01
no issues found from 2014-10-01 to 2014-11-01
no issues found from 2014-11-01 to 2014-12-01
no issues found from 2014-12-01 to 2015-01-01
done
Ingesting closed issues from 2013
no issues found from 2012-12-31 to 2013-02-01
no issues found from 2013-02-01 to 2013-03-01
no issues found from 2013-03-01 to 2013-04-01
no issues found from 2013-04-01 to 2013-05-01
no issues found from 2013-05-01 to 2013-06-01
no issues found from 2013-06-01 to 2013-07-01
no issues found from 2013-07-01 to 2013-08-01
no issues found from 2013-08-01 to 2013-09-01
no

In [9]:
query_string = "repo:TeamNewPipe/NewPipe"
response_dict = api.search.repos(q=query_string, sort="stars", order="desc", per_page=10, page=1)



Total repos:  1
Resultado incompleto:  False
Lista de repositórios encontrados:
- [NewPipe] ~> owner: TeamNewPipe, created: 2015-09-03T23:39:26Z, size: 63517, watchers: 22045, url: https://github.com/TeamNewPipe/NewPipe


In [11]:
df_issues = pd.read_csv("issues.csv")
df_issues.shape

(8673, 30)

In [12]:
df_issues_normal = df_issues.drop_duplicates("url")


In [35]:
import json
from ast import literal_eval
#arra = json.loads(df_issues_normal.loc[0]["labels"])
labels = df_issues_normal.loc[3]["labels"]
ll = literal_eval(labels)
print([x["name"] for x in ll])

['template missing', 'duplicate', 'waiting for author']


In [46]:

df_issues_normal.loc[df_issues_normal["labels"].str.contains("bug")].shape

(1776, 30)