In [4]:
import sys
# Put the parent directory on the import path. Presumably this is what lets
# the `src` package imported in the next cell resolve — confirm against the
# project layout.
sys.path.append("../")

In [152]:
from src.download_commit_data import Config
from tqdm import tqdm_notebook

import os
import github
import typing
import jsonlines

In [163]:
# Config is pointed at a YAML file; its `token` attribute is the credential
# handed to the GitHub client below.
config = Config(filename="../config/github.yml")
# Directory that `save` creates (if needed) and writes its output files into.
output_dir = "../data/repositories/"

In [164]:
# Module-level GitHub client, authenticated with the configured token and
# asking for 100 items per page. RepoLoader reads this global directly.
g = github.Github(login_or_token=config.token, per_page=100)

In [131]:
class Parser:
    """Flattens PyGithub repository objects into plain, serialisable dictionaries."""

    def parse_repo(self, repo: "github.Repository.Repository") -> typing.Dict[str, typing.Any]:
        """Extract the fields of interest from a single repository.

        Args:
            repo: Repository object exposing ``full_name``, ``language``,
                ``stargazers_count``, ``watchers_count``, ``forks_count``,
                ``size``, ``archived``, ``fork`` and ``get_commits()``.
                (The annotation is a string so it names the class inside the
                ``github.Repository`` module rather than the module itself.)

        Returns:
            A dictionary with one entry per extracted field; ``"commits"`` holds
            the total number of commits reported by ``repo.get_commits()``.
        """
        # NOTE(review): with PyGithub, reading `totalCount` is expected to cost
        # an extra API request per repository — keep rate limits in mind.
        commit_count = repo.get_commits().totalCount
        return {
            "full_name": repo.full_name,
            "language": repo.language,
            "commits": commit_count,
            "stargazers_count": repo.stargazers_count,
            "watchers_count": repo.watchers_count,
            "forks_count": repo.forks_count,
            "size": repo.size,
            "archived": repo.archived,
            "fork": repo.fork,
        }

class RepoLoader:
    """Collects popular Python repositories from the GitHub search API.

    A single search is sorted by stars in descending order. To go further down
    the ranking, ``search_all_popular_repos`` repeats the search with a
    ``followers:<N`` filter, where ``N`` is the star count of the last
    repository fetched so far.

    The searches are issued through the module-level client ``g``.
    """

    def __init__(self, parser: "Parser", min_followers: int = 0):
        """
        Args:
            parser: Object whose ``parse_repo(repo)`` turns a repository into a dict.
            min_followers: Paging in ``search_all_popular_repos`` stops once the
                cursor used for a search is at or below this value.
        """
        self.parser = parser
        self.min_followers = min_followers

    @staticmethod
    def _build_query(followers: typing.Optional[int] = None) -> str:
        """Return the search query, restricted to repos below ``followers`` if given."""
        # Compare against None rather than relying on truthiness: a cursor of 0
        # must still produce a filter instead of silently restarting from the top.
        if followers is None:
            return "language:python"
        return f"language:python followers:<{followers}"

    def search_popular_repos(
        self, followers: typing.Optional[int] = None
    ) -> typing.List[typing.Dict[str, typing.Any]]:
        """Run one search and parse every repository it yields.

        Args:
            followers: Exclusive upper bound for the ``followers`` qualifier, or
                ``None`` for an unfiltered search.

        Returns:
            Parsed repositories, most-starred first.
        """
        query = self._build_query(followers)
        pages = g.search_repositories(query=query, sort="stars", order="desc")
        return [
            self.parser.parse_repo(repo)
            for repo in tqdm_notebook(pages, total=pages.totalCount)
        ]

    def search_all_popular_repos(self) -> typing.List[typing.Dict[str, typing.Any]]:
        """Page through successive searches and return all parsed repositories.

        Paging stops when a search comes back empty, when the cursor used for
        the search just performed is at or below ``min_followers``, or when the
        cursor would not move strictly downwards.

        Returns:
            All parsed repositories in the order they were fetched.
        """
        repos: typing.List[typing.Dict[str, typing.Any]] = []
        followers: typing.Optional[int] = None
        while True:
            batch = self.search_popular_repos(followers)
            # Test the new batch, not the accumulated list: once something has
            # been collected, an empty batch is the only sign we ran out.
            if not batch:
                return repos
            repos += batch
            if followers is not None and followers <= self.min_followers:
                return repos
            next_followers = batch[-1]["stargazers_count"]
            # Guard against a cursor that fails to decrease, which would make
            # the next search repeat this one indefinitely.
            if followers is not None and next_followers >= followers:
                return repos
            followers = next_followers

In [132]:
# min_followers is only consulted by search_all_popular_repos; the single
# searches below are unaffected by it.
loader = RepoLoader(parser=Parser(), min_followers=0)

In [133]:
# First batch: unfiltered search, most-starred repositories first
# (the progress bar below reports 1000 results).
top1000 = loader.search_popular_repos()

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [151]:
# Second batch: repositories matching followers:<2127. Despite the name, this
# holds only the next batch (the progress bar below reports 1000 results), not
# 2000 repositories.
# NOTE(review): 2127 is hard-coded; presumably it was read off the tail of the
# first batch at the time — confirm, or derive it from top1000[-1].
top2000 = loader.search_popular_repos(followers=2127)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [153]:
# Last record of the first batch, i.e. the lowest-ranked one in the
# stars-descending ordering.
top1000[-1]

{'full_name': 'freqtrade/freqtrade',
 'language': 'Python',
 'commits': 6703,
 'stargazers_count': 2112,
 'watchers_count': 2112,
 'forks_count': 665,
 'size': 38585,
 'archived': False,
 'fork': False}

In [156]:
# First (highest-ranked) record of the second batch.
# NOTE(review): the star count shown below (2126) is higher than the tail of
# the first batch (2112), so the two batches may overlap — check for duplicates.
top2000[0]

{'full_name': 'rochacbruno-archive/quokka',
 'language': 'Python',
 'commits': 117,
 'stargazers_count': 2126,
 'watchers_count': 2126,
 'forks_count': 452,
 'size': 11550,
 'archived': False,
 'fork': False}

In [165]:
def save(filename: str, data: typing.List[typing.Dict[str, typing.Any]]) -> None:
    """Write ``data`` to ``output_dir/filename`` in JSON Lines format.

    Args:
        filename: Name of the file to create inside the module-level ``output_dir``.
        data: Records to store; each one becomes its own line in the file.

    The output directory is created if it does not exist, and an existing file
    with the same name is overwritten.
    """
    filepath = os.path.join(output_dir, filename)
    os.makedirs(output_dir, exist_ok=True)
    with jsonlines.open(filepath, mode='w') as writer:
        # write_all emits one JSON document per record. write(data) would dump
        # the whole list as a single array line, which is not a usable .jsonl
        # file for line-by-line readers.
        writer.write_all(data)

In [167]:
# Persist the first batch as top1000_page1.jsonl under output_dir.
save(filename="top1000_page1.jsonl", data=top1000)

In [168]:
# Persist the second batch as top1000_page2.jsonl under output_dir.
save(filename="top1000_page2.jsonl", data=top2000)