In [1]:
import sys

# Make the project root importable so that `src.*` modules resolve.
# Guarded so that re-running this cell does not keep appending duplicates.
if "/workspace" not in sys.path:
    sys.path.append("/workspace")

In [32]:
from src.download_commit_data import Config
from tqdm import tqdm

import os
import github
import typing
import jsonlines

import pandas as pd

In [27]:
class Parser:
    """Converts repository objects returned by PyGithub into plain dicts."""

    def parse_repo(self, repo: "github.Repository.Repository") -> typing.Dict[str, typing.Any]:
        """Extract the metadata fields of interest from a single repository.

        Args:
            repo: Repository object. Only attribute access is performed, so
                any object exposing the attributes read below works.

        Returns:
            A flat dictionary suitable for building a DataFrame row.
            ``commits`` is always ``-1`` here; it looks like a placeholder
            meant to be filled in by a later step (TODO confirm).
        """
        return {
            "full_name": repo.full_name,
            "language": repo.language,
            "commits": -1,
            "stargazers_count": repo.stargazers_count,
            "watchers_count": repo.watchers_count,
            "forks_count": repo.forks_count,
            "size": repo.size,
            "archived": repo.archived,
            "fork": repo.fork
        }

class RepoLoader:
    """Collects Python repositories from GitHub search, most-starred first.

    Each search query yields a bounded batch (1000 repositories per query in
    the recorded run), so the full listing is gathered by repeatedly searching
    with a decreasing ``followers:<N`` upper bound.
    """

    def __init__(self, parser: "Parser", min_followers: int = 0, client: typing.Any = None):
        """
        Args:
            parser: Object whose ``parse_repo`` turns a repository into a dict.
            min_followers: Paging stops once the threshold used for a query
                is at or below this value.
            client: Optional object exposing ``search_repositories``. When
                omitted, the module-level ``g`` client is looked up at call
                time (the original behaviour).
        """
        self.parser = parser
        self.min_followers = min_followers
        self.client = client

    @staticmethod
    def _build_query(followers: typing.Optional[int]) -> str:
        """Build the search query, bounded from above when a threshold is given."""
        # `is None` rather than truthiness: a threshold of 0 is a real bound
        # and must not silently fall back to the unfiltered query.
        if followers is None:
            return "language:python"
        return f"language:python followers:<{followers}"

    def search_popular_repos(self, followers: typing.Optional[int] = None) -> typing.List[typing.Dict[str, typing.Any]]:
        """Run one search and parse every repository it returns.

        Args:
            followers: Exclusive upper bound for the ``followers`` qualifier,
                or ``None`` for an unbounded search.

        Returns:
            Parsed repositories in the order returned by the search
            (sorted by stars, descending).
        """
        query: str = self._build_query(followers)
        client = self.client if self.client is not None else g
        pages = client.search_repositories(query=query, sort="stars", order="desc")
        return [self.parser.parse_repo(repo) for repo in tqdm(pages, total=pages.totalCount)]

    def search_all_popular_repos(self) -> typing.List[typing.Dict[str, typing.Any]]:
        """Page through search results until the follower threshold is reached.

        After every batch the star count of its last repository becomes the
        exclusive upper bound of the next query.

        Returns:
            All parsed repositories, in retrieval order. The list may contain
            repeated ``full_name`` entries, so deduplicate downstream.

        Note:
            Because the bound is exclusive, repositories sharing the star
            count of a batch's last entry but not included in that batch are
            presumably skipped (verify if completeness matters).
        """
        repos: typing.List[typing.Dict[str, typing.Any]] = []
        followers: typing.Optional[int] = None
        while True:
            batch = self.search_popular_repos(followers)
            # An empty batch means the search is exhausted. Checking the batch
            # (not the accumulated list) prevents looping forever on the same
            # threshold.
            if not batch:
                return repos
            repos.extend(batch)
            if followers is not None and followers <= self.min_followers:
                return repos
            followers = batch[-1]["stargazers_count"]

In [28]:
# Directory under which the repository metadata files are kept.
output_path = "/workspace/data/repositories/"

# The GitHub token is read from the YAML config rather than hard-coded here.
config = Config(filename="/workspace/config/github.yml")

# Module-level API client; RepoLoader.search_popular_repos looks up `g` when it runs.
g = github.Github(login_or_token=config.token, per_page=100)

In [29]:
# Loader that keeps paging until the follower threshold used for a query is <= 100.
loader = RepoLoader(parser=Parser(), min_followers=100)

In [30]:
# Long-running: every batch of 1000 repositories took roughly 45 s in the recorded run.
top_k_repos = loader.search_all_popular_repos()

100%|██████████| 1000/1000 [00:47<00:00, 21.28it/s]
100%|██████████| 1000/1000 [00:46<00:00, 21.51it/s]
100%|██████████| 1000/1000 [00:44<00:00, 22.60it/s]
100%|██████████| 1000/1000 [00:44<00:00, 22.24it/s]
100%|██████████| 1000/1000 [00:47<00:00, 20.99it/s]
100%|██████████| 1000/1000 [00:43<00:00, 22.85it/s]
100%|██████████| 1000/1000 [00:44<00:00, 22.45it/s]
100%|██████████| 1000/1000 [00:46<00:00, 21.34it/s]
100%|██████████| 1000/1000 [00:47<00:00, 20.89it/s]
100%|██████████| 1000/1000 [00:43<00:00, 22.93it/s]
100%|██████████| 1000/1000 [00:45<00:00, 22.22it/s]
100%|██████████| 1000/1000 [00:46<00:00, 21.44it/s]
100%|██████████| 1000/1000 [00:45<00:00, 22.14it/s]
100%|██████████| 1000/1000 [00:48<00:00, 20.49it/s]
100%|██████████| 1000/1000 [00:49<00:00, 20.24it/s]
100%|██████████| 1000/1000 [00:45<00:00, 22.10it/s]
100%|██████████| 1000/1000 [00:45<00:00, 21.99it/s]
100%|██████████| 1000/1000 [00:45<00:00, 21.78it/s]
100%|██████████| 1000/1000 [00:45<00:00, 22.15it/s]
100%|███████

In [33]:
# One row per parsed repository; columns come from the dict keys.
df = pd.DataFrame(data=top_k_repos)

In [34]:
# Preview the ten most-starred repositories.
df.head(10)

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
0,donnemartin/system-design-primer,Python,-1,80884,80884,13476,4337,False,False
1,vinta/awesome-python,Python,-1,78356,78356,15408,5438,False,False
2,public-apis/public-apis,Python,-1,69093,69093,7802,2549,False,False
3,TheAlgorithms/Python,Python,-1,65656,65656,19814,9394,False,False
4,tensorflow/models,Python,-1,61146,61146,38883,524409,False,False
5,ytdl-org/youtube-dl,Python,-1,60460,60460,10492,57303,False,False
6,nvbn/thefuck,Python,-1,51569,51569,2584,2801,False,False
7,pallets/flask,Python,-1,48475,48475,13187,7536,False,False
8,django/django,Python,-1,46632,46632,20103,193755,False,False
9,keras-team/keras,Python,-1,46467,46467,17601,13334,False,False


In [41]:
# Rows whose full_name already appeared earlier in the frame.
df.loc[df.duplicated(subset="full_name")]

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
300,nate-parrott/Flashlight,Python,-1,5410,5410,443,90051,False,False
301,samuelclay/NewsBlur,Python,-1,5403,5403,929,525726,False,False
305,MobSF/Mobile-Security-Framework-MobSF,Python,-1,5346,5346,1521,384919,False,False
306,flask-restful/flask-restful,Python,-1,5328,5328,851,1021,False,False
307,instabot-py/instabot.py,Python,-1,5327,5327,2250,801,False,False
...,...,...,...,...,...,...,...,...,...
20830,drov0/python-imagesearch,Python,-1,92,92,45,279,False,False
20831,archerhu/scel2mmseg,Python,-1,92,92,66,89,False,False
20832,kutoga/going_deeper,Python,-1,92,92,3,2505,False,False
20833,ziweipolaris/watermark-removal,Python,-1,92,92,25,3411,False,False


In [42]:
# Inspect every occurrence of one repository that was returned more than once.
df.loc[df["full_name"].eq("nate-parrott/Flashlight")]

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
278,nate-parrott/Flashlight,Python,-1,5410,5410,443,90051,False,False
300,nate-parrott/Flashlight,Python,-1,5410,5410,443,90051,False,False


In [44]:
# Keep only the first occurrence of each repository name.
df_deduplicated = df.drop_duplicates(subset="full_name", keep="first")

In [52]:
# Write the deduplicated repositories as JSON Lines (one record per line).
# The target is built from `output_path` so the configured directory is the
# single source of truth, and the directory is created if it is missing.
os.makedirs(output_path, exist_ok=True)
df_deduplicated.to_json(os.path.join(output_path, "top_18k.jsonl"), orient="records", lines=True)

In [57]:
def name_to_url(name: str) -> str:
    """Return the HTTPS clone URL for a repository given as ``owner/repo``."""
    return "https://github.com/{}.git".format(name)

In [62]:
# Clone URLs of the 100 smallest repositories (by `size`), one per line.
repo_urls = "\n".join(
    map(name_to_url, df_deduplicated.sort_values(by="size").head(100)["full_name"])
)
with open("/workspace/tmp/code2ast_large/repo_list.txt", "w") as file:
    file.write(repo_urls)