In [None]:
import os
from github import Github
from github.Auth import Token
from tqdm.notebook import tqdm, trange
import requests
import datetime
import pandas as pd

from dotenv import load_dotenv
# Load secrets (GITHUB_TOKEN) from the .env file one directory up instead of
# hardcoding them in the notebook.
load_dotenv('../.env')

# Fail fast with an actionable message when the token is missing: otherwise
# None would be handed straight to Token(...), and the failure would surface
# without any hint about what to fix.
_github_token = os.environ.get('GITHUB_TOKEN')
if not _github_token:
    raise RuntimeError(
        'GITHUB_TOKEN is not set; add it to ../.env or export it before running this notebook'
    )

github = Github(auth=Token(token=_github_token))

In [None]:
!pip install google-cloud-bigquery db-dtypes

In [None]:
# Interactive step: signs the gcloud CLI in to a Google account.
!gcloud auth login

In [None]:
# Interactive step: creates Application Default Credentials. Presumably these
# are what `bigquery.Client()` below authenticates with — confirm.
!gcloud auth application-default login

In [None]:
from google.cloud import bigquery
# Shared client used by every query in this notebook. It is built with no
# explicit project or credentials, so it presumably relies on the
# `gcloud auth application-default login` cell above — confirm.
bigquery_client = bigquery.Client()

## Build Open Source Repo Names

In [22]:
def get_unique_os_repos(start_date, num_days):
    """Load the per-day distinct repo names for ``num_days`` consecutive days.

    Walks backwards one day at a time from ``start_date`` (inclusive), queries
    the ``githubarchive.day.<YYYYMMDD>`` table for each day through
    ``get_unique_os_repos_from`` and stacks the per-day frames.

    Names are distinct only *within* a day: a repo that shows up on several
    days appears once per day in the result, and each per-day frame keeps its
    own row index (so index labels repeat). Deduplicate downstream when a
    truly unique list is needed.

    Args:
        start_date: Most recent day to load, as a ``'%Y%m%d'`` string
            (for example ``'20240416'``).
        num_days: How many days to load, counting ``start_date`` itself.

    Returns:
        The concatenation of the per-day DataFrames. When ``num_days`` is zero
        or negative, an empty DataFrame with a single ``name`` column (the
        column the rest of the notebook reads).

    Raises:
        ValueError: If ``start_date`` does not match ``'%Y%m%d'``.
    """
    # Parse once up front: the value is loop-invariant, and a malformed date
    # should fail before any query is issued.
    first_day = datetime.datetime.strptime(start_date, '%Y%m%d')

    # pd.concat([]) raises ValueError, so an empty window is answered directly.
    if num_days <= 0:
        return pd.DataFrame(columns=['name'])

    dfs = []
    with trange(num_days) as pbar:
        for i in pbar:
            cur_date = (first_day - datetime.timedelta(days=i)).strftime('%Y%m%d')
            pbar.set_description(f'Loading for {cur_date}')
            db_name = f'githubarchive.day.{cur_date}'
            repos = get_unique_os_repos_from(db_name)
            print(f'Got {len(repos)} repos from {db_name}')
            dfs.append(repos)
    return pd.concat(dfs)

def get_unique_os_repos_from(db_name):
    """Return the distinct ``repo.name`` values of one table as a DataFrame.

    Args:
        db_name: Fully qualified table name; it is interpolated between
            backticks into the query text as-is, so it must come from
            trusted code.

    Returns:
        The result of the query, materialised via ``to_dataframe()``.
    """
    sql = f"SELECT distinct repo.name FROM `{db_name}`"
    # Submitted through the module-level `bigquery_client`.
    return bigquery_client.query(sql).to_dataframe()
    

In [28]:
# 30 daily tables counting back from 2024-04-16 inclusive,
# i.e. 2024-03-18 .. 2024-04-16.
df_oss_repos_30 = get_unique_os_repos('20240416', 30)

  0%|          | 0/30 [00:00<?, ?it/s]

Got 949954 repos from githubarchive.day.20240416
Got 954671 repos from githubarchive.day.20240415
Got 688458 repos from githubarchive.day.20240414
Got 666693 repos from githubarchive.day.20240413
Got 863177 repos from githubarchive.day.20240412
Got 899095 repos from githubarchive.day.20240411
Got 888282 repos from githubarchive.day.20240410
Got 915528 repos from githubarchive.day.20240409
Got 917833 repos from githubarchive.day.20240408
Got 716793 repos from githubarchive.day.20240407
Got 675982 repos from githubarchive.day.20240406
Got 853263 repos from githubarchive.day.20240405
Got 917006 repos from githubarchive.day.20240404
Got 939921 repos from githubarchive.day.20240403
Got 940064 repos from githubarchive.day.20240402
Got 897242 repos from githubarchive.day.20240401
Got 663143 repos from githubarchive.day.20240331
Got 667819 repos from githubarchive.day.20240330
Got 814805 repos from githubarchive.day.20240329
Got 907999 repos from githubarchive.day.20240328
Got 933472 repos fro

In [33]:
# 60 daily tables counting back from 2024-03-17 inclusive,
# i.e. 2024-01-18 .. 2024-03-17.
# NOTE: despite the name this is not "the last 60 days": it is the 60 days
# immediately before the df_oss_repos_30 window and does not overlap with it,
# so the two frames together span 90 consecutive days.
df_oss_repos_60 = get_unique_os_repos('20240317', 60)

  0%|          | 0/60 [00:00<?, ?it/s]

Got 718689 repos from githubarchive.day.20240317
Got 718277 repos from githubarchive.day.20240316
Got 902128 repos from githubarchive.day.20240315
Got 926071 repos from githubarchive.day.20240314
Got 946707 repos from githubarchive.day.20240313
Got 975366 repos from githubarchive.day.20240312
Got 940936 repos from githubarchive.day.20240311
Got 704679 repos from githubarchive.day.20240310
Got 687659 repos from githubarchive.day.20240309
Got 856612 repos from githubarchive.day.20240308
Got 921919 repos from githubarchive.day.20240307
Got 948022 repos from githubarchive.day.20240306
Got 949512 repos from githubarchive.day.20240305
Got 952063 repos from githubarchive.day.20240304
Got 708460 repos from githubarchive.day.20240303
Got 707478 repos from githubarchive.day.20240302
Got 886594 repos from githubarchive.day.20240301
Got 942048 repos from githubarchive.day.20240229
Got 943516 repos from githubarchive.day.20240228
Got 957320 repos from githubarchive.day.20240227
Got 920551 repos fro

In [34]:
# These are row counts (per-day distinct names summed over each window),
# not counts of unique repositories.
print(f'30 days: {len(df_oss_repos_30)}; 60 days: {len(df_oss_repos_60)}')

30 days: 25698474; 60 days: 49812024


In [35]:
# Stack both windows into one frame. Plain concat: nothing is deduplicated
# here and the original row indexes are kept, so index labels repeat.
df_repos = pd.concat([df_oss_repos_30, df_oss_repos_60])

In [36]:
# Sanity check: equals the sum of the two window sizes, since concat does
# not drop repeated names.
len(df_repos)

75510498

In [38]:
%%time
# Deduplicate repo names across all loaded days and write them to
# oss_repos.csv in the current working directory.
# to_csv is called with its defaults, so the Series index is written out as
# an extra column alongside the names.
pd.Series(df_repos.name.unique()).to_csv('oss_repos.csv')

CPU times: user 35.5 s, sys: 2.21 s, total: 37.7 s
Wall time: 38.3 s
