In [None]:
import os
from github import Github
from github.Auth import Token
from tqdm.notebook import tqdm, trange
import requests
import datetime
import pandas as pd

from dotenv import load_dotenv
# Pull variables from the project-level .env file (one directory above the
# notebook) into the process environment before anything reads them.
load_dotenv('../.env')

# Fail fast with an actionable message: os.environ.get() returns None when the
# variable is missing, and that None would otherwise be handed straight to Token.
if not os.environ.get('GITHUB_TOKEN'):
    raise RuntimeError(
        "GITHUB_TOKEN is not set; define it in ../.env or export it in the "
        "environment before running this notebook"
    )

# Authenticated GitHub API client built from the personal access token.
github = Github(auth=Token(token=os.environ['GITHUB_TOKEN']))

In [None]:
!pip install google-cloud-bigquery db-dtypes

In [None]:
# Shells out to the gcloud CLI to log in.
# NOTE(review): this is an interactive command — presumably it needs a human to
# complete the login flow, so this cell will block under "Run All".
!gcloud auth login

In [None]:
# Shells out to the gcloud CLI to set up application-default credentials.
# NOTE(review): presumably these are the credentials the argument-less
# bigquery.Client() in this notebook picks up — confirm. Interactive as well.
!gcloud auth application-default login

In [None]:
from google.cloud import bigquery
# Shared BigQuery client used by get_unique_os_repos_from. It is constructed
# with no arguments, so project and credentials come from the client
# library's defaults rather than from anything set in this notebook.
bigquery_client = bigquery.Client()

## Build Open Source Repo Names

In [None]:
def get_unique_os_repos(start_date, num_days):
    """Collect distinct repo names from consecutive GitHub Archive day tables.

    Walks backwards one day at a time from ``start_date`` (inclusive) and, for
    each day, queries the table ``githubarchive.day.<YYYYMMDD>`` through
    ``get_unique_os_repos_from``.

    Args:
        start_date: Most recent day to load, as a ``'%Y%m%d'`` string
            (e.g. ``'20240416'``).
        num_days: How many consecutive days to load, counting back from
            ``start_date``. Zero or a negative number loads nothing.

    Returns:
        The per-day DataFrames concatenated in the order they were loaded.
        Names are distinct within a single day only, so a repo seen on several
        days appears once per day, and each day's row index is kept as
        returned. When ``num_days`` is not positive, an empty DataFrame with a
        single ``name`` column is returned.

    Raises:
        ValueError: If ``start_date`` does not match ``'%Y%m%d'``.
    """
    # Parse once, before any query is issued: the value is loop-invariant and a
    # malformed date should fail immediately rather than after the bar starts.
    first_day = datetime.datetime.strptime(start_date, '%Y%m%d')

    # pd.concat([]) raises "No objects to concatenate"; give callers an empty
    # frame with the expected column instead.
    if num_days <= 0:
        return pd.DataFrame(columns=['name'])

    dfs = []
    with trange(num_days) as pbar:
        for i in pbar:
            cur_date = (first_day - datetime.timedelta(days=i)).strftime('%Y%m%d')
            pbar.set_description(f'Loading for {cur_date}')
            db_name = f'githubarchive.day.{cur_date}'
            repos = get_unique_os_repos_from(db_name)
            print(f'Got {len(repos)} repos from {db_name}')
            dfs.append(repos)
    return pd.concat(dfs)

def get_unique_os_repos_from(db_name):
    """Return the distinct ``repo.name`` values stored in one BigQuery table.

    Args:
        db_name: Table identifier. It is interpolated verbatim between
            backticks in the SQL text (no escaping), so only pass trusted
            table names.

    Returns:
        The query result converted to a pandas DataFrame.
    """
    sql = f"SELECT distinct repo.name FROM `{db_name}`"
    # Runs on the notebook-level client; the result is materialised locally.
    return bigquery_client.query(sql).to_dataframe()
    

In [None]:
# Most recent window: 30 daily tables counting back from 2024-04-16 inclusive
# (i.e. 2024-03-18 through 2024-04-16). One BigQuery query is issued per day.
df_oss_repos_30 = get_unique_os_repos('20240416', 30)

In [None]:
# Earlier window: 60 daily tables counting back from 2024-03-17 inclusive
# (i.e. 2024-01-18 through 2024-03-17), ending the day before the 30-day
# window starts, so the two windows do not overlap.
df_oss_repos_60 = get_unique_os_repos('20240317', 60)

In [None]:
# Row counts of the two concatenated frames. Names are only de-duplicated
# within each day, so a repo active on several days is counted once per day;
# these are not unique-repo counts.
print(f'30 days: {len(df_oss_repos_30)}; 60 days: {len(df_oss_repos_60)}')

In [None]:
# Stack both windows into a single frame. Repeated names across days and
# windows are still present at this point.
df_repos = pd.concat([df_oss_repos_30, df_oss_repos_60])

In [None]:
# Total rows across both windows (repeated repo names included).
len(df_repos)

In [None]:
%%time
pd.Series(df_repos.name.unique()).to_csv('oss_repos.csv')