In [1]:
import pandas as pd
import numpy as np
import requests
import os
from typing import List
import concurrent.futures

In [2]:
def get_csv_recursive(github_token:str, repo:str, path:str)->List[str]:
    """
    Recursively retrieve .csv download URLs from the specified GitHub repo
    and data path.

    :param str github_token: GitHub API token with public_repo download access.
    :param str repo: GH repo in the form gituser/repo
    :param str path: path within repo to the desired data folder, using
        forward slashes (path/to/data)

    :returns: A list of recursively retrieved download URLs for the csv
        files in the provided repo and folder path
    :rtype: List[str]
    :raises requests.HTTPError: if the GitHub API answers with an error
        status (bad token, unknown repo/path, rate limit, ...).
    """
    headers = {
        "Authorization": f"token {github_token}"
    }
    url = f"https://api.github.com/repos/{repo}/contents/{path}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly here: an error payload is a JSON object, and iterating it
    # below would otherwise surface as an unrelated TypeError.
    response.raise_for_status()
    response_data = response.json()

    # The contents endpoint returns a list for a directory but a single
    # object when ``path`` names a file; normalise to a list of entries.
    if isinstance(response_data, dict):
        entries = [response_data]
    else:
        entries = response_data

    csv_files = []
    for item in entries:
        if item["type"] == "file" and item["name"].endswith(".csv"):
            csv_files.append(item["download_url"])
        elif item["type"] == "dir":
            # Recursively get CSV files in subdirectories. Repo paths are
            # URL paths, so join with "/" rather than the OS separator.
            subfolder_path = "/".join(
                part for part in (path.rstrip("/"), item["name"]) if part
            )
            csv_files.extend(get_csv_recursive(github_token, repo, subfolder_path))
    return csv_files

def _local_path_for_url(csv_url:str, repo_path:str, save_dir:str) -> str:
    """
    Map a CSV download URL to a local file path underneath ``save_dir``.

    The part of the URL path starting at ``repo_path`` is mirrored below
    ``save_dir``; if ``repo_path`` does not occur in the URL only the file
    name is kept.

    :param str csv_url: Download URL of the CSV file.
    :param str repo_path: Forward-slash repo path the listing was made for.
    :param str save_dir: Local root directory for downloaded files.

    :returns: Local path for the file, using the OS path separator.
    :rtype: str
    """
    from urllib.parse import unquote, urlsplit

    # Work on the decoded URL *path* only (no scheme/host/query).
    url_path = unquote(urlsplit(csv_url).path)
    marker = f"/{repo_path.strip('/')}/"
    start = url_path.find(marker)
    if start != -1:
        relative = url_path[start:]
    else:
        relative = url_path.rsplit("/", 1)[-1]
    # Drop empty and dot segments so the result cannot leave save_dir.
    parts = [part for part in relative.split("/") if part not in ("", ".", "..")]
    return os.path.join(save_dir, *parts)


def download_fpl_data(github_token:str,
                      fpl_repo:str, 
                      season:str = "2021-22", 
                      save_dir:str = "fpl_data",
                      verbose:bool = False) -> None:
    """ 
    Download FPL data by player for the provided season.

    Files are written below ``save_dir`` mirroring their path in the repo,
    e.g. ``<save_dir>/data/<season>/players/<player>/gw.csv``. Missing
    directories are created. Files whose download does not return HTTP 200
    are skipped with a warning.

    :param str github_token: Github API token with public_repo download access.
    :param str fpl_repo: FPL data repo path in form gituser/repo. ``None``
        falls back to vaastav/Fantasy-Premier-League.
    :param str season: Which season of EPL data to download. Should 
        follow the format 20XX-X(X+1), with the earliest data available being
        2016-17.
    :param str save_dir: Directory to save data in.
    :param bool verbose: If True, print progress messages.

    :return: None
    :rtype: None
    """
    import warnings

    # Initialize headers with the personal access token.
    headers = {
        "Authorization": f"token {github_token}"
    }

    # Repo slugs and repo paths are URL components: always "/"-separated,
    # regardless of the local OS path separator.
    if fpl_repo is None:
        fpl_repo = "vaastav/Fantasy-Premier-League"

    path = f"data/{season}/players"
    
    if verbose:
        print("Retrieving list of CSV download URLs for requested repo folder.")
    csv_files = get_csv_recursive(github_token, fpl_repo, path)

    # Download and save CSV files locally.
    for csv_url in csv_files:
        response = requests.get(csv_url, headers=headers, timeout=30)
        if response.status_code != 200:
            # Best effort: keep going, but do not lose the failure silently.
            warnings.warn(
                f"Skipping {csv_url}: HTTP status {response.status_code}"
            )
            continue

        filename = _local_path_for_url(csv_url, path, save_dir)
        if verbose:
            print(f"Downloading CSV File: {filename}...")
        # open() does not create parent directories.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "wb") as f:
            f.write(response.content)

In [3]:
# Provide season in form 20XX-X(X+1).
# The repo slug is a URL component, so it is written with "/" directly
# rather than joined with the OS path separator.
FPL_REPO = "vaastav/Fantasy-Premier-League"
SEASON = "2021-22"
GH_KEY = os.getenv('GITHUB_TOKEN', None)
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# and an empty token is as unusable as a missing one.
if not GH_KEY:
    raise RuntimeError("The GITHUB_TOKEN environment variable must be set.")

download_fpl_data(GH_KEY, FPL_REPO, SEASON, "raw_data", verbose=True)

Retrieving list of CSV download URLs for requested repo folder.
