In [2]:
import pandas as pd
import numpy as np
import requests
import os
from typing import List
import concurrent.futures

In [10]:
def get_csv_recursive(github_token:str, repo:str, path:str)->List[str]:
        """ 
        Recursively retrieve .csv downlaod paths from the specified github repo 
        and data path.

        :param str github_token: Github API token with public_repo download access.
        :param str repo: GH repo in the form gituser/repo
        :param str path: path within repo to desired data folder /path/to/data

        :returns: A list of recursively retrieved download urls for the csv 
            files in the provided repo and folder path
        :rtype: List[str]
        """
        headers = {
            "Authorization": f"token {github_token}"
        }
        url = f"https://api.github.com/repos/{repo}/contents/{path}"
        response = requests.get(url, headers=headers)
        response_data = response.json()

        csv_files = []
        for item in response_data:
            if item["type"] == "file" and item["name"].endswith(".csv"):
                csv_files.append(item["download_url"])
            elif item["type"] == "dir":
                # Recursively get CSV files in subdirectories.
                subfolder_path = os.path.join(path, item["name"])
                csv_files.extend(get_csv_recursive(github_token, repo, subfolder_path))
        return csv_files

def identify_fpl_data(github_token:str,
                      fpl_repo:str, 
                      season:str = "2021-22", 
                      verbose:bool = False) -> List[str]:
    """ 
    Identify FPL data by player for the provided season.

    :param str github_token: Github API token with public_repo download access.
    :param str fpl_repo: FPL data repo path in form gituser/repo
    :param str seasont: Which season of EPL data to download. Should 
        follow the format 20XX-X(X+1), with the earliest data available being
        2016-17.

    :return: List of CSV files to download from desired repo section.
    :rtype: List[str]
    """
   
    # Initialize headers with the personal access token.
    headers = {
        "Authorization": f"token {github_token}"
    }

    if fpl_repo is None:
        fpl_repo = os.path.join("vaastav", "Fantasy-Premier-League")

    data_folder_path = os.path.join("data")
    season_path = os.path.join(f"{season}", "players") 
    path = os.path.join(data_folder_path, season_path)
    
    if verbose:
        print("Retrieving list of CSV download URLs for requested repo folder.")
    csv_urls = get_csv_recursive(github_token, fpl_repo, path)
    print("CSVS Identified.")

    return csv_urls

def download_fpl_data(github_token:str,
                      csv_urls:List[str],
                      save_dir:str = "raw_data",
                      verbose:bool = False) -> None:
    """ 
    Download FPL data by player for the provided season.

    :param str github_token: Github API token with public_repo download access.
    :param List[str] csv_urls: csv urls to download
    :param str save_dir: Directory to save data in.
    

    :return: None
    :rtype: None
    """
    # Initialize headers with the personal access token.
    headers = {
        "Authorization": f"token {github_token}"
    }

    # Download and save CSV files locally.
    for csv_url in csv_urls:
        response = requests.get(csv_url, headers=headers)
        
        if response.status_code == 200:
            #construct filename to write to
            dirs = os.path.dirname(csv_url)
            split_dirs = dirs.split(os.path.sep)
            year_dir = split_dirs[-3]
            players_dir = split_dirs[-2]
            name_dir = split_dirs[-1]
            write_dirs = os.path.join(os.path.abspath(os.path.curdir), 
                                    save_dir, 
                                    year_dir,
                                    players_dir,
                                    name_dir)

            player_filename = os.path.basename(csv_url)

            filename = os.path.join(write_dirs,
                                    player_filename)
            
            #add directories if needed
            if not os.path.exists(write_dirs):
                os.makedirs(write_dirs)

            #write the data
            if verbose:
                print(f"Downloading CSV File: {filename}...")
            with open(filename, "wb") as f:
                f.write(response.content)

    return

In [4]:
#provide season in form 20XX-X(X+1)
FPL_REPO = os.path.join("vaastav", "Fantasy-Premier-League")
SEASON = "2021-22"
GH_KEY = os.getenv('GITHUB_TOKEN', None)
assert GH_KEY is not None

csv_urls_2021 = identify_fpl_data(GH_KEY, FPL_REPO, SEASON, verbose=True)


Retrieving list of CSV download URLs for requested repo folder.
CSVS Identified.


In [5]:
print(csv_urls_2021)

['https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Connolly_72/gw.csv', 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Connolly_72/history.csv', 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Cresswell_411/gw.csv', 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Cresswell_411/history.csv', 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Lennon_575/gw.csv', 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Lennon_575/history.csv', 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Ramsdale_559/gw.csv', 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2021-22/players/Aaron_Ramsdale_559/history.csv', 'https:

In [11]:
download_fpl_data(GH_KEY, csv_urls_2021, "raw_data", verbose=True)

Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/players/Aaron_Connolly_72/gw.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/players/Aaron_Connolly_72/history.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/players/Aaron_Cresswell_411/gw.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/players/Aaron_Cresswell_411/history.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/players/Aaron_Lennon_575/gw.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/players/Aaron_Lennon_575/history.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/players/Aaron_Ramsdale_559/gw.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2021-22/pl

In [12]:
#provide season in form 20XX-X(X+1)
FPL_REPO = os.path.join("vaastav", "Fantasy-Premier-League")
SEASON = "2020-21"
GH_KEY = os.getenv('GITHUB_TOKEN', None)
assert GH_KEY is not None

csv_urls_2020 = identify_fpl_data(GH_KEY, FPL_REPO, SEASON, verbose=True)
download_fpl_data(GH_KEY, csv_urls_2020, "raw_data", verbose=True)

Retrieving list of CSV download URLs for requested repo folder.
CSVS Identified.
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2020-21/players/Aaron_Connolly_78/gw.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2020-21/players/Aaron_Connolly_78/history.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2020-21/players/Aaron_Cresswell_435/gw.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2020-21/players/Aaron_Cresswell_435/history.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2020-21/players/Aaron_Mooy_60/gw.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2020-21/players/Aaron_Mooy_60/history.csv...
Downloading CSV File: /Users/danielfrees/Desktop/mlpremier/mlpremier/data/raw_data/2020-21/players/Aaron_Ramsdale_483/gw.csv...
Downloading CSV Fi