In [1]:
import requests
from requests.auth import HTTPBasicAuth
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
import pickle



import sys
from multiprocessing import Pool
import time

from bs4 import BeautifulSoup



In [2]:
# Request headers sent with every download: a desktop-browser style User-Agent.
# NOTE(review): the trailing "Cafari" token looks like a typo for "Safari";
# it is reproduced unchanged here — confirm before editing.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36"
    )
}


In [3]:
import logging

# Obtain the logger from the logging registry instead of instantiating
# ``logging.Logger`` directly: a hand-built Logger has no parent, so its
# records never reach handlers configured on the root logger (for example
# through ``logging.basicConfig``).
logger = logging.getLogger(__name__)

In [4]:
def get_page(l, max_retries=None, timeout=30):
    """Download URL ``l`` with the shared ``headers``, retrying until a usable response arrives.

    A response with status 429 is retried after sleeping for the number of
    seconds given in its ``Retry-After`` header (5 seconds when the header is
    missing or is not a whole number). A failed request (any
    ``requests.RequestException``, which includes timeouts) is retried after
    5 seconds. Any other response is returned as-is, whatever its status code.

    Parameters
    ----------
    l : str
        URL to download.
    max_retries : int or None, default None
        Number of retries allowed after the first attempt. ``None`` retries
        indefinitely.
    timeout : float, default 30
        Passed to ``requests.get`` as ``timeout`` so a stalled connection
        becomes a retryable error instead of blocking forever.

    Returns
    -------
    requests.Response
        The first response whose status code is not 429.

    Raises
    ------
    requests.RequestException
        Only when ``max_retries`` is set and exhausted: the last request
        error is re-raised, or an ``HTTPError`` is raised for a final 429.
    """
    fallback_delay = 5  # seconds to wait when the server gives no usable hint
    attempt = 0
    while True:
        out_of_retries = max_retries is not None and attempt >= max_retries
        attempt += 1

        try:
            page = requests.get(l, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Only request failures are retried; KeyboardInterrupt and
            # programming errors propagate so the loop can be stopped.
            if out_of_retries:
                raise
            logger.info("trying to download %s", l)
            time.sleep(fallback_delay)
            continue

        if page.status_code != 429:
            return page

        if out_of_retries:
            page.raise_for_status()  # raises HTTPError for the 429
            return page

        try:
            delay = int(page.headers["Retry-After"])
        except (KeyError, ValueError):
            # Header absent, or given in a non-integer form.
            delay = fallback_delay
        logger.info("retrying after %s", delay)
        time.sleep(max(delay, 0))

In [5]:
import re

In [6]:
def get_adp(year_start: int, year_end: int, bl) -> "list[pd.DataFrame]":
    """Scrape one table per season from ``year_start`` through ``year_end`` inclusive.

    For each season ``y`` the page at ``f"{bl}{y}"`` is fetched with
    ``get_page`` and the first HTML table on it is read into a DataFrame.

    Each frame gets these extra columns:

    * ``season`` - the season ``y`` (added as "Season", then normalised with
      the other headers).
    * ``pid`` - for every ``<tr>`` of the first ``<tbody>``, the last
      dash-separated piece of the last CSS class of the row's first ``<a>``,
      converted to ``int``.
    * ``scrape_datetime`` - ``pd.Timestamp.now()`` at parse time.

    All column names are lower-cased and have spaces removed.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to scrape (both included).
    bl : str
        URL prefix; the season is appended to it.

    Returns
    -------
    list of pd.DataFrame
        One frame per season, in ascending season order.

    Raises
    ------
    ValueError
        If the number of table rows found in the HTML differs from the
        number of rows pandas parsed (raised by the ``pid`` assignment).
    """
    dfs = []
    for y in tqdm(np.arange(year_start, year_end + 1)):
        logger.info("scraping %s", y)
        page = get_page(f"{bl}{y}")

        logger.info("parsing for %s...", y)
        df = pd.read_html(page.content)[0]
        df["Season"] = y

        # Normalise headers: strip spaces and lower-case ("Season" -> "season").
        df.columns = df.columns.str.replace(" ", "").str.lower()

        # No parser is named, so bs4 picks the best one installed.
        code = BeautifulSoup(page.content)
        df["pid"] = [
            int(tr.find_all("a")[0]["class"][-1].split("-")[-1])
            for tr in code.find("tbody").find_all("tr")
        ]

        # DataFrame.assign returns a new frame; rebind it so the timestamp
        # column is actually kept.
        df = df.assign(scrape_datetime=pd.Timestamp.now())
        dfs.append(df)

    return dfs

In [7]:
def get_projs(year_start: int, year_end: int, bl) -> "list[pd.DataFrame]":
    """Scrape one table per season from ``year_start`` through ``year_end`` inclusive.

    For each season ``y`` the page at ``f"{bl}{y}"`` is fetched with
    ``get_page`` and the first HTML table on it is read into a DataFrame.

    Column handling:

    * Two-level headers are flattened to ``"<level0>_<level1>"`` and
      lower-cased; ``"unnamed: 0_level_0_player"`` is renamed to ``"player"``.
    * Single-level headers are lower-cased.

    Each frame gets these extra columns:

    * ``Season`` - the season ``y`` (added after the lower-casing, so the
      capital "S" is kept).
    * ``pid`` - for every ``<tr>`` of the first ``<tbody>``, the last
      dash-separated piece of the last CSS class of the row's first ``<a>``,
      converted to ``int``.
    * ``scrape_datetime`` - ``pd.Timestamp.now()`` at parse time.

    Parameters
    ----------
    year_start, year_end : int
        First and last season to scrape (both included).
    bl : str
        URL prefix; the season is appended to it.

    Returns
    -------
    list of pd.DataFrame
        One frame per season, in ascending season order.

    Raises
    ------
    ValueError
        If the number of table rows found in the HTML differs from the
        number of rows pandas parsed (raised by the ``pid`` assignment).
    """
    dfs = []
    for y in tqdm(np.arange(year_start, year_end + 1)):
        logger.info("scraping %s", y)
        page = get_page(f"{bl}{y}")

        logger.info("parsing for %s...", y)
        df = pd.read_html(page.content)[0]

        if isinstance(df.columns, pd.MultiIndex):
            # Flatten the two header rows into single lower-case names.
            df.columns = ["_".join(col).lower() for col in df.columns]
            df = df.rename({"unnamed: 0_level_0_player": "player"}, axis=1)
        else:
            df.columns = df.columns.str.lower()

        df["Season"] = y

        # No parser is named, so bs4 picks the best one installed.
        code = BeautifulSoup(page.content)
        df["pid"] = [
            int(tr.find_all("a")[0]["class"][-1].split("-")[-1])
            for tr in code.find("tbody").find_all("tr")
        ]

        # DataFrame.assign returns a new frame; rebind it so the timestamp
        # column is actually kept.
        df = df.assign(scrape_datetime=pd.Timestamp.now())
        dfs.append(df)

    return dfs

In [122]:
# Scrape the "ppr-overall" ADP page for each season from 2019 through 2024;
# `adps` is the per-season collection returned by get_adp.
adps = get_adp(
    year_start=2019,
    year_end=2024,
    bl="https://www.fantasypros.com/nfl/adp/ppr-overall.php?year=",
)


100%|██████████| 6/6 [00:03<00:00,  1.96it/s]


In [79]:
# Stack the per-season frames into a single frame.
# The guard keeps this cell re-runnable: once `adps` is already a DataFrame,
# handing it to pd.concat again would raise a TypeError.
if not isinstance(adps, pd.DataFrame):
    adps = pd.concat(adps)

In [82]:
# Persist the combined ADP table; the row index is left out of the file.
adps.to_csv(path_or_buf="data/adp_data.csv", index=False)

In [165]:
# Season stats (2019-2024) for each offensive position, tagged with the
# position they were scraped for. Frames are collected first and concatenated
# once, instead of growing `stats` with a concat on every iteration.
frames = []
for pos in ["qb", "rb", "wr", "te"]:
    dat = pd.concat(
        get_projs(
            2019,
            2024,
            f"https://www.fantasypros.com/nfl/stats/{pos}.php?scoring=PPR&roster=e&range=full&year=",
        )
    )
    dat["pos"] = pos
    frames.append(dat)

stats = pd.concat(frames)



100%|██████████| 6/6 [00:08<00:00,  1.36s/it]
100%|██████████| 6/6 [00:09<00:00,  1.55s/it]
100%|██████████| 6/6 [00:11<00:00,  1.89s/it]
100%|██████████| 6/6 [00:10<00:00,  1.75s/it]


In [166]:
# Derive the offensive table from a copy of `stats`: give the player column
# its plain name and discard the rank column.
off_stats = (
    stats.copy()
    .rename(columns={"unnamed: 1_level_0_player": "player"})
    .drop(columns="unnamed: 0_level_0_rank")
)

In [164]:
# Write the offensive stats table to disk (the row index is written as well).
off_stats.to_csv(path_or_buf="data/offensive_stats.csv")

In [167]:
# Season stats (2019-2024) for the "k" and "dst" positions.
# The position label and the CSV export are applied to the per-position frame.
# Applying them to the accumulated frame would relabel the "k" rows as "dst"
# and write those rows into dst_stats.csv as well.
frames = []
for pos in ["k", "dst"]:
    dat = pd.concat(
        get_projs(
            2019,
            2024,
            f"https://www.fantasypros.com/nfl/stats/{pos}.php?scoring=PPR&roster=e&range=full&year=",
        )
    )
    dat["pos"] = pos

    # One file per position, holding only that position's rows.
    dat.to_csv(f"data/{pos}_stats.csv")
    frames.append(dat)

stats = pd.concat(frames)


  0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 6/6 [00:09<00:00,  1.55s/it]
100%|██████████| 6/6 [00:04<00:00,  1.28it/s]


In [168]:
# Draft projections (2019-2024) for each listed position, tagged with the
# position they were scraped for. Frames are collected first and concatenated
# once, instead of growing `stats` with a concat on every iteration.
# The URL (including its repeated "week=draft" parameter) is kept as written.
frames = []
for pos in ["qb", "rb", "wr", "te", "flex"]:
    dat = pd.concat(
        get_projs(
            2019,
            2024,
            f"https://www.fantasypros.com/nfl/projections/{pos}.php?week=draft&scoring=PPR&week=draft&year=",
        )
    )
    dat["pos"] = pos
    frames.append(dat)

stats = pd.concat(frames)


100%|██████████| 6/6 [00:02<00:00,  2.23it/s]
100%|██████████| 6/6 [00:04<00:00,  1.43it/s]
100%|██████████| 6/6 [00:05<00:00,  1.13it/s]
100%|██████████| 6/6 [00:03<00:00,  1.97it/s]
100%|██████████| 6/6 [00:08<00:00,  1.39s/it]


In [169]:
# Snapshot `stats` and export it as the offensive projections file
# (the row index is written as well). No columns are renamed or dropped here.
off_stats = stats.copy(deep=True)
off_stats.to_csv(path_or_buf="data/offensive_projections.csv")

In [8]:
# Draft projections for the "dst" position, 2024 season only.
pos = "dst"
dat = pd.concat(
    get_projs(
        year_start=2024,
        year_end=2024,
        bl=f"https://www.fantasypros.com/nfl/projections/{pos}.php?week=draft&scoring=PPR&week=draft&year=",
    )
)
dat["pos"] = pos

# Same result as appending to an initially empty frame.
stats = pd.concat([pd.DataFrame(), dat])


100%|██████████| 1/1 [00:00<00:00,  1.26it/s]


In [10]:
# Save the "dst" projections; the row index is left out of the file.
stats.to_csv(path_or_buf='data/dst_projections.csv', index=False)

In [11]:
# Draft projections for the "k" position, 2024 season only, written straight
# to disk without the row index.
pos = "k"
dat = pd.concat(
    get_projs(
        year_start=2024,
        year_end=2024,
        bl=f"https://www.fantasypros.com/nfl/projections/{pos}.php?week=draft&scoring=PPR&week=draft&year=",
    )
)
dat["pos"] = pos

# Same result as appending to an initially empty frame.
stats = pd.concat([pd.DataFrame(), dat])
stats.to_csv(path_or_buf='data/k_projections.csv', index=False)

100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
