In [102]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [103]:
ITER_YEARS = [
    [2014, False],
    [2015, False],
    [2016, True],
    [2017, False],
    [2018, False],
    [2019, False],
    [2020, True]
]

ITER_MONTHS = [
    [1, 31],
    [2, 28, 29],
    [3, 31],
    [4, 30],
    [5, 31],
    [6, 30],
    [7, 31],
    [8, 31],
    [9, 30],
    [10, 31],
    [11, 30],
    [12, 31]
]

In [104]:
# Get the url data
def scrape_data(mon, day, year):
    url = "http://rotoguru1.com/cgi-bin/hyday.pl?game=dk&mon=%d&day=%d&year=%d" % (mon, day, year)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    player_data = parse_data(soup)
    if len(player_data) > 0:
        df = pd.DataFrame(player_data, columns=['Position', 'Name', 'Points', 'Salary', 'Team', 'Opponent', 'Location'])
        df.to_csv('data/salaries/DKSalaries_%d_%02d_%02d.csv' % (year, mon, day), index=False)

In [105]:
# Parse the text
def parse_data(soup):
    positions = ['C', 'PF', 'SF', 'SG', 'PG', 'G', 'F', 'NA']
    player_data = []
    record = False
    lines = soup.find_all('tr')
    for line in lines:
        vals = line.find_all('td')
        if vals[0].string:
            pos = vals[0].string.split('/')[0]
            if pos in positions:
                # Get the position
                p0 = vals[0].string
                # Get the name
                p1 = vals[1].string if vals[1].string else vals[1].find_all('a')[0].string
                # Get the points
                p2 = vals[2].string
                # Get the salary
                p3 = vals[3].string.replace(',', '').replace('$', '')
                p4, p5, p6 = None, None, None
                try:
                    # Get the team
                    p4 = vals[4].string.upper()
                    # Get the opponent
                    p5 = vals[5].string.replace('v ', '').replace('@ ', '').upper()
                    # Get home/away
                    p6 = 'H' if vals[5].string == 'v' else 'A'
                except:
                    print("Issue parsing the team data")
                data = [p0, p1, p2, p3, p4, p5, p6]
                player_data.append(data)
    return player_data

In [106]:
for year in ITER_YEARS:
    for month in ITER_MONTHS:
        tot_days = month[2] if (year[1] and month[0] == 2) else month[1]
        for day in range(tot_days):
            print('Scraping Data for: %d-%d-%d' % (month[0], day, year[0]))
            scrape_data(month[0], day, year[0])

Scraping Data for: 1-0-2014
Scraping Data for: 1-1-2014
Scraping Data for: 1-2-2014
Scraping Data for: 1-3-2014
Scraping Data for: 1-4-2014
Scraping Data for: 1-5-2014
Scraping Data for: 1-6-2014
Scraping Data for: 1-7-2014
Scraping Data for: 1-8-2014
Scraping Data for: 1-9-2014
Scraping Data for: 1-10-2014
Scraping Data for: 1-11-2014
Scraping Data for: 1-12-2014
Scraping Data for: 1-13-2014
Scraping Data for: 1-14-2014
Scraping Data for: 1-15-2014
Scraping Data for: 1-16-2014
Scraping Data for: 1-17-2014
Scraping Data for: 1-18-2014
Scraping Data for: 1-19-2014
Scraping Data for: 1-20-2014
Scraping Data for: 1-21-2014
Scraping Data for: 1-22-2014
Scraping Data for: 1-23-2014
Scraping Data for: 1-24-2014
Scraping Data for: 1-25-2014
Scraping Data for: 1-26-2014
Scraping Data for: 1-27-2014
Scraping Data for: 1-28-2014
Scraping Data for: 1-29-2014
Scraping Data for: 1-30-2014
Scraping Data for: 2-0-2014
Scraping Data for: 2-1-2014
Scraping Data for: 2-2-2014
Scraping Data for: 2-3-2014

KeyboardInterrupt: 