In [1]:
import pandas as pdF
import numpy as np
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time
from threading import Thread, Event
import pandas as pd
from typing import List

In [2]:
def get_date_range_for_season(year):
    """Look up the dates to scrape for the season identified by ``year``.

    Returns:
        * a daily ``pd.DatetimeIndex`` for the seasons 2010 through 2019,
        * a list of month-label strings for 2020,
        * the message string ``"out of range... range is 2010-2020"`` (no
          exception is raised) for any other ``year``.
    """
    # Tuples are (first day, last day) bounds that get expanded into a daily
    # range on demand; the 2020 entry is a plain list and is returned as-is.
    season_table = {
        2020: ["october-2019", "november", "december", "january", "february", "march", "july", "august", "september", "october-2020"],
        2019: ('2018-11-01', '2019-3-31'),
        2018: ('2017-11-01', '2018-3-31'),
        2017: ('2016-11-01', '2017-3-31'),
        2016: ('2015-11-01', '2016-3-31'),
        2015: ('2014-11-01', '2015-3-31'),
        2014: ('2013-11-01', '2014-3-31'),
        2013: ('2012-11-01', '2013-3-31'),
        # 2012 is the one season whose window starts in January.
        2012: ('2012-1-01', '2012-3-31'),
        2011: ('2010-11-01', '2011-3-31'),
        2010: ('2009-11-01', '2010-3-31'),
    }

    entry = season_table.get(year)
    if entry is None:
        return "out of range... range is 2010-2020"
    if isinstance(entry, tuple):
        first_day, last_day = entry
        return pd.date_range(first_day, last_day)
    return entry

In [3]:
class DataHolder:
    """Maps each stat endpoint to the list of data collected for it."""

    def __init__(self, stat_url_endpoints):
        """Remember the endpoints and give every one its own empty list."""
        self.keys = stat_url_endpoints
        # The comprehension creates a distinct list object per endpoint.
        self.stat_dict = {endpoint: [] for endpoint in self.keys}

    def print(self):
        """Write the endpoint -> data mapping to stdout."""
        print(self.stat_dict)

    def getDataDict(self):
        """Return the endpoint -> data dictionary itself (not a copy)."""
        return self.stat_dict

In [9]:
def scrape_stat_data(url, daterange, timeout=30):
    """Scrape one stat table per day from a date-parameterised URL.

    Args:
        url: URL prefix; each date is appended to it as ``YYYY-MM-DD``.
        daterange: Sequence of ``pd.Timestamp`` values (for example a
            ``pd.DatetimeIndex``); one HTTP request is made per entry.
        timeout: Seconds to wait for each HTTP response before giving up,
            so a stalled connection cannot block a worker thread forever.

    Returns:
        A list with one ``[dates, teams, values]`` entry per scraped day.
        ``teams`` is sorted by team name, ``values`` holds the matching
        numbers as floats, and ``dates`` repeats the day once per team so
        the three sequences always have the same length.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If a page contains no stat rows, or a value cannot be
            converted to ``float``.
    """
    if len(daterange) == 0:
        # Nothing to scrape; avoids indexing into an empty range below.
        return []

    print(f"going to {url} starting at: {daterange[0].date()}")
    data_array = []
    for date in daterange:
        if date == pd.Timestamp('2013-12-31'): # skip this date there is no data on the website...
            continue
        request_url = url + date.strftime("%Y-%m-%d")
        response = requests.get(request_url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        # Parse the document once and reuse the tree for both lookups.
        soup = BeautifulSoup(response.text, 'html.parser')
        pts = [row.find_all('td')[2].text for row in soup.find_all('tr')[1:]]
        teams = [team.text for team in soup.find_all('td', class_='text-left nowrap')]
        print(date.date())

        # Sorting by team name puts the teams in the same order for every
        # day, which get_team_data relies on when it lines the columns up.
        zip_list = sorted(zip(teams, pts))
        if not zip_list:
            raise ValueError(f"no stat rows found at {request_url}")
        teams_list, pts_list = zip(*zip_list)
        pts_list = [float(x) for x in pts_list]

        # Repeat the date once per team actually found (not a fixed 30) so
        # the date, team and value sequences cannot drift out of step.
        data_array.append([[date] * len(teams_list), teams_list, pts_list])
    return data_array

In [5]:
def send_scrape_threads(DHolder: DataHolder, years):
    """Scrape every endpoint held by ``DHolder`` for the given seasons.

    For each endpoint, one worker thread per season runs
    ``scrape_stat_data``. The per-season results are concatenated in the
    order of ``years`` and stored in ``DHolder.stat_dict[endpoint]``.

    Args:
        DHolder: Holder whose ``keys`` are the stat endpoints to scrape and
            whose ``stat_dict`` receives the results.
        years: Iterable of season years passed to
            ``get_date_range_for_season``.
    """
    dateRanges = [get_date_range_for_season(year) for year in years]
    print(dateRanges)
    numDateRanges = len(dateRanges)
    print(numDateRanges)
    for endpoint in DHolder.keys:
        url = f"https://www.teamrankings.com/nba/stat/{endpoint}?date="
        data_list = []
        t0 = time.time()
        # ThreadPoolExecutor rejects max_workers=0, so keep at least one
        # worker even when no seasons were requested.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, numDateRanges)) as executor:
            results = executor.map(scrape_stat_data, [url]*numDateRanges, dateRanges)
        # Leaving the with-block waits for all workers; map() then yields the
        # per-season results in the same order as dateRanges.
        for val in results:
            if val:
                # Preview the first scraped day; skipped for an empty season.
                print(f"val: {val[0]} \n")
            data_list += val
        t1 = time.time()
        DHolder.stat_dict[endpoint] = data_list
        print(f"this took {round(t1-t0,2)} seconds USING {numDateRanges} WORKERS!!!.")

In [6]:
def get_team_data(startYear: int, endYear: int, endpoints: List[str]):
    """Scrape ``endpoints`` for a span of seasons and return one DataFrame.

    Seasons from ``startYear`` up to, but not including, ``endYear`` are
    scraped. The returned frame has one column per endpoint and a two-level
    (date, team) index taken from the first endpoint's results.
    """
    DHolder = DataHolder(endpoints)
    years = [startYear + offset for offset in range(endYear - startYear)]
    print(years)
    send_scrape_threads(DHolder, years)

    keys = DHolder.keys
    first_endpoint_days = DHolder.stat_dict[keys[0]]
    number_of_days = len(first_endpoint_days)
    print(number_of_days)

    # Each per-day record is [dates, teams, values]; flatten the first two
    # parts of the first endpoint's records into the index levels.
    date_index = [stamp for day in first_endpoint_days for stamp in day[0]]
    team_index = [team for day in first_endpoint_days for team in day[1]]

    # Replace every endpoint's per-day records with one flat list of values.
    # The order matches the index because scrape_stat_data sorts the teams
    # within each day.
    for key in keys:
        per_day_records = DHolder.stat_dict[key]
        DHolder.stat_dict[key] = [
            value
            for day_number in range(number_of_days)
            for value in per_day_records[day_number][2]
        ]

    DF = pd.DataFrame(data=DHolder.stat_dict, index=[date_index, team_index])
    return DF

In [10]:
# Stats to scrape; each endpoint becomes one column of DF.
endpoints = [
    'opponent-points-per-game',
    'points-per-game',
]
# endYear is exclusive, so this scrapes the seasons 2015 through 2019.
DF = get_team_data(startYear=2015, endYear=2020, endpoints=endpoints)

[2015, 2016, 2017, 2018, 2019]
[DatetimeIndex(['2014-11-01', '2014-11-02', '2014-11-03', '2014-11-04',
               '2014-11-05', '2014-11-06', '2014-11-07', '2014-11-08',
               '2014-11-09', '2014-11-10',
               ...
               '2015-03-22', '2015-03-23', '2015-03-24', '2015-03-25',
               '2015-03-26', '2015-03-27', '2015-03-28', '2015-03-29',
               '2015-03-30', '2015-03-31'],
              dtype='datetime64[ns]', length=151, freq='D'), DatetimeIndex(['2015-11-01', '2015-11-02', '2015-11-03', '2015-11-04',
               '2015-11-05', '2015-11-06', '2015-11-07', '2015-11-08',
               '2015-11-09', '2015-11-10',
               ...
               '2016-03-22', '2016-03-23', '2016-03-24', '2016-03-25',
               '2016-03-26', '2016-03-27', '2016-03-28', '2016-03-29',
               '2016-03-30', '2016-03-31'],
              dtype='datetime64[ns]', length=152, freq='D'), DatetimeIndex(['2016-11-01', '2016-11-02', '2016-11-03', '2016-11-

In [12]:
# Spot-check one (date, team) row. .iloc[0] picks the first column by
# position explicitly; plain [0] on a string-labelled Series relies on a
# positional fallback that pandas has deprecated.
DF.loc[('2015-12-03', 'Atlanta')].iloc[0]

100.0

In [13]:
# Persist the scraped frame to disk. NOTE(review): the file name says
# 2015-2020, but get_team_data treats endYear as exclusive, so the frame
# holds the seasons 2015 through 2019.
DF.to_pickle('data_df_2015-2020.pickle')