In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time
from threading import Thread, Event
import pandas as pd
from typing import List
import datetime

In [2]:
def get_date_range_for_season(year):
    """Look up the scrape dates configured for the season labelled ``year``.

    Args:
        year: season label, 2010 through 2020.

    Returns:
        For 2010-2019, a daily ``pandas.DatetimeIndex`` spanning the configured
        start and end dates. For 2020, a list of month-name strings. For any
        other year, an explanatory string (nothing is raised).
    """
    # A tuple is a (start, end) pair for pd.date_range; anything else is returned as-is.
    # NOTE(review): the 2020 entry holds month-name strings, not timestamps, and
    # 2012 starts in January unlike the other seasons -- confirm both are intended.
    season_table = {
        2020: ["october-2019", "november", "december", "january", "february", "march", "july", "august", "september", "october-2020"],
        2019: ('2018-11-05', '2019-3-31'),
        2018: ('2017-11-05', '2018-3-31'),
        2017: ('2016-11-05', '2017-3-31'),
        2016: ('2015-11-05', '2016-3-31'),
        2015: ('2014-11-05', '2015-3-31'),
        2014: ('2013-11-01', '2014-3-31'),
        2013: ('2012-11-01', '2013-3-31'),
        2012: ('2012-1-01', '2012-3-31'),
        2011: ('2010-11-01', '2011-3-31'),
        2010: ('2009-11-01', '2010-3-31'),
    }
    entry = season_table.get(year)
    if entry is None:
        return "out of range... range is 2010-2020"
    if isinstance(entry, tuple):
        first_day, last_day = entry
        return pd.date_range(first_day, last_day)
    return entry

In [3]:
class DataHolder:
    """Mutable container for scraped stat data and the endpoints it came from.

    Attributes are plain public fields that other code in the notebook reads
    and writes directly.
    """

    def __init__(self, stat_url_endpoints):
        """Store the endpoint list and start with no stats and no cached keys."""
        self.endpoints = stat_url_endpoints
        self.stat_dict = {}  # filled in by the scraping code
        self.keys = []       # snapshot of stat_dict's keys, see updateKeys()

    def updateKeys(self):
        """Replace ``keys`` with a fresh list of the current ``stat_dict`` keys."""
        self.keys = list(self.stat_dict)

In [4]:
def scrape_column(response, col: int):
    """Extract one numeric column, keyed by team, from a stat-table page.

    Args:
        response: object whose ``text`` attribute holds the page HTML.
        col: zero-based index of the ``<td>`` cell to read in every table row.

    Returns:
        ``(teams_list, data_list)``: a tuple of team names in sorted order and
        a list of floats in the same order.

    Raises:
        ValueError: if the page yields no team/value pairs, or a value is not
            numeric once an optional trailing ``%`` is removed.
        IndexError: if a row has no cell at index ``col``.
    """
    # Parse the document once and reuse it for both lookups.
    soup = BeautifulSoup(response.text, 'html.parser')
    teams = [cell.text for cell in soup.find_all('td', class_='text-left nowrap')]
    # The first <tr> is skipped -- presumably the header row.
    raw_values = [row.find_all('td')[col].text for row in soup.find_all('tr')[1:]]

    # Sort the pairs together so each value stays attached to its team.
    pairs = sorted(zip(teams, raw_values))
    if not pairs:
        raise ValueError("no team rows found in the response")
    teams_list, sorted_values = zip(*pairs)

    # Strip the percent sign per value: slicing the last character off every
    # value based on the first one silently drops a digit from plain numbers.
    data_list = [float(value.strip().rstrip('%')) for value in sorted_values]
    return teams_list, data_list

In [5]:
def scrape_stat_data(url, daterange, column: int, timeout: float = 30):
    """Scrape one stat column for every date in ``daterange``.

    Args:
        url: endpoint URL ending in ``?date=``; the date is appended as
            ``YYYY-MM-DD``.
        daterange: non-empty sequence of ``pandas.Timestamp`` values.
        column: table column passed to ``scrape_column``. Per the notebook's
            notes: 2 = season, 3 = last 3, 4 = last 1, 5 = home, 6 = away.
        timeout: seconds to wait for each HTTP request before ``requests``
            raises, so a stalled connection cannot hang the worker forever.

    Returns:
        A list with one ``[dates, teams, values]`` entry per scraped date,
        where ``dates`` repeats the date once per team so all three sequences
        have the same length. Dates whose page cannot be parsed are skipped.
    """
    print(f"going to {url}, for col: {column}, starting at: {daterange[0].date()}")
    data_array = []
    for date in daterange:
        if date == pd.Timestamp('2013-12-31'):  # the website has no data for this date
            continue
        print(date.date())
        response = requests.get(url + date.strftime("%Y-%m-%d"), timeout=timeout)
        try:
            teams_list, data_list = scrape_column(response, column)
        except ValueError:
            # Unparseable page (e.g. missing or non-numeric data): skip this date.
            print(f'Value Error for {date}')
            continue
        # Size the date column from the scraped teams instead of assuming 30.
        data_array.append([[date] * len(teams_list), teams_list, data_list])

    return data_array

In [6]:
def send_scrape_threads(DHolder: "DataHolder", years, columns: List[int]):
    """Scrape every endpoint/column pair, one worker thread per season.

    For each endpoint in ``DHolder.endpoints`` and each entry of ``columns``,
    the seasons in ``years`` are scraped concurrently with
    ``scrape_stat_data`` and their rows are concatenated in season order.

    Args:
        DHolder: holder providing ``endpoints``; results are written to
            ``DHolder.stat_dict`` under the key ``"<endpoint>-col=<col>"``.
        years: season labels understood by ``get_date_range_for_season``.
        columns: table column indices to scrape.

    Any exception raised inside a worker propagates to the caller.
    """
    dateRanges = [get_date_range_for_season(year) for year in years]
    numDateRanges = len(dateRanges)
    for r in dateRanges:
        print(r)
    print(numDateRanges)

    # ThreadPoolExecutor rejects max_workers=0, so keep one worker even when
    # `years` is empty; each key then simply maps to an empty list.
    workers = max(1, numDateRanges)

    for endpoint in DHolder.endpoints:
        url = f"https://www.teamrankings.com/nba/stat/{endpoint}?date="
        t0 = time.time()
        for col in columns:
            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
                per_season = list(executor.map(
                    scrape_stat_data,
                    [url] * numDateRanges,
                    dateRanges,
                    [col] * numDateRanges,
                ))
            # A season may legitimately contribute no rows (every date skipped),
            # so flatten without indexing into the individual results.
            DHolder.stat_dict[f'{endpoint}-col={col}'] = [
                day for season_rows in per_season for day in season_rows
            ]
        t1 = time.time()
        print(f"It took {round(t1-t0,2)} seconds to parse cols {columns} for {endpoint}.")

In [7]:
def get_team_data(startYear: int, endYear: int, endpoints: List[str], columns: List[int]):
    """Scrape the requested stats and assemble them into one DataFrame.

    Args:
        startYear: first season label to scrape (inclusive).
        endYear: season label to stop before (exclusive).
        endpoints: stat endpoint names handed to ``DataHolder``.
        columns: table column indices to scrape for every endpoint.

    Returns:
        A DataFrame indexed by (date, team) with one column per
        ``"<endpoint>-col=<col>"`` key. Each stat is aligned on its own
        (date, team) labels, so a date that was skipped for one stat shows up
        as NaN in that column instead of shifting the rows below it.
    """
    DHolder = DataHolder(endpoints)
    years = list(range(startYear, endYear))
    print(years)
    send_scrape_threads(DHolder, years, columns)

    DHolder.updateKeys()
    keys = DHolder.keys
    print(f'keys {keys}')

    stat_columns = {}
    for key in keys:
        dates, teams, values = [], [], []
        # Each entry is one scraped day: [dates, teams, values] of equal length.
        for day_dates, day_teams, day_values in DHolder.stat_dict[key]:
            dates += day_dates
            teams += day_teams
            values += day_values
        stat_columns[key] = pd.Series(
            values, index=pd.MultiIndex.from_arrays([dates, teams])
        )

    # Building from a dict of Series lets pandas align the stats on their index.
    return pd.DataFrame(stat_columns)

In [12]:
# Stat endpoint names to scrape; each one is substituted into the stat URL.
endpoints = [
    'opponent-points-per-game',
    'points-per-game',
    'true-shooting-percentage',
    'opponent-true-shooting-percentage',
    'assists-per-game',
    'assists-per-possession',
    'assist--per--turnover-ratio',
    'turnovers-per-possession',
    'defensive-efficiency',
    'opponent-effective-field-goal-pct',
    'opponent-4th-quarter-points-per-game',
    'offensive-efficiency',
    'average-scoring-margin',
    'opponent-defensive-rebounding-pct',
    'opponent-offensive-rebounding-pct',
    'defensive-rebounding-pct',
    'offensive-rebounding-pct',
    'win-pct-all-games',
]
len(endpoints)


18

In [10]:
# Scrape every endpoint/column combination into DF and report the wall-clock time.
# `endpoints` must already be defined when this cell runs.
t0 = time.time()
# Column meanings on the stat table: col = 2 season stat, col = 3 last 3 stats, col = 4 last 1 stats, col = 5 home stats, col = 6 away stats. This run uses 2, 3, 5, 6.
DF = get_team_data(2015, 2020, endpoints, columns=[2,3,5,6]) # the end year is exclusive: the year list printed for this call is [2015, ..., 2019]
t1 = time.time()
secs = t1-t0
# timedelta gives a readable H:MM:SS.ffffff string for the elapsed seconds.
time_taken = str(datetime.timedelta(seconds=secs))
print(f"The time for {len(endpoints)} endpoints was {time_taken}")

[2015, 2016, 2017, 2018, 2019]
DatetimeIndex(['2014-11-04', '2014-11-05', '2014-11-06', '2014-11-07',
               '2014-11-08', '2014-11-09', '2014-11-10', '2014-11-11',
               '2014-11-12', '2014-11-13',
               ...
               '2015-03-22', '2015-03-23', '2015-03-24', '2015-03-25',
               '2015-03-26', '2015-03-27', '2015-03-28', '2015-03-29',
               '2015-03-30', '2015-03-31'],
              dtype='datetime64[ns]', length=148, freq='D')
DatetimeIndex(['2015-11-05', '2015-11-06', '2015-11-07', '2015-11-08',
               '2015-11-09', '2015-11-10', '2015-11-11', '2015-11-12',
               '2015-11-13', '2015-11-14',
               ...
               '2016-03-22', '2016-03-23', '2016-03-24', '2016-03-25',
               '2016-03-26', '2016-03-27', '2016-03-28', '2016-03-29',
               '2016-03-30', '2016-03-31'],
              dtype='datetime64[ns]', length=148, freq='D')
DatetimeIndex(['2016-11-03', '2016-11-04', '2016-11-05', '2016-11-06'

In [11]:
# Note from an earlier run: DF.to_pickle('data_df_2015-2020-Last3.pickle') -- that frame took 0:46:36.133975 to scrape.
# The earlier DF was built from these endpoints:
# endpoints = ['true-shooting-percentage', 'defensive-efficiency', 'average-margin-thru-3-quarters', 'opponent-4th-quarter-points-per-game', 'offensive-efficiency', 'opponent-shooting-pct', 'average-scoring-margin', 'opponent-defensive-rebounding-pct', 'opponent-offensive-rebounding-pct' , 'defensive-rebounding-pct', 'offensive-rebounding-pct' ] 
# Display the current frame.
DF

Unnamed: 0,Unnamed: 1,win-pct-all-games-col=2,win-pct-all-games-col=3,win-pct-all-games-col=5,win-pct-all-games-col=6
2014-11-04,Atlanta,0.500,0.500,1.000,0.000
2014-11-04,Boston,0.333,0.333,1.000,0.000
2014-11-04,Brooklyn,0.667,0.667,1.000,0.500
2014-11-04,Charlotte,0.333,0.333,0.500,0.000
2014-11-04,Chicago,0.667,0.667,0.000,1.000
...,...,...,...,...,...
2019-03-31,Sacramento,0.487,0.333,0.605,0.368
2019-03-31,San Antonio,0.579,0.667,0.789,0.368
2019-03-31,Toronto,0.701,1.000,0.769,0.632
2019-03-31,Utah,0.605,1.000,0.684,0.526


In [None]:
# Persist the scraped frame so the long scrape does not have to be repeated.
DF.to_pickle('data_df_2015-2020-18-endpoints-4-cols.pickle')

In [None]:
# Scratch: a small dict with mixed list values for trying out key iteration.
random = {
    'hello': [1, 23, 4, 5],
    'hwh': [24, [33]],
}

In [None]:
# Scratch: two ways of reading keys from the dict.
x = next(iter(random))  # first key only
x = list(random)        # every key, in insertion order (overwrites the line above)
x

In [None]:
# Scratch: show each key followed by its type, one per line.
for elem in random:
    print(elem, type(elem), sep="\n")

In [14]:
# Scratch: filter-and-transform comprehension that drops two years and doubles the rest.
years = list(range(2010, 2018))
dateRanges = [2 * x for x in years if x not in (2012, 2014)]
dateRanges

[4020, 4022, 4026, 4030, 4032, 4034]