# Importing NBA game logs from Oddsshark.com

In [None]:
#Import scraping libraries, pandas, and numpy
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import pickle
from sklearn.preprocessing import OneHotEncoder

def extract_team_data(team_name, team_link, season='2021', n=5):
    """Scrape one team's game-log table for a season and return it cleaned.

    Parameters
    ----------
    team_name : str
        Team name stored in the 'Team' column of the scraped table.
    team_link : str
        URL of the team's game-log page; '/<season>' is appended to it.
    season : str or int
        Season appended to the URL and passed on to ``clean_team_df``.
        (Original note: season can go back to 2017.)
    n : int
        Lag (window length) for the against-the-spread sum, passed on to
        ``clean_team_df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned game log indexed by ['Date', 'Home_Team', 'Away_Team'],
        with both team columns replaced by the 'Abbr' values found in
        'input-data/nba_abbr_map.csv'.

    Raises
    ------
    AssertionError
        If the page does not answer with HTTP status 200.
    RuntimeError
        If the page does not contain exactly one matching table.
    """
    #From this page, extract the html
    season_url = team_link + '/' + str(season)
    team_response = requests.get(season_url)
    #Raised explicitly (rather than with `assert`) so the check survives `python -O`;
    #the message names the URL that was actually requested.
    if team_response.status_code != 200:
        raise AssertionError('{0} returned a status code that was not 200'.format(season_url))

    #Parse with the same explicit parser that request_season uses
    game_logs_child = BeautifulSoup(team_response.text, 'lxml')

    #Capture the table and check for other tables that could disturb this process
    table_request = game_logs_child.find_all(class_="table table--striped table--fixed-column")
    if len(table_request) != 1:
        raise RuntimeError('The wrong table may have been captured by this request.')
    game_table = table_request[0]

    #Capture column names
    columns = [th.text for th in game_table.find_all('th')]

    #Create all rows of the data frame
    game_table_rows = game_table.find('tbody').find_all('tr')
    table_list = [[td.text for td in row.find_all('td')] for row in game_table_rows]

    #Create the pandas DataFrame
    team_df = pd.DataFrame(table_list, columns=columns)

    #Add team name as an additional column
    team_df['Team'] = team_name

    #Clean this dataframe
    final_df = clean_team_df(team_df, season, n)

    #Convert team names to abbreviations for easy comparison.
    #The mapping file is read once and applied to each side in turn: the full-name
    #column is dropped and the abbreviation column takes over its name.
    abbr_map = pd.read_csv('input-data/nba_abbr_map.csv')
    for side in ('Home_Team', 'Away_Team'):
        abbr_column = side + '_Abbr'
        lookup = abbr_map.rename({'Team': side, 'Abbr': abbr_column}, axis=1, errors='raise')
        final_df = (final_df.merge(lookup, how='left', on=side)
                    .drop(side, axis=1)
                    .rename({abbr_column: side}, axis=1, errors='raise'))

    return final_df.set_index(['Date', 'Home_Team', 'Away_Team'])

def clean_team_df(clean_team_df, season, n):
    """Reshape one team's scraped game log into home/away oriented columns.

    Parameters
    ----------
    clean_team_df : pandas.DataFrame
        Raw game log with string columns 'Date', 'Opponent', 'Result',
        'Score', 'ATS', 'Spread' and 'Team'. Header labels and cell values
        may carry surrounding whitespace.
    season :
        Value written unchanged into the 'season' column.
    n : int
        Window length of the rolling against-the-spread sum.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'season', 'Home_Team', 'Away_Team', 'Home_Score',
        'Away_Score', 'Home_Spread', 'Away_Spread', 'Home_last_n_ATS' and
        'Away_last_n_ATS'. Spread and last-n-ATS values are filled only on
        the side this team played on; the other side is NaN.
    """
    #Trim whitespace from the header labels and from every cell
    games = clean_team_df.rename(columns=lambda label: label.strip())
    for label in games.columns:
        games[label] = games[label].str.strip()

    #A blank score becomes NaN so that dropna discards the row
    #(dropna removes a row with NaN in any column, not only 'Score')
    games['Score'] = games['Score'].replace('', np.nan)
    games.dropna(inplace=True)

    ######################
    # Home/Away Teams
    ######################
    #'Opponent' is '<marker> <team name>': a leading 'vs' marks a home game,
    #the remaining words are the opponent's name
    opponent_tokens = games['Opponent'].str.split()
    is_home = opponent_tokens.map(lambda tokens: tokens[0] == 'vs')
    is_away = ~is_home
    opponent_name = opponent_tokens.map(lambda tokens: ' '.join(tokens[1:]))

    #At home this team is the home team; otherwise the opponent is
    games['Home_Team'] = games['Team'].where(is_home, opponent_name)
    games['Away_Team'] = opponent_name.where(is_home, games['Team'])

    ######################
    # Score
    ######################
    #'Score' looks like '122-121'. The first number goes to the home side for a
    #home win or an away loss, and to the away side for a home loss or an away win.
    score_parts = games['Score'].str.split('-')
    first_number = score_parts.str[0]
    second_number = score_parts.str[1]
    won = games['Result'] == 'W'
    lost = games['Result'] == 'L'
    first_is_home = (is_home & won) | (is_away & lost)
    second_is_home = (is_home & lost) | (is_away & won)

    #Rows whose result is neither 'W' nor 'L' stay NaN, which the int cast rejects
    home_score = first_number.where(first_is_home, second_number.where(second_is_home))
    away_score = second_number.where(first_is_home, first_number.where(second_is_home))
    games['Home_Score'], games['Away_Score'] = home_score.astype(int), away_score.astype(int)

    ######################
    # Spread
    ######################
    #The listed spread is stored on the side this team played on; the other
    #side is left as NaN
    games['Home_Spread'] = games['Spread'].where(is_home).astype(float)
    games['Away_Spread'] = games['Spread'].where(is_away).astype(float)

    #Change the Date to a usable format to compare against other dates
    games['Date'] = pd.to_datetime(games['Date'])

    ######################
    # Against the spread (ATS)
    ######################
    #Encode the ATS outcome: 'W' -> 1, 'P' -> 0, 'L' -> -1
    ats = games['ATS']
    for outcome, points in (('W', 1), ('P', 0), ('L', -1)):
        ats = ats.mask(ats == outcome, points)

    #Sum of the encoded ATS outcomes over the previous n rows (the current row is
    #excluded by the shift); min_periods=1 reports a value as soon as one earlier
    #row exists
    last_n_ats = ats.shift(1).rolling(n, min_periods=1).sum()
    games['Home_last_n_ATS'] = last_n_ats.where(is_home)
    games['Away_last_n_ATS'] = last_n_ats.where(is_away)

    #Set the given season year
    games['season'] = season

    #Keep only the columns handed back to the caller
    wanted = ['Date', 'season', 'Home_Team', 'Away_Team', 'Home_Score', 'Away_Score',
              'Home_Spread', 'Away_Spread', 'Home_last_n_ATS', 'Away_last_n_ATS']
    return games.filter(wanted, axis=1)

In [135]:
#request the original oddsshark webpage
def request_season(season):
    """Scrape every team's game log for one season into a single DataFrame.

    The game-logs index page is fetched, every link inside the element with
    id 'block-system-main' is treated as a team page, and each team's cleaned
    table (from ``extract_team_data``) is folded into one frame with
    ``combine_first``. (Original note: season can be between 2017 and 2021.)

    Parameters
    ----------
    season :
        Season forwarded to ``extract_team_data`` and used in the name of the
        temporary pickle file.

    Returns
    -------
    pandas.DataFrame
        The combined games with the index reset into ordinary columns.
    """
    index_url = 'https://www.oddsshark.com/nba/game-logs'
    index_response = requests.get(index_url)
    assert index_response.status_code == 200, '{0} returned a status code that was not 200'.format(index_url)

    #Every anchor inside id='block-system-main' supplies a team name (its text)
    #and a site-relative link (its href)
    index_page = BeautifulSoup(index_response.text, 'lxml')
    anchors = index_page.find(id="block-system-main").find_all('a')
    absolute_links = ['https://www.oddsshark.com' + anchor.get('href') for anchor in anchors]
    teams = list(zip([anchor.text for anchor in anchors], absolute_links))

    #Fold the per-team tables together one at a time
    season_df = pd.DataFrame()
    for team_name, team_link in teams:
        team_df = extract_team_data(team_name, team_link, season)

        #Nothing collected so far: this table becomes the starting point
        if len(season_df.index) == 0:
            season_df = team_df
            continue

        #Otherwise merge it in, save the progress so far, and pause before the next request
        season_df = season_df.combine_first(team_df)
        season_df.to_pickle('season_data/pickles/temp_season_{0}.pkl'.format(season))
        time.sleep(.7)

    season_df.reset_index(inplace=True)

    return season_df


In [136]:
#########################
# Main Import
#########################
#Scrape each season listed below, write it to its own file under season_data/,
#and keep every season's DataFrame in df_list.
#Data between 2017 and 2020 will not change

df_list = []
all_seasons = [2017,2018,2019,2020,2021]
for year in all_seasons:
    #One full scrape of all team pages for this season
    main_scrape_df = request_season(year)
    #NOTE(review): the output file name has no '.csv' extension - confirm this is what downstream readers expect
    main_scrape_df.to_csv('season_data/nba_spreads_{0}'.format(year),index=False)
    df_list.append(main_scrape_df)
