In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Root folder of the odds data; each sub-directory (presumably one per season --
# TODO confirm) is expected to hold vegas.txt, vegas_playoff.txt and raw_scores.txt.
path = './odds_data/'

# Keep sub-directories only. os.listdir() returns entries in arbitrary order, so
# sort them to make the row order of the concatenated frames reproducible.
files = sorted(
    filename for filename in os.listdir(path)
    if os.path.isdir(os.path.join(path, filename))
)
odds = []
scores = []

for file in files:
    season_dir = os.path.join(path, file)
    temp_odds = pd.read_csv(os.path.join(season_dir, 'vegas.txt'), index_col=0, parse_dates=True)
    temp_playoff_odds = pd.read_csv(os.path.join(season_dir, 'vegas_playoff.txt'), index_col=0, parse_dates=True)
    temp_scores = pd.read_csv(os.path.join(season_dir, 'raw_scores.txt'), index_col=0, parse_dates=True)
    # Both odds files are collected in the same list and end up in one frame.
    odds.append(temp_odds)
    odds.append(temp_playoff_odds)
    scores.append(temp_scores)

master_odds = pd.concat(odds, axis=0)
master_scores = pd.concat(scores, axis=0)

# Store game ids as 10-character, zero-padded strings in both frames.
master_scores['GAME_ID'] = master_scores['GAME_ID'].apply(lambda game_id: str(game_id).zfill(10))
master_odds['GameId'] = master_odds['GameId'].apply(lambda game_id: str(game_id).zfill(10))

## EDA on O/U Betting Data
----

In [None]:
def find_columns(columns, keyword):
    """Return the entries of ``columns`` that contain ``keyword``.

    ``columns`` is any iterable of strings (iterating a DataFrame yields its
    column labels). The matches are printed and returned as a list, in the
    order in which they were encountered.
    """
    retained_cols = [label for label in columns if label.find(keyword) >= 0]

    print(f'Found columns with keyword {keyword}: {retained_cols}')

    return retained_cols

In [None]:
# Columns whose name contains 'OU', plus the basic per-game columns kept with them.
OU_cols = find_columns(master_odds, 'OU')
basic_cols = ['Location', 'Team', 'OppTeam', 'TeamId', 'GameId', 'Pts', 'Spread', 'Result', 'Total']

from nba_api.stats.static import teams

# Build a team id -> abbreviation lookup straight from the team records.
# The records are not stored back into `teams`, so the imported module stays
# usable by later cells.
nba_teams = teams.get_teams()
id_team_dict = {team['id']: team['abbreviation'] for team in nba_teams}


# .copy() gives master_OU its own data, so adding a column below does not act
# on a slice of master_odds (avoids pandas' SettingWithCopyWarning).
master_OU = master_odds[basic_cols + OU_cols].copy()
master_OU['year'] = master_OU.index.year

# Replace numeric team ids with abbreviations and rename the column accordingly.
master_OU = master_OU.replace({'TeamId': id_team_dict})
master_OU = master_OU.rename(columns={'TeamId': 'TeamAbbr'})

master_OU.head()

In [None]:
# Keep only the rows flagged 'away' (presumably one row per game -- TODO confirm)
# and copy them so the new column is not written onto a slice of master_OU.
avg_OU_years = master_OU.loc[master_OU['Location'] == 'away', ['Total', 'year']].copy()

# 'year' was derived from the date index when master_OU was built. Reusing it
# (instead of reading .index.year again) keeps this cell runnable after the
# index of master_OU has been reset further down the notebook.
avg_OU_years['Year'] = avg_OU_years['year']

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Year', y='Total', data=avg_OU_years, ax=ax)

ax.set_ylim(140, 280)

In [None]:
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of the low-cardinality columns of ``df``.

    Only columns with more than 1 and fewer than 50 distinct values are
    considered. Numeric (non-boolean) columns are drawn as histograms; every
    other column is drawn as a bar chart of its value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of subplots per grid row.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when no
        column qualifies or ``nGraphShown`` is not positive.

    Notes
    -----
    The plot type is chosen from the column dtype, so an object column is
    always drawn as a bar chart, even when its first value is a number or NaN.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    columnNames = list(df)

    nGraphs = min(len(columnNames), nGraphShown)
    if nGraphs <= 0:
        # Nothing to draw: avoid creating an empty figure.
        return

    perRow = int(nGraphPerRow)
    # Integer ceiling division: size the grid for the plots that are actually
    # drawn, not for every eligible column.
    nGraphRow = -(-nGraphs // perRow)

    plt.figure(num = None, figsize = (6 * perRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nGraphs):
        plt.subplot(nGraphRow, perRow, i + 1)
        columnDf = df.iloc[:, i]
        # Decide from the dtype rather than from the first value, which may be NaN.
        isNumeric = (pd.api.types.is_numeric_dtype(columnDf)
                     and not pd.api.types.is_bool_dtype(columnDf))
        if isNumeric:
            columnDf.hist()
        else:
            columnDf.value_counts().plot.bar()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [None]:
# Distribution plots for the odds frame: at most 57 columns, 3 plots per row.
plotPerColumnDistribution(master_odds, 57,3)

In [None]:
# Distribution plots for the scores frame: at most 27 columns, 3 plots per row.
plotPerColumnDistribution(master_scores, 27,3)

In [None]:
# Display the Total, Open_Line_OU and Spread columns side by side.
master_OU[['Total','Open_Line_OU','Spread']]

In [None]:
#Adding columns (binary) indicating over or under for our test data. If total points over: 1, if total points under: 0, if tied: 2.

def OU(OUline, pts):
    """Encode how ``pts`` compares with the over/under line ``OUline``.

    Returns 1 when ``pts`` is above the line, 0 when it is below, and 2 when
    the two are equal. If none of the comparisons holds (e.g. one value is
    NaN) the result is ``None``.
    """
    if pts == OUline:
        return 2
    if pts > OUline:
        return 1
    if pts < OUline:
        return 0
    return None

# Label each row by comparing its Total with the opening O/U line.
master_OU['O/U'] = master_OU.apply(lambda row: OU(row['Open_Line_OU'], row['Total']), axis=1)

# Move the date index into a regular column, but only while it is still the
# index: without the guard, re-running this cell would push the fresh integer
# index into an extra 'index' column.
if isinstance(master_OU.index, pd.DatetimeIndex):
    master_OU = master_OU.reset_index()

The distribution and spread for all the features are fine and do not need to be tuned. We now need to consider several additional questions before training the model.

1. What additional game statistics should we include?

2. What subset of games should we train our model with? Playoffs only? Do we include regular season games? Do we use both?

3. How far back should we train the models? As we saw earlier, the average number of points scored per game increases every year. Will using older games from 2013/2014 in our model skew the results?

According to Wikipedia and other websites, the key components used by Vegas to determine the initial O/U line are:

1. Pace on offense

2. Efficiency on offense and defense

API fetching is slow, so we'll simply import the advanced box scores from local files and merge the dataframes. The two statistics above are included in the additional box score statistics.

In [None]:
# Folder holding the advanced box-score CSV files.
path = './boxscore_data/'

# Read regular, non-hidden files only (sub-directories such as notebook
# checkpoints and dot-files cannot be parsed as CSV), in sorted order so the
# concatenated row order is reproducible.
files = sorted(
    filename for filename in os.listdir(path)
    if os.path.isfile(os.path.join(path, filename)) and not filename.startswith('.')
)
scores = [
    pd.read_csv(os.path.join(path, filename), index_col=0, parse_dates=True)
    for filename in files
]

# Stack all files, drop the columns that are not used, and turn the index
# (first CSV column) back into a regular column.
master_boxscore = (
    pd.concat(scores, axis=0)
    .drop(['season', 'teamWins', 'teamLosses'], axis=1)
    .reset_index()
)

master_boxscore.head()

Now we can merge the O/U data with the boxscore data using pd.merge(). The resulting dataframe contains all the O/U data and the advanced boxscore statistics. As we saw in the EDA earlier, the game changes quite drastically over time. Since we only have data up to the 2018-2019 season, we should train on games from the three previous seasons.

In [None]:
from nba_api.stats.static import teams

# Work with the rows flagged 'away' only.
away_OU = master_OU[master_OU['Location']=='away']

# Attach the box-score columns by date and team abbreviation. Rows with any
# missing value (which includes rows without a box-score match) are discarded,
# and only the first row per GameId is kept.
data = pd.merge(away_OU,master_boxscore,how='left',left_on=['Date','TeamAbbr'],right_on=['gmDate','teamAbbr'])
data = data.drop(columns=['Location','Team','teamLoc','opptLoc'])
data = data.dropna().drop_duplicates(subset=['GameId'])

# City -> abbreviation lookup built directly from the team records; the
# imported `teams` module is deliberately not overwritten with a list.
# NOTE(review): if two teams share the same 'city' value, only one abbreviation
# survives in this dict -- confirm the OppTeam values are unambiguous.
nba_teams = teams.get_teams()
city_team_dict = {team['city']: team['abbreviation'] for team in nba_teams}

data = data.replace({'OppTeam':city_team_dict})
# NOTE(review): the merge already brought in a 'teamAbbr' column from
# master_boxscore, so this rename leaves `data` with two columns named
# 'teamAbbr'. Kept as-is because later steps may rely on the column layout.
data = data.rename(columns={'OppTeam':'opptAbbr', 'TeamAbbr':'teamAbbr'})

data.head()

To train our model, we will use averaged data from the previous 5 games and use various statistics from that to calculate the expected O/U result. We also want to include some additional statistics:

- Number of rest days the players had in the last 5 games.

- Consistency (variance) of the offensive/defensive rating and offensive pace in the last 5 games.

In [31]:
# Additional cleanup for the final dataset.

# Columns removed before saving: game bookkeeping / result fields first,
# followed by the listed *_OU line and odds columns.
columns_to_drop = [
    'gmDate', 'Result', 'seasonType', 'teamRslt', 'year', 'teamMin',
    'opptWins', 'opptLosses', 'opptRslt', 'opptMin', 'matchWinner', 'GameId',
    'PercentBet_OU',
    'Pinnacle_Line_OU', 'Pinnacle_Odds_OU',
    '5dimes_Line_OU', '5dimes_Odds_OU',
    'Heritage_Line_OU', 'Heritage_Odds_OU',
    'Bovada_Line_OU', 'Bovada_Odds_OU',
    'Betonline_Line_OU', 'Betonline_Odds_OU',
    'Best_Line_OU', 'Worst_Line_OU',
    'Best_Odds_OU', 'Worst_Odds_OU',
    'Average_Line_OU', 'Average_Odds_OU',
    'Open_Odds_OU',
]

filtered_data = data.drop(labels=columns_to_drop, axis='columns')

# Save filtered_data as an .npy file.
# NOTE(review): np.save receives the DataFrame itself; presumably only the
# values array is written and the column labels are not -- confirm that the
# code loading this file does not need them.
np.save('filtered_data.npy', filtered_data)

filtered_data.head()

Unnamed: 0,Date,opptAbbr,teamAbbr,Pts,Spread,Total,Open_Line_OU,O/U,teamAbbr.1,teamDayOff,...,opptFIC40,opptOrtg,opptDrtg,opptEDiff,opptPlay%,opptAR,opptAST/TO,opptSTL/TO,opptPoss,opptPace
0,2017-10-17,CLE,BOS,99.0,-3.0,201.0,216.5,0.0,BOS,0.0,...,55.52,102.7,99.7,3.0,0.42,17.59,1.12,0.18,99.3,99.3
2,2017-10-17,GSW,HOU,122.0,1.0,243.0,235.0,1.0,HOU,0.0,...,86.56,118.6,119.6,-1.0,0.47,27.92,2.0,0.29,102.0,102.0
6,2017-10-18,ORL,MIA,109.0,-7.0,225.0,207.0,1.0,MIA,0.0,...,78.23,110.3,103.6,6.7,0.46,19.43,1.57,0.57,105.2,105.2
8,2017-10-18,WAS,PHI,115.0,-5.0,235.0,215.0,1.0,PHI,0.0,...,80.0,115.1,110.3,4.8,0.47,19.04,2.33,0.89,104.3,104.3
10,2017-10-18,BOS,MIL,108.0,8.0,208.0,209.5,0.0,MIL,0.0,...,58.65,102.2,110.4,-8.2,0.42,19.7,1.92,1.0,97.9,97.9
