# Import Dependencies and data 
#### Read the data using pandas 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from sklearn import tree

In [2]:
# NBA odds files, one CSV per season (paths are relative to this notebook)
season_paths = [
    "../data/nba odds 2014-15.csv",
    "../data/nba odds 2015-16.csv",
    "../data/nba odds 2016-17.csv",
    "../data/nba odds 2017-18.csv",
]

# Read each season into its own DataFrame. The paths are kept in a separate
# list so that data1..data4 only ever refer to DataFrames, never to strings.
data1, data2, data3, data4 = (pd.read_csv(path) for path in season_paths)

# Clean each season and merge all the different seasons

In [3]:
def set_win(row):
    """Return 1 when the row's 'Point Dif' is strictly positive, otherwise 0."""
    return 1 if row['Point Dif'] > 0 else 0
    
def set_spread_win(row):
    """Return 1 when 'Point Dif' plus 'Spread' is strictly positive, otherwise 0.

    A sum of exactly zero is reported as 0.
    """
    covered = (row['Point Dif'] + row['Spread']) > 0
    return 1 if covered else 0
    
def set_ou_win(row):
    """Return 1 when 'Game Total' is strictly greater than 'Over', otherwise 0.

    A total equal to the line is reported as 0.
    """
    went_over = row['Game Total'] > row['Over']
    return 1 if went_over else 0

# Team names as spelled in the odds files -> the abbreviations used in the
# rest of the notebook.
_TEAM_CODES = {
    'Atlanta': 'ATL', 'Boston': 'BOS', 'Brooklyn': 'BRK',
    'Charlotte': 'CHO', 'Chicago': 'CHI', 'Cleveland': 'CLE',
    'Dallas': 'DAL', 'Denver': 'DEN', 'Detroit': 'DET',
    'GoldenState': 'GSW', 'Houston': 'HOU', 'Indiana': 'IND',
    'LAClippers': 'LAC', 'LALakers': 'LAL', 'Memphis': 'MEM',
    'Miami': 'MIA', 'Milwaukee': 'MIL', 'Minnesota': 'MIN',
    'NewOrleans': 'NOP', 'NewYork': 'NYK', 'OklahomaCity': 'OKC',
    'Orlando': 'ORL', 'Philadelphia': 'PHI', 'Phoenix': 'PHO',
    'Portland': 'POR', 'Sacramento': 'SAC', 'SanAntonio': 'SAS',
    'Toronto': 'TOR', 'Utah': 'UTA', 'Washington': 'WAS',
}


def _is_pick_em(line):
    """Return True when a 'Close' value is the pick'em marker ('pk', any case)."""
    return isinstance(line, str) and line.strip().lower() == 'pk'


def _closing_lines(away_close, home_close, away_ml, home_ml):
    """Split one game's two 'Close' values into (away spread, home spread, total).

    One row of a game carries the point spread in 'Close' and the other row
    carries the over/under total. The row with the lower 'ML' value is taken
    to be the one holding the spread; that side gets the negated line and the
    other side the positive one. A 'pk' marker, or equal 'ML' values, yields
    a spread of 0 for both sides.
    """
    if _is_pick_em(home_close):
        return 0, 0, float(away_close)
    if _is_pick_em(away_close):
        return 0, 0, float(home_close)
    if home_ml < away_ml:
        line = float(home_close)
        return line, -line, float(away_close)
    if home_ml > away_ml:
        line = float(away_close)
        return -line, line, float(home_close)
    return 0, 0, float(away_close)


def data_clean(df):
    """Turn one season of raw odds rows into per-team game records.

    The frame must hold two consecutive rows per game; the first row of each
    pair is treated as the away side and the second as the home side. Running
    totals are accumulated in row order, so rows are expected to be in the
    order the games were played.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw odds table with at least the columns 'Team', 'Final', 'Close'
        and 'ML'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first column, 'index', holds the original index,
        followed by the original columns and then, in this order:
        'Point Dif', 'Game Total', 'Spread', 'Over', 'Win', 'Win_Spread',
        'Win_Over', 'season_wins' and 'net_points'. 'season_wins' and
        'net_points' are the team's wins and cumulative point differential
        *before* the game on that row. Rows are grouped by team, and within a
        team they keep the order they had in the input.

    Raises
    ------
    ValueError
        If the number of rows is odd, i.e. some game is missing a side.
    """
    n_rows = len(df)
    if n_rows % 2:
        raise ValueError(
            "data_clean expects two rows per game (away, then home); "
            f"got an odd number of rows: {n_rows}"
        )

    df = df.replace({'Team': _TEAM_CODES})

    finals = df['Final'].tolist()
    closes = df['Close'].tolist()
    moneylines = df['ML'].tolist()

    p_dif, p_total, spread, over = [], [], [], []

    # Walk the table one game (two rows) at a time.
    for away in range(0, n_rows, 2):
        home = away + 1
        away_spread, home_spread, total_line = _closing_lines(
            closes[away], closes[home], moneylines[away], moneylines[home]
        )
        home_margin = finals[home] - finals[away]
        game_total = finals[home] + finals[away]

        # Each list receives the away row's value first, then the home row's.
        p_dif.extend([-home_margin, home_margin])
        p_total.extend([game_total, game_total])
        spread.extend([away_spread, home_spread])
        over.extend([total_line, total_line])

    df['Point Dif'] = p_dif
    df['Game Total'] = p_total
    df['Spread'] = spread
    df['Over'] = over
    df = df.reset_index()

    # Outcome flags; same rules as set_win / set_spread_win / set_ou_win,
    # evaluated column-wise instead of row by row.
    df['Win'] = (df['Point Dif'] > 0).astype('int64')
    df['Win_Spread'] = ((df['Point Dif'] + df['Spread']) > 0).astype('int64')
    df['Win_Over'] = (df['Game Total'] > df['Over']).astype('int64')

    # Totals going INTO each game: running sum per team minus the current row.
    df['season_wins'] = df.groupby('Team')['Win'].cumsum() - df['Win']
    df['net_points'] = df.groupby('Team')['Point Dif'].cumsum() - df['Point Dif']

    # Group rows by team with a stable sort so each team's games stay in input
    # order. Sorting on 'Date' would be wrong when it is a bare month/day
    # number such as 1029 or 403: January games would sort ahead of December
    # games whenever 'season_wins' ties.
    return df.sort_values(by='Team', kind='mergesort')

In [4]:
# Run the same cleaning routine over every season
clean1, clean2, clean3, clean4 = (
    data_clean(season) for season in (data1, data2, data3, data4)
)
frames = [clean1, clean2, clean3, clean4]

# Stack the seasons and keep only the first 23 columns of the result
df = pd.concat(frames)
df = df.drop(columns=df.columns[23:])
df

Unnamed: 0,index,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,...,2H,Point Dif,Game Total,Spread,Over,Win,Win_Spread,Win_Over,season_wins,net_points
12,12,1029,707,V,ATL,22,30,19,31,102,...,100.5,-7,211,4.5,198.5,0,0,1,0,0
65,65,1101,514,H,ATL,27,24,19,32,102,...,4,10,194,-11.0,193.5,1,0,1,0,-7
126,126,1105,515,V,ATL,15,24,23,30,92,...,101,-2,186,8.0,202.0,0,1,0,1,3
140,140,1107,501,V,ATL,29,21,28,19,119,...,99,-3,241,1.5,194.5,0,0,1,1,1
167,167,1108,706,H,ATL,20,28,27,28,103,...,7,7,199,-6.5,194.5,1,1,1,1,-2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,2318,403,709,V,WAS,24,25,26,29,104,...,1.5,-16,224,8.5,216.5,0,0,1,42,71
2352,2352,405,705,V,WAS,26,28,36,25,115,...,107.5,-4,234,6.0,220.5,0,1,1,42,55
2365,2365,406,506,H,WAS,22,26,29,20,97,...,9,-6,200,-10.5,210.0,0,0,0,42,51
2429,2429,410,506,H,WAS,22,30,27,34,113,...,7,12,214,-7.5,204.5,1,1,1,42,45


In [5]:
# Persist the combined seasons. The DataFrame index is written too (to_csv
# default), so it comes back as an extra leading column when the file is read.
df.to_csv('historical_odds.csv')

In [None]:
# Load the odds table back from the CSV written by the previous cell
odds_df = pd.read_csv('historical_odds.csv')
odds_df

# Cleaning Up the Dataframe

In [9]:
# Keep the odds / result columns by name. Selecting by name (rather than
# dropping by position in place) means re-running this cell cannot silently
# remove further columns, and the cell no longer depends on the CSV's exact
# column order.
keep_cols = ['Date', 'VH', 'Team', 'ML', 'Spread', 'Over',
             'Win', 'Win_Spread', 'Win_Over', 'season_wins', 'net_points']
odds_df = odds_df[keep_cols]
odds_df

Unnamed: 0,Date,VH,Team,ML,Spread,Over,Win,Win_Spread,Win_Over,season_wins,net_points
0,10/29/14,V,ATL,161,4.5,198.5,0,0,1,0,0
1,11/1/14,H,ATL,-650,-11.0,193.5,1,0,1,0,-7
2,11/5/14,V,ATL,315,8.0,202.0,0,1,0,1,3
3,11/7/14,V,ATL,105,1.5,194.5,0,0,1,1,1
4,11/8/14,H,ATL,-290,-6.5,194.5,1,1,1,1,-2
...,...,...,...,...,...,...,...,...,...,...,...
9835,4/3/18,V,WAS,325,8.5,216.5,0,0,1,42,71
9836,4/5/18,V,WAS,210,6.0,220.5,0,1,1,42,55
9837,4/6/18,H,WAS,-850,-10.5,210.0,0,0,0,42,51
9838,4/10/18,H,WAS,-350,-7.5,204.5,1,1,1,42,45


In [11]:
# Encode the visitor/home marker as 0/1, expose it as 'Home', and discard
# any row that still has a missing value.
odds_df = (
    odds_df
    .replace({'VH': {'V': 0, 'H': 1}})
    .rename(columns={"VH": "Home"})
    .dropna()
)

odds_df

Unnamed: 0,Date,Home,Team,ML,Spread,Over,Win,Win_Spread,Win_Over,season_wins,net_points
0,10/29/14,0,ATL,161,4.5,198.5,0,0,1,0,0
1,11/1/14,1,ATL,-650,-11.0,193.5,1,0,1,0,-7
2,11/5/14,0,ATL,315,8.0,202.0,0,1,0,1,3
3,11/7/14,0,ATL,105,1.5,194.5,0,0,1,1,1
4,11/8/14,1,ATL,-290,-6.5,194.5,1,1,1,1,-2
...,...,...,...,...,...,...,...,...,...,...,...
9835,4/3/18,0,WAS,325,8.5,216.5,0,0,1,42,71
9836,4/5/18,0,WAS,210,6.0,220.5,0,1,1,42,55
9837,4/6/18,1,WAS,-850,-10.5,210.0,0,0,0,42,51
9838,4/10/18,1,WAS,-350,-7.5,204.5,1,1,1,42,45


In [12]:
# Overwrite the CSV with the trimmed table (the index is written as the first column)
odds_df.to_csv('historical_odds.csv')

# Manually add dates

In [16]:
# Reload the odds table and discard its first column (the index that to_csv
# saved alongside the data).
# NOTE(review): the output below shows ISO dates (2014-10-29) while the table
# written earlier showed 10/29/14 - presumably the CSV was edited by hand in
# between, as the heading suggests; confirm, since that step is not reproducible.
cols = [0]
odds_df = pd.read_csv('historical_odds.csv')
odds_df = odds_df.drop(columns=odds_df.columns[cols])
odds_df

Unnamed: 0,Date,Home,Team,ML,Spread,Over,Win,Win_Spread,Win_Over,season_wins,net_points
0,2014-10-29,0,ATL,161,4.5,198.5,0,0,1,0,0
1,2014-11-01,1,ATL,-650,-11.0,193.5,1,0,1,0,-7
2,2014-11-05,0,ATL,315,8.0,202.0,0,1,0,1,3
3,2014-11-07,0,ATL,105,1.5,194.5,0,0,1,1,1
4,2014-11-08,1,ATL,-290,-6.5,194.5,1,1,1,1,-2
...,...,...,...,...,...,...,...,...,...,...,...
9835,2018-04-03,0,WAS,325,8.5,216.5,0,0,1,42,71
9836,2018-04-05,0,WAS,210,6.0,220.5,0,1,1,42,55
9837,2018-04-06,1,WAS,-850,-10.5,210.0,0,0,0,42,51
9838,2018-04-10,1,WAS,-350,-7.5,204.5,1,1,1,42,45


In [17]:
# Per-game box-score statistics, one row per team and game
stats_df = pd.read_csv('../data/nba.games.stats.csv')

# Default (inner) join on team + date: rows without a partner in the other
# table are dropped, and the 'Home' column present in both tables gets
# _x / _y suffixes.
raw_df = odds_df.merge(stats_df, on=['Team', 'Date'])
raw_df

Unnamed: 0,Date,Home_x,Team,ML,Spread,Over,Win,Win_Spread,Win_Over,season_wins,...,Opp.FreeThrows,Opp.FreeThrowsAttempted,Opp.FreeThrows.,Opp.OffRebounds,Opp.TotalRebounds,Opp.Assists,Opp.Steals,Opp.Blocks,Opp.Turnovers,Opp.TotalFouls
0,2014-10-29,0,ATL,161,4.5,198.5,0,0,1,0,...,27,33,0.818,16,48,26,13,9,9,22
1,2014-11-01,1,ATL,-650,-11.0,193.5,1,0,1,0,...,18,21,0.857,11,44,25,5,5,18,26
2,2014-11-05,0,ATL,315,8.0,202.0,0,1,0,1,...,27,38,0.711,11,50,25,7,9,19,15
3,2014-11-07,0,ATL,105,1.5,194.5,0,0,1,1,...,20,27,0.741,11,51,31,6,7,19,30
4,2014-11-08,1,ATL,-290,-6.5,194.5,1,1,1,1,...,8,11,0.727,13,44,26,2,6,15,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9560,2018-04-03,0,WAS,325,8.5,216.5,0,0,1,42,...,18,27,0.667,10,46,26,13,3,9,14
9561,2018-04-05,0,WAS,210,6.0,220.5,0,1,1,42,...,22,28,0.786,5,35,26,10,3,16,14
9562,2018-04-06,1,WAS,-850,-10.5,210.0,0,0,0,42,...,16,23,0.696,7,50,24,5,5,18,22
9563,2018-04-10,1,WAS,-350,-7.5,204.5,1,1,1,42,...,22,27,0.815,13,44,22,14,1,16,18


In [18]:
# Save the merged odds + stats table (index included as the first column)
raw_df.to_csv('first_merge.csv')

In [14]:
# Inspect the raw stats table that was merged above
stats_df

Unnamed: 0.1,Unnamed: 0,Team,Game,Date,Home,Opponent,WINorLOSS,TeamPoints,OpponentPoints,FieldGoals,...,Opp.FreeThrows,Opp.FreeThrowsAttempted,Opp.FreeThrows.,Opp.OffRebounds,Opp.TotalRebounds,Opp.Assists,Opp.Steals,Opp.Blocks,Opp.Turnovers,Opp.TotalFouls
0,1,ATL,1,2014-10-29,Away,TOR,L,102,109,40,...,27,33,0.818,16,48,26,13,9,9,22
1,2,ATL,2,2014-11-01,Home,IND,W,102,92,35,...,18,21,0.857,11,44,25,5,5,18,26
2,3,ATL,3,2014-11-05,Away,SAS,L,92,94,38,...,27,38,0.711,11,50,25,7,9,19,15
3,4,ATL,4,2014-11-07,Away,CHO,L,119,122,43,...,20,27,0.741,11,51,31,6,7,19,30
4,5,ATL,5,2014-11-08,Home,NYK,W,103,96,33,...,8,11,0.727,13,44,26,2,6,15,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9835,78119,WAS,78,2018-04-03,Away,HOU,L,104,120,38,...,18,27,0.667,10,46,26,13,3,9,14
9836,79119,WAS,79,2018-04-05,Away,CLE,L,115,119,47,...,22,28,0.786,5,35,26,10,3,16,14
9837,80119,WAS,80,2018-04-06,Home,ATL,L,97,103,35,...,16,23,0.696,7,50,24,5,5,18,22
9838,81124,WAS,81,2018-04-10,Home,BOS,W,113,101,41,...,22,27,0.815,13,44,22,14,1,16,18


# Random Forest Model

In [None]:
# Label to predict: the 0/1 'Win' column; target_names gives the class
# labels for 0 and 1 in that order.
# NOTE(review): `df` is the frame built before the CSV round trip, not
# `raw_df` / `odds_df` - confirm which table the models are meant to use.
#target = df["Win_PL"]
target = df["Win"]
target_names = ["loss", "win"]

In [None]:
# Feature matrix: every column except the outcome labels. No cell shown in
# this notebook creates 'Win_PL', so absent labels are skipped instead of
# raising KeyError.
# NOTE(review): the remaining columns still include values derived from the
# final score (e.g. 'Point Dif', 'Win_Spread') and text columns such as
# 'Team' - confirm the intended feature set before training.
data = df.drop(columns=['Win', 'Win_PL'], errors='ignore')
feature_names = data.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Candidate forest sizes and split criteria to compare
param_grid = {
    'n_estimators': [180,190,200,210,220],
    'criterion': ['gini','entropy']
}

# Seed the forest (same seed as the train/test split) so the cross-validated
# scores - and therefore best_params_ - are repeatable between runs.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    verbose=True,
    n_jobs=-1,
)

grid.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best cross-validated score
print(grid.best_params_)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest with fixed hyper-parameters.
# NOTE(review): 210 / 'entropy' presumably come from an earlier grid-search
# run - confirm they still match grid.best_params_.
# The seed makes the fitted trees, the test score and the feature
# importances repeatable between runs.
rf = RandomForestClassifier(n_estimators=210, criterion='entropy', random_state=42)
rf = rf.fit(X_train, y_train)

# Mean accuracy on the held-out split
rf.score(X_test, y_test)

In [None]:
# (importance, feature name) pairs, most important first
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Deep Network

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the 0/1 labels for use with the categorical cross-entropy loss
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Two hidden ReLU layers followed by a 2-way softmax (loss / win).
# The input width is taken from the training matrix instead of a hard-coded
# 6, so the network always matches the number of feature columns.
deep_model = Sequential()
deep_model.add(Dense(units=18, activation='relu', input_dim=X_train.shape[1]))
deep_model.add(Dense(units=9, activation='relu'))
deep_model.add(Dense(units=2, activation='softmax'))

In [None]:
# Adam optimiser with categorical cross-entropy (targets are one-hot encoded);
# accuracy is reported during training and evaluation.
deep_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])

In [None]:
# Train for 50 epochs on the training split, reshuffling the rows each epoch;
# verbose=2 prints one summary line per epoch.
deep_model.fit(
    X_train,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2
)

In [None]:
# Loss and accuracy of the trained network on the held-out split
model_loss, model_accuracy = deep_model.evaluate(
    X_test, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
import pandas as pd
import re

In [None]:
# NOTE(review): this address is on pro-football-reference.com and nothing else
# in the notebook uses it - looks like leftover scratch work; confirm before keeping.
url = 'https://www.pro-football-reference.com/years/2007/games.htm'

In [None]:
# Download the page and parse every HTML table on it into a list of DataFrames
# (requires network access).
tables = pd.read_html(url)
tables