# Import Dependencies and data 
#### Read the data using pandas 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from sklearn import tree

In [None]:
# Study data files
data1 = "../data/nba odds 2014-15.csv"
data2 = "../data/nba odds 2015-16.csv"
data3 = "../data/nba odds 2016-17.csv"
data4 = "../data/nba odds 2017-18.csv"

# Read the mouse data and the study results
data1 = pd.read_csv(data1)
data2 = pd.read_csv(data2)
data3 = pd.read_csv(data3)
data4 = pd.read_csv(data4)

# Merge all the different seasons 

In [None]:
def set_win(row):
    if row['Point Dif'] > 0:
        return 1
    else:
        return 0
    
def set_spread_win(row):
    handi = row['Point Dif'] + row['Spread']
    if handi > 0:
        return 1
    else:
        return 0
    
def set_ou_win(row):
    if row['Game Total'] > row['Over']:
        return 1
    else:
        return 0

def data_clean(df):
    df = df.replace({'Team': {'Atlanta':'ATL','Boston':'BOS','Brooklyn':'BRK',
                              'Charlotte':'CHO','Chicago':'CHI','Cleveland':'CLE',
                              'Dallas':'DAL','Denver':'DEN','Detroit':'DET',
                              'GoldenState':'GSW','Houston':'HOU','Indiana':'IND',
                              'LAClippers':'LAC','LALakers':'LAL','Memphis':'MEM',
                              'Miami':'MIA','Milwaukee':'MIL','Minnesota':'MIN',
                              'NewOrleans':'NOP','NewYork':'NYK','OklahomaCity':'OKC',
                              'Orlando':'ORL','Philadelphia':'PHI','Phoenix':'PHO',
                              'Portland':'POR','Sacramento':'SAC','SanAntonio':'SAS',
                              'Toronto':'TOR','Utah':'UTA','Washington':'WAS'
                    }})

    p_dif = []
    p_total = []
    spread = []
    over = []

    # Iterate through table and calculate point dif and winners and insert into list 
    for i in range(1, df.shape[0], 2):
        home_dif = df['Final'].iloc[i] - df['Final'].iloc[i-1]
        away_dif = df['Final'].iloc[i-1] - df['Final'].iloc[i]
        total = df['Final'].iloc[i] + df['Final'].iloc[i-1]
        if df['Close'].iloc[i] == 'pk':
            spread.append(0)
            spread.append(0)
            over.append(float(df['Close'].iloc[i-1]))
            over.append(float(df['Close'].iloc[i-1]))
        elif df['Close'].iloc[i-1] == 'pk':
            spread.append(0)
            spread.append(0)
            over.append(float(df['Close'].iloc[i]))
            over.append(float(df['Close'].iloc[i]))
        elif df['ML'].iloc[i] < df['ML'].iloc[i-1]:
            spread.append(float(df['Close'].iloc[i]))
            spread.append(-float(df['Close'].iloc[i]))
            over.append(float(df['Close'].iloc[i-1]))
            over.append(float(df['Close'].iloc[i-1]))
        elif df['ML'].iloc[i] > df['ML'].iloc[i-1]:
            spread.append(-float(df['Close'].iloc[i-1]))
            spread.append(float(df['Close'].iloc[i-1]))
            over.append(float(df['Close'].iloc[i]))
            over.append(float(df['Close'].iloc[i]))
        else:
            spread.append(0)
            spread.append(0)
            over.append(float(df['Close'].iloc[i-1]))
            over.append(float(df['Close'].iloc[i-1]))

        p_total.append(total)
        p_total.append(total)
        p_dif.append(away_dif)
        p_dif.append(home_dif)
            
    # Add point dif column and data         
    df['Point Dif'] = p_dif
    df['Game Total'] = p_total
    df['Spread'] = spread
    df['Over'] = over
    df = df.reset_index()
    
    # add the result column 
    df = df.assign(Win=df.apply(set_win, axis=1))
    df = df.assign(Win_Spread=df.apply(set_spread_win, axis=1)) 
    df = df.assign(Win_Over=df.apply(set_ou_win, axis=1)) 
    
    point_dict = {}
    net_points = []
    wd = {}
    w = []    

    for i in df.iterrows():
    #point difference
        if i[1]['Team'] in point_dict:
            net_points.append(point_dict[i[1]['Team']])
            point_dict[i[1]['Team']] = point_dict[i[1]['Team']] + int(i[1]['Point Dif'])
            w.append(wd[i[1]['Team']])
            wd[i[1]['Team']] = wd[i[1]['Team']] + int(i[1]['Win'])
        else:
            point_dict[i[1]['Team']] = int(i[1]['Point Dif'])
            net_points.append(0)
            wd[i[1]['Team']] = int(i[1]['Win'])
            w.append(0)
    
    df['season_wins'] = w        
    df['net_points'] = net_points
    df = df.sort_values(by=['Team','season_wins','Date'])
    return df

In [None]:
clean1 = data_clean(data1)
clean2 = data_clean(data2)
clean3 = data_clean(data3)
clean4 = data_clean(data4)

frames = [clean1,clean2,clean3,clean4]
    
df = pd.concat(frames)
df.drop(df.columns[23:],axis=1,inplace=True)
df

In [None]:
df.to_csv('historical_odds.csv')

In [None]:
odds_df = pd.read_csv('historical_odds.csv')
odds_df

# Cleaning Up the Dataframe

In [None]:
cols = [0,1,3,6,7,8,9,10,11,12,14,15,16]
odds_df.drop(odds_df.columns[cols],axis=1,inplace=True)
odds_df

In [None]:
odds_df = odds_df.replace({'VH': {'V': 0, 'H': 1, 'N':0}})
odds_df = odds_df.rename(columns={"VH": "Home"})
odds_df = odds_df.dropna()

odds_df

In [None]:
odds_df.to_csv('historical_odds.csv')

# Manually add dates

In [None]:
odds_df = pd.read_csv('historical_odds.csv')
cols = [0]
odds_df.drop(odds_df.columns[cols],axis=1,inplace=True)
odds_df

In [None]:
stats_df = pd.read_csv('../data/nba.games.stats.csv')
raw_df = pd.merge(odds_df,stats_df,on=['Team','Date'])
raw_df

In [None]:
raw_df.to_csv('first_merge.csv')

In [None]:
# team_dict = {}
# avg_for_fg = []
# avg_for_fgp = []
# avg_opp_fgp = []
# avg_for_ft = []
# avg_for_ftp = []
# avg_opp_ft_att = []
# for_3p = []
# avg_for_3p = []
# avg_opp_3p = []
# avg_for_blk = []
# avg_opp_blk = []
# avg_for_to = []
# avg_opp_to = []
# ttl_blk = []

raw_df['avg_for_fg'] = raw_df['FieldGoals'].rolling(2).mean()
raw_df['avg_for_fgp'] = raw_df['FieldGoals.'].rolling(2).mean()
raw_df['avg_opp_fgp'] = raw_df['Opp.FieldGoals.'].rolling(2).mean()
raw_df['avg_for_3p'] = raw_df['X3PointShots'].rolling(2).mean()
raw_df['avg_for_3pp'] = raw_df['X3PointShots.'].rolling(2).mean()
raw_df['avg_opp_3pp'] = raw_df['Opp.3PointShots.'].rolling(2).mean()
raw_df['avg_for_ft'] = raw_df['FreeThrows'].rolling(2).mean()
raw_df['avg_for_ftp'] = raw_df['FreeThrows.'].rolling(2).mean()
raw_df['avg_opp_fta'] = raw_df['Opp.FreeThrowsAttempted'].rolling(2).mean()
raw_df['avg_for_rb'] = raw_df['TotalRebounds'].rolling(2).mean()
raw_df['avg_opp_rb'] = raw_df['Opp.TotalRebounds'].rolling(2).mean()
raw_df['avg_for_blk'] = raw_df['Blocks'].rolling(2).mean()
raw_df['avg_opp_blk'] = raw_df['Opp.Blocks'].rolling(2).mean()
raw_df['avg_for_to'] = raw_df['Turnovers'].rolling(2).mean()
raw_df['avg_opp_to'] = raw_df['Opp.Turnovers'].rolling(2).mean()


# for i in raw_df.iterrows():
# #point difference
#     if i[1]['Team'] in team_dict:
#         for_fg.append(point_dict[i[1]['Team']]['for_fg'])
#         team_dict[i[1]['Team']]['for_fg'] = team_dict[i[1]['Team']]['for_fg'] + int(i[1]['FieldGoals'])
#         for_3p.append(point_dict[i[1]['Team']]['for_3p'])
#         team_dict[i[1]['Team']]['for_3p'] = team_dict[i[1]['Team']]['for_3p'] + int(i[1]['X3PointShots'])
#         ttl_blk.append(point_dict[i[1]['Team']]['ttl_blk'])
#         team_dict[i[1]['Team']]['ttl_blk'] = team_dict[i[1]['Team']]['ttl_blk'] + int(i[1]['Blocks'])
#         ttl_blk.append(point_dict[i[1]['Team']]['ttl_blk'])
#         team_dict[i[1]['Team']]['ttl_blk'] = team_dict[i[1]['Team']]['ttl_blk'] + int(i[1]['Blocks'])
#     else:
#         game_cnt = 1
#         point_dict[i[1]['Team']] = int(i[1]['Point Dif'])
#         net_points.append(0)
#         wd[i[1]['Team']] = int(i[1]['Win'])
#         w.append(0)
        
raw_df

In [None]:
raw_df = raw_df.rename(columns={"Home_x":"home"})

raw_df = raw_df[["Team","home", "ML","Spread","Over","season_wins", "net_points", "avg_for_fg",
              "avg_for_fgp", "avg_opp_fgp","avg_for_3p","avg_for_3pp","avg_opp_3pp","avg_for_ft",
                "avg_for_ftp","avg_opp_fta","avg_for_rb","avg_opp_rb","avg_for_blk","avg_opp_blk",
                 "avg_for_to","avg_opp_to","Win","Win_Spread","Win_Over"]]

In [None]:
raw_df

In [None]:
final_df = raw_df.dropna()
final_df = final_df.reset_index()
cols = [0]
final_df.drop(final_df.columns[cols],axis=1,inplace=True)
final_df

In [None]:
cols = [0]
final_df.drop(final_df.columns[cols],axis=1,inplace=True)
final_df.to_csv('model_data_set.csv')
final_df

# Random Forest Model

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from sklearn import tree

In [2]:
final_df = pd.read_csv('model_data_set.csv')
cols = [0]
final_df.drop(final_df.columns[cols],axis=1,inplace=True)
#final_df

In [4]:
# target = final_df["Win_Over"]
# target = final_df["Win_Spread"]
target = final_df["Win"]
target_names = ["loss", "win"]

In [5]:
data = final_df.drop(columns=['Win', 'Win_Spread', 'Win_Over'],axis=1)
feature_names = data.columns

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [230,240,250,260,270],
    'criterion': ['gini','entropy']
}

grid = GridSearchCV(RandomForestClassifier(), param_grid, verbose=True, n_jobs=-1)

grid.fit(X_train, y_train)

In [None]:
print(grid.best_params_)

In [7]:
#win
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200,criterion='entropy')
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.7712254286909243

In [None]:
#winspread
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=170,criterion='gini')
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
#winover
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=260,criterion='entropy')
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [8]:
from joblib import dump, load
dump(rf, 'straight_model.joblib') 

['straight_model.joblib']