### Production code
- **Things I hope to improve**
    - Write a scraper for the games being predicted today, and combine the results into a structure that can be easily compared.
    - Make it much easier to go through different models --> work on the player-specific analyses.

In [1]:
import os
import glob
import gspread
import warnings
import datetime
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.externals import joblib

from sklearn.linear_model import LogisticRegression
pd.options.display.float_format = '{:,.2f}'.format
from sklearn import metrics
from prod_funs import *

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

### Load the model used for prediction

In [2]:
# Remember the notebook's starting directory; later cells chdir away from it and back.
base_folder = os.path.abspath(os.getcwd())
# Serialized model file, resolved relative to the current working directory.
filename = 'Production_model_v2.sav'
# joblib.load unpickles the file, which can execute arbitrary code — only load trusted files.
loaded_model = joblib.load(filename)

### Load data from existing season

In [5]:
# Rewrite every CSV whose name ends in "2017.csv" in place, with exact duplicate rows removed.
for csv_path in glob.glob("*2017.csv"):
    season_frame = pd.read_csv(csv_path)
    deduplicated = season_frame.drop_duplicates()
    deduplicated.to_csv(csv_path, index=False)

In [7]:
# Start from an empty frame; the processing cell below adds this season's home-team rows to it.
final_table = pd.DataFrame()
# load_all_files comes from the prod_funs star import; it returns five tables for the given season.
home_csv, teams_csv, more_stats_csv, game_date_csv, win_loss_csv = load_all_files("2017")

# Processing this season's games for prediction!

In [8]:
# All create_* / add_* / filter_* helpers come from the prod_funs star import; their
# internals are not visible in this notebook, so the step notes below follow their names only.

# Home/away flag -> game date -> per-game team stats.
merge_home_away = create_home_away_var(home_csv).drop_duplicates()
merge_date = pd.merge(merge_home_away, create_date_variable(game_date_csv))
merge_team_stats = add_game_stats(teams_csv.drop_duplicates(), merge_date)

# Game counter, days since previous game, opponent stats, win/loss indicators.
merge_game_count = create_game_count_var(merge_team_stats)
merge_previous_date = create_days_from_previous_games_var(merge_game_count)
merge_opp = create_opp_stats(merge_previous_date)
merge_opp = create_win_loss_vars(merge_opp)

# Every remaining column is accumulated; identifiers, labels and counters are excluded.
var_to_accum = merge_opp.drop(
    ["GAME_ID", "TEAM_ID_x", "TEAM_ID_y", "TEAM_NAME_x", "TEAM_NAME_y", "G_x", "G_y",
     "Home", "GAME_DATE", "WL_x", 'p_games_x', 'p_games_y', "SEASON", "LIVE_PERIOD"],
    axis=1).columns

# Potential for change: right now the entire season is used for the stats calculations.
# It might be more accurate to use rolling-window calculations instead.
# Create accumulative variables (running totals per team, in date order).
merge_total = merge_opp.sort_values(["TEAM_NAME_x", "GAME_DATE"])
merge_total[var_to_accum] = merge_total.groupby('TEAM_ID_x')[var_to_accum].transform(pd.Series.cumsum)

# Create NBA domain-knowledge variables
merge_total = create_rebs_efficiency_vars(create_fta_to_fga_ratio(create_efg_var(merge_total)))

# Create percentage variables:
# divide the cumulative stats of these variables by the games played so far to get averages.
vars_to_average = ["OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PTS", "W", "L"]
merge_total_ave = create_shooting_percentage_vars(create_averages(merge_total, vars_to_average))
# merge_games_shifted = shift_game_stats_down_by_one(merge_total_ave)
final_t_ab_opp = create_team_ab_and_opp_table(merge_total_ave)
finals_home = filter_home_teams(final_t_ab_opp)

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent and works on old versions too.
# NOTE: re-running this cell without re-running the cell that resets final_table adds the rows twice.
final_table = pd.concat([final_table, finals_home])

### Note on which day to predict!
- I have predictions on today + tomorrow's games. 

In [9]:
# NOTE(review): date_to_remove is not assigned in any visible cell — presumably it arrives via
# `from prod_funs import *`; confirm, or define it explicitly so Restart & Run All works.
date_to_remove.strftime('%Y-%m-%d')

'2017-12-01'

### Functions for predictions! 

In [10]:
# game_id plus every column whose name contains "_ta".
team_a_columns = ['game_id'] + [name for name in final_table.columns if '_ta' in name]

# Per-game stats keyed for use as "team a".
ta_table = final_table[team_a_columns].copy()

# The same rows relabelled with "_tb" names so they can be merged in as "team b".
tb_table = final_table[team_a_columns].copy()
tb_table.columns = [
    name.replace("_ta_opp", "_tb_opp").replace("_ta", "_tb")
    for name in tb_table.columns
]

In [11]:
def making_predict_table(predict_table, table, ta_table, tb_table):
    """Attach each scheduled team's most recent stats to the schedule rows.

    Parameters
    ----------
    predict_table : DataFrame
        Scheduled games with full team names in 'home' and 'away'. Not modified.
    table : DataFrame
        Feature table whose column names decide which "_ta"/"_la" and "_tb"/"_lb"
        columns are merged in.
    ta_table, tb_table : DataFrame
        Per-game stats keyed by 'team_name_ta' / 'team_name_tb', each with a 'game_id'.

    Returns
    -------
    DataFrame
        A copy of ``predict_table`` with nickname keys, both teams' latest stats,
        points-per-assist ratios and win-rate-times-games columns added.
    """
    # Work on a copy so the caller's frame (often a slice of the schedule) is left untouched.
    predict_table = predict_table.copy()

    # Nickname = last word of the full name; Portland's two-word nickname is patched back.
    # Assigning the finished list avoids chained replace(inplace=True), which is not
    # guaranteed to write through to the frame.
    for key, source in (("team_name_ta", "home"), ("team_name_tb", "away")):
        predict_table[key] = [
            "Trail Blazers" if last_word == "Blazers" else last_word
            for last_word in (full_name.split(" ")[-1] for full_name in predict_table[source])
        ]

    # Highest game_id first, then keep one row per team: that team's latest game.
    # NOTE(review): rows are taken from ta_table/tb_table as passed; `table` is only used for
    # column names, so filtering `table` does not remove games from these lookups.
    home_teams = predict_table['team_name_ta'].unique().tolist()
    team_a_latest_stats = (
        ta_table[ta_table["team_name_ta"].isin(home_teams)]
        .sort_values(["game_id", 'team_name_ta'], ascending=[False, False])
        .drop_duplicates(['team_name_ta'])
    )
    away_teams = predict_table['team_name_tb'].unique().tolist()
    team_b_latest_stats = (
        tb_table[tb_table["team_name_tb"].isin(away_teams)]
        .sort_values(["game_id", 'team_name_tb'], ascending=[False, False])
        .drop_duplicates(['team_name_tb'])
    )

    # Column names come from the `table` argument rather than a notebook global.
    team_a_columns = ([c for c in table.columns if "_ta" in c]
                      + [c for c in table.columns if "_la" in c])
    team_b_columns = ([c for c in table.columns if "_tb" in c]
                      + [c for c in table.columns if "_lb" in c])

    # Left merges keep every scheduled game, even when a team has no stats row.
    predict_table = pd.merge(
        predict_table, team_a_latest_stats[team_a_columns], on='team_name_ta', how='left')
    predict_table = pd.merge(
        predict_table, team_b_latest_stats[team_b_columns], on='team_name_tb', how='left')

    predict_table["pts_ast_ta"] = predict_table['pts_ta'] / predict_table['ast_ta']
    predict_table["pts_ast_tb"] = predict_table['pts_tb'] / predict_table['ast_tb']

    # Opponent ratios use the *_opp columns (previously they duplicated the team's own ratio).
    predict_table["pts_ast_ta_opp"] = predict_table['pts_ta_opp'] / predict_table['ast_ta_opp']
    predict_table["pts_ast_tb_opp"] = predict_table['pts_tb_opp'] / predict_table['ast_tb_opp']

    predict_table['game_win_rates_ta'] = predict_table["w_rate_ta"] * predict_table['g_ta']
    predict_table['game_win_rates_tb'] = predict_table["w_rate_tb"] * predict_table['g_tb']
    return predict_table

### Calculating player specific stuff 

In [12]:
# Per-player season table; no later visible cell reads it.
player_season_stats = pd.read_csv("player_season_statistics.csv")
# Preview the first rows for season 2016.
player_season_stats[player_season_stats.season==2016].head()

Unnamed: 0,season,PLAYER_ID,PLAYER_NAME,PTS,AST,BLK,REB,STL,games_played,pts_l,ast_l,blk_l,reb_l,stl_l,type,defenders,facilitator,game_winners,inside_gamers,normal,pure_scorers,useless
10316,2016,200826,J.J. Barea,10.89,5.51,0.03,2.4,0.4,41,0,1,0,0,0,facilitator,0,1,0,0,0,0,0
10317,2016,1626159,Justise Winslow,10.89,3.67,0.33,5.22,1.44,41,0,0,0,0,1,defenders,1,0,0,0,0,0,0
10318,2016,1627735,Wade Baldwin IV,3.21,1.85,0.21,1.39,0.55,41,0,0,0,0,0,useless,0,0,0,0,0,0,1
10319,2016,1627786,Troy Williams,6.17,0.83,0.33,2.3,0.9,41,0,0,0,0,0,useless,0,0,0,0,0,0,1
10320,2016,1627855,Okaro White,2.8,0.6,0.29,2.34,0.29,41,0,0,0,0,0,useless,0,0,0,0,0,0,1


### Need code to integrate player specific data

### Continue from existing code 

In [15]:
def making_predictions(predictions, model):
    """Score scheduled games with the loaded classifier.

    Parameters
    ----------
    predictions : DataFrame
        Output of ``making_predict_table``; must hold the schedule columns and every
        variable named in the formula. ``for_prediction`` adds a 'wl_ta' column to it.
    model : str
        Patsy right-hand-side formula listing the model's variables. Callers in this
        notebook pass ``model_9_ratio_vars``. A non-string value falls back to the
        notebook-level ``model_9_ratio_vars`` (the previous behaviour).

    Returns
    -------
    DataFrame
        Schedule columns plus 'home_w' (1 when 'phome_w' > 0.5, else 0) and 'phome_w'.
    """
    # Honour the formula that was passed in instead of silently using the global.
    formula = model if isinstance(model, str) else model_9_ratio_vars
    final_predict = for_prediction(predictions, formula)

    # Keep the design matrix's index so each probability stays attached to its own game,
    # even when the design matrix holds fewer rows than `predictions`.
    # NOTE(review): assumes predict_proba returns two columns with the home-win class second.
    proba = pd.DataFrame(loaded_model.predict_proba(final_predict), index=final_predict.index)
    proba.columns = ['home_w', "phome_w"]
    proba["home_w"] = (proba["phome_w"] > .5).astype(int)

    # Only games that made it into the design matrix can be scored.
    game_info = predictions.loc[
        final_predict.index, ["away", "home", "arena", "location", "time", "d_filter"]]
    results = pd.concat([game_info, proba], axis=1)
    results['home_w'] = results['home_w'].astype(int)
    return results

In [16]:
def today_games_results(table, game_date_result):
    """Pair up the two team rows of every game played on ``game_date_result``.

    ``table`` needs 'GAME_DATE_EST', 'GAME_ID', 'TEAM_NICKNAME' and 'PTS'. Within each
    GAME_ID the first row becomes the "_ta" side and the last row the "_tb" side.
    The returned 'predictions' column is 1 when pts_ta > pts_tb and 0 otherwise.
    The input frame is not modified.
    """
    games = table.assign(game_date=pd.to_datetime(table['GAME_DATE_EST']))
    on_day = games.loc[
        games['game_date'] == game_date_result,
        ["GAME_ID", "TEAM_NICKNAME", "PTS", "game_date"],
    ]

    # First and last row of each game, joined side by side.
    per_game = on_day.groupby(['GAME_ID'])
    first_rows = per_game.first().reset_index()
    last_rows = per_game.last().reset_index()
    paired = pd.merge(first_rows, last_rows, on=["GAME_ID", "game_date"])

    paired.columns = ['game_id', 'team_name_ta', 'pts_ta', 'game_date', 'team_name_tb', 'pts_tb']
    paired = paired[['game_id', 'game_date', 'team_name_ta', 'team_name_tb', 'pts_ta', 'pts_tb']]
    paired["predictions"] = (paired['pts_ta'] > paired['pts_tb']).astype('int64')
    return paired

### Execute predictions 

In [17]:
def for_prediction(dataframe, var):
    """Build the design matrix for the formula terms in ``var``.

    A constant 'wl_ta' column is written into ``dataframe`` (the caller's frame is
    modified) so the formula has a left-hand side; only the right-hand-side matrix
    is returned.
    """
    # Placeholder response column: dmatrices is called with a two-sided formula.
    dataframe['wl_ta'] = 1
    formula = 'wl_ta ~ ' + var
    _, design_matrix = dmatrices(formula, dataframe, return_type="dataframe")
    return design_matrix

### The parameters of the best model! 

In [18]:
# Patsy right-hand-side formula handed to for_prediction. In patsy, `a * b` expands to
# a + b + a:b, so the w_rate/g pairs contribute an interaction term as well.
# NOTE(review): the *_la / *_lb and defenders_a ... pure_scorers_b terms are not created by
# any visible cell (see "Need code to integrate player specific data") — confirm their source.
model_9_ratio_vars = '''
w_rate_ta * g_ta + w_rate_tb * g_tb + g_ta + g_tb + p_games_ta + p_games_tb + 
pts_ta + oreb_ta + dreb_ta + ast_ta + stl_ta + blk_ta + to_ta +
pts_tb + oreb_tb + dreb_tb + ast_tb + stl_tb + blk_tb + to_tb + 
pts_ta_opp + oreb_ta_opp + dreb_ta_opp + ast_ta_opp + stl_ta_opp + blk_ta_opp + to_ta_opp +
pts_tb_opp + oreb_tb_opp + dreb_tb_opp + ast_tb_opp + stl_tb_opp + blk_tb_opp + to_tb_opp +

efg_ta + fgp_ta + efg_ta_opp + fgp_ta_opp + fta_fga_ta + fta_fga_ta_opp + fg3p_ta + ftp_ta + 
efg_tb + fgp_tb + efg_tb_opp + fgp_tb_opp + fta_fga_tb + fta_fga_tb_opp + fg3p_tb + ftp_tb +
pts_la + pts_lb + ast_la + blk_la + reb_la + stl_la + ast_lb + blk_lb + reb_lb + stl_lb +
defenders_a + facilitator_a + game_winners_a + inside_gamers_a + pure_scorers_a +
defenders_b + facilitator_b + game_winners_b + inside_gamers_b + pure_scorers_b
'''

### Predictions for today's games!

In [19]:
# Cast game_id to int in place. ta_table / tb_table were copied from final_table in an
# earlier cell, so they keep whatever dtype game_id had at that point.
final_table['game_id'] = final_table['game_id'].astype(int)

In [20]:
# Drop out today's games for today's predictions
today = final_table[final_table.game_date!=date_to_remove]
today.sort_values(['game_id', 'team_name_ta'], ascending=[True, True]).tail(3)

Unnamed: 0,ast_ta,blk_ta,dreb_ta,fg3a_ta,fg3m_ta,fga_ta,fgm_ta,fta_ta,ftm_ta,game_id,oreb_ta,pf_ta,pts_ta,reb_ta,stl_ta,team_id_ta,team_name_ta,to_ta,home,game_date,g_ta,p_games_ta,ast_ta_opp,blk_ta_opp,dreb_ta_opp,fg3a_ta_opp,fg3m_ta_opp,fga_ta_opp,fgm_ta_opp,fta_ta_opp,ftm_ta_opp,oreb_ta_opp,pf_ta_opp,pts_ta_opp,reb_ta_opp,stl_ta_opp,to_ta_opp,season,live_period,g_ta_opp,p_games_ta_opp,wl_ta,w_rate_ta,l_ta,w_ta_opp,l_ta_opp,efg_ta,fgp_ta,efg_ta_opp,fgp_ta_opp,fta_fga_ta,fta_fga_ta_opp,oreb_p_ta,dreb_p_ta,oreb_p_ta_opp,dreb_p_ta_opp,fg3p_ta_opp,ftp_ta_opp,fg3p_ta,ftp_ta,ast_tb,blk_tb,dreb_tb,fg3a_tb,fg3m_tb,fga_tb,fgm_tb,fta_tb,ftm_tb,oreb_tb,pf_tb,pts_tb,reb_tb,stl_tb,team_id_tb,team_name_tb,to_tb,g_tb,p_games_tb,ast_tb_opp,blk_tb_opp,dreb_tb_opp,fg3a_tb_opp,fg3m_tb_opp,fga_tb_opp,fgm_tb_opp,fta_tb_opp,ftm_tb_opp,oreb_tb_opp,pf_tb_opp,pts_tb_opp,reb_tb_opp,stl_tb_opp,to_tb_opp,g_tb_opp,p_games_tb_opp,w_rate_tb,l_tb,w_tb_opp,l_tb_opp,efg_tb,fgp_tb,efg_tb_opp,fgp_tb_opp,fta_fga_tb,fta_fga_tb_opp,oreb_p_tb,dreb_p_tb,oreb_p_tb_opp,dreb_p_tb_opp,fg3p_tb_opp,ftp_tb_opp,fg3p_tb,ftp_tb
339,23.58,4.53,33.16,556,207,1638,764,436,314,21700274,12.26,17.63,107.84,45.42,8.26,1610612743,Nuggets,16.16,Home,2017-11-24,19,2.0,24.58,4.68,31.53,579,221,1622,770,350,262,9.47,21.11,106.47,41.0,9.21,16.11,2017,4,18,2.0,1,0.58,0.42,0.42,0.58,0.53,0.47,0.54,0.47,0.27,0.22,0.28,0.78,0.22,0.72,0.38,0.75,0.37,0.72,21.11,5.78,32.39,513,166,1453,649,413,325,7.44,22.39,99.39,39.83,7.5,1610612763,Grizzlies,13.39,18,2.0,20.06,5.39,34.44,526,184,1479,648,434,339,10.28,21.61,101.06,44.72,6.78,15.06,19,2.0,0.39,0.61,0.61,0.39,0.5,0.45,0.5,0.44,0.28,0.29,0.18,0.76,0.24,0.82,0.35,0.78,0.32,0.79
474,19.15,5.55,35.5,517,167,1800,789,528,395,21700275,11.6,23.55,107.0,47.1,7.15,1610612756,Suns,16.2,Home,2017-11-24,20,2.0,22.0,5.1,36.55,562,210,1815,843,559,418,10.85,23.55,115.7,47.4,8.65,14.15,2017,4,19,2.0,0,0.35,0.65,0.65,0.35,0.48,0.44,0.52,0.46,0.29,0.31,0.24,0.77,0.23,0.76,0.37,0.75,0.32,0.75,26.0,4.32,35.32,558,190,1620,780,418,318,8.89,20.26,108.84,44.21,8.21,1610612740,Pelicans,15.95,19,2.0,22.74,4.47,33.74,585,212,1675,765,402,315,10.21,21.32,108.26,43.95,8.68,15.42,20,2.0,0.58,0.42,0.42,0.58,0.54,0.48,0.52,0.46,0.26,0.24,0.21,0.78,0.22,0.79,0.36,0.78,0.34,0.76
551,30.11,9.0,37.11,599,240,1630,836,396,326,21700276,8.68,21.37,117.79,45.79,8.63,1610612744,Warriors,16.47,Home,2017-11-24,19,2.0,23.16,3.79,31.63,579,197,1728,740,462,352,11.21,17.95,106.79,42.84,9.37,14.63,2017,4,17,2.0,1,0.74,0.26,0.26,0.74,0.59,0.51,0.49,0.43,0.24,0.27,0.22,0.77,0.23,0.78,0.34,0.76,0.4,0.82,21.12,3.18,34.71,532,177,1493,606,281,216,10.0,18.76,94.41,44.71,6.71,1610612741,Bulls,14.12,17,2.0,25.06,4.0,38.0,508,196,1443,675,349,278,8.29,16.29,107.29,46.29,7.82,12.35,19,2.0,0.18,0.82,0.82,0.18,0.47,0.41,0.54,0.47,0.19,0.24,0.21,0.81,0.19,0.79,0.39,0.8,0.33,0.77


### Load new day games for today's predictions

In [21]:
schedule = pd.read_csv("2017_2018_nba_schedule.csv")

# Schedule dates look like "Saturday, November 25"; reduce them to a "dd-mm" key.
schedule['d_filter'] = [
    datetime.datetime.strptime(day_label, "%A, %B %d").strftime("%d-%m")
    for day_label in schedule['date']
]

# Keep only the games whose key matches the prediction date.
d_filter = date_to_remove.strftime("%d-%m")
is_target_day = schedule['d_filter'] == d_filter
predictions = schedule[is_target_day]
predictions

Unnamed: 0,away,home,arena,location,time,date,d_filter
276,San Antonio Spurs,Charlotte Hornets,Spectrum Center,"Charlotte, NC",7:00 PM ET,"Saturday, November 25",25-11
277,Orlando Magic,Philadelphia 76ers,Wells Fargo Center,"Philadelphia, PA",7:00 PM ET,"Saturday, November 25",25-11
278,Portland Trail Blazers,Washington Wizards,Capital One Arena,"Washington, DC",7:00 PM ET,"Saturday, November 25",25-11
279,Toronto Raptors,Atlanta Hawks,Philips Arena,"Atlanta, GA",7:30 PM ET,"Saturday, November 25",25-11
280,Boston Celtics,Indiana Pacers,Bankers Life Fieldhouse,"Indianapolis, IN",8:00 PM ET,"Saturday, November 25",25-11
281,New York Knicks,Houston Rockets,Toyota Center,"Houston, TX",8:00 PM ET,"Saturday, November 25",25-11
282,Oklahoma City Thunder,Dallas Mavericks,American Airlines Center,"Dallas, TX",8:30 PM ET,"Saturday, November 25",25-11
283,New Orleans Pelicans,Golden State Warriors,ORACLE Arena,"Oakland, CA",8:30 PM ET,"Saturday, November 25",25-11
284,Milwaukee Bucks,Utah Jazz,Vivint Smart Home Arena,"Salt Lake City, UT",9:00 PM ET,"Saturday, November 25",25-11
285,LA Clippers,Sacramento Kings,Golden 1 Center,"Sacramento, CA",10:00 PM ET,"Saturday, November 25",25-11


In [22]:
# Join the latest team stats onto today's schedule, then score the games.
# The second argument of making_predictions is the formula string, not the fitted model;
# the classifier itself is the notebook-level loaded_model.
today_predict_table = making_predict_table(predictions, today, ta_table, tb_table)    
today_predict = making_predictions(today_predict_table, model_9_ratio_vars)
today_predict

ValueError: X has 55 features per sample; expecting 102

### Today's actual games played!

In [None]:
# Actual results for the prediction date; its 'predictions' column is the real outcome
# (1 when pts_ta > pts_tb), despite the name.
today_games = today_games_results(win_loss_csv, date_to_remove)
today_games

### Comparing final results with output! 

In [None]:
# Rebuild the nickname merge keys (last word of the full team name) on the prediction output.
today_predict['team_name_ta'] = [i.split(" ")[-1] for i in today_predict["home"]]
today_predict['team_name_tb'] = [i.split(" ")[-1] for i in today_predict["away"]]
# Portland's nickname is two words. Assign the replaced column back instead of calling
# replace(inplace=True) on a selected column, which does not update the frame under
# pandas Copy-on-Write.
today_predict['team_name_ta'] = today_predict['team_name_ta'].replace("Blazers", "Trail Blazers")
today_predict['team_name_tb'] = today_predict['team_name_tb'].replace("Blazers", "Trail Blazers")

In [None]:
# Left merge keeps every predicted game; games with no recorded result get NaN outcome columns.
report_columns = ["away", 'home', 'location', 'home_w', "phome_w", 'predictions', 'd_filter', 'pts_ta', 'pts_tb']
merged_results = pd.merge(today_predict, today_games, on=['team_name_ta', 'team_name_tb'], how='left')
ffinal = merged_results[report_columns]
# 1 when the predicted winner equals the actual outcome, 0 otherwise (including missing results).
ffinal['p_score'] = (ffinal['predictions'] == ffinal['home_w']).astype('int64')
ffinal

In [None]:
# Share of games predicted correctly; raises ZeroDivisionError when ffinal is empty.
ffinal['p_score'].sum() / float(len(ffinal))

In [None]:
# Written relative to the current working directory; the "predict" folder must already exist.
# NOTE(review): the file name embeds str(date_to_remove) — if that is a datetime/Timestamp the
# name will include a time component; confirm the type.
ffinal.to_csv("predict/{}_predictions.csv".format(date_to_remove), index=False)

### Load prediction into Dropbox 

In [None]:
# The upload helper is given a bare file name, so run it from inside the predict folder.
# NOTE(review): `account` and load_file_into_my_dropbox are not defined in any visible cell —
# presumably they come from prod_funs; confirm.
prediction_file = "{}_predictions.csv".format(date_to_remove)
os.chdir(os.path.join(base_folder, "predict"))
try:
    load_file_into_my_dropbox(account, prediction_file, folder="nba games/predictions/")
    print("Upload {}_predictions.csv".format(date_to_remove))
finally:
    # Always return to the notebook folder, even when the upload raises, so later
    # relative paths keep working.
    os.chdir(base_folder)

## Next game's predictions!

In [None]:
# "dd-mm" key for the day after the prediction date, matched against schedule['d_filter'].
next_day = (date_to_remove + datetime.timedelta(days=1)).strftime("%d-%m")
next_day_predict = schedule[schedule['d_filter'] == next_day]
next_day_predict

In [None]:
# Tomorrow's games use the full final_table (nothing filtered out) as the feature table.
next_day_predict_table = making_predict_table(next_day_predict, final_table, ta_table, tb_table)

In [None]:
# Score tomorrow's games with the same formula string used for today's.
tmr_predict = making_predictions(next_day_predict_table, model_9_ratio_vars)
tmr_predict

In [None]:
# Move to the parent of the notebook folder and write the HTML table there.
# The working directory is left at that parent afterwards.
os.chdir(base_folder)
os.chdir("..")
# Not the last expression in the cell, so this value is not displayed.
os.getcwd()
tmr_predict.to_html('predictions.html')