### 2. Production code
- **This is the workflow I use to push newly scraped data into my prediction model and, from there, output my predictions.**
- The data must be updated with at least the latest games played.

In [1]:
import os
import glob
import gspread
import warnings
import datetime
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.externals import joblib

from sklearn.linear_model import LogisticRegression
pd.options.display.float_format = '{:,.2f}'.format
from sklearn import metrics
from prod_funs import *

# Silence every warning category so library notices do not clutter the cell output.
warnings.filterwarnings('ignore')
# Let displayed DataFrames show up to 500 rows and 500 columns.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

### Load Prediction Model

In [2]:
# Remember the directory the notebook starts in; a later cell changes into
# base_folder + "/predict" for the upload and then returns here.
base_folder = os.path.abspath(os.getcwd())
# The model file is looked up relative to the current working directory.
filename = 'First_production_model.sav'
# NOTE(review): joblib files are pickle-based — only load model files from a trusted source.
loaded_model = joblib.load(filename)

### Dropbox Stuff 
- Some Dropbox functions upload and download my files into Dropbox

In [3]:
from dropbox_functions import authenticate_into_dropbox
from dropbox_functions import load_file_into_my_dropbox, download_file_from_dropbox
# Authenticated handle that is passed to every Dropbox helper below.
account = authenticate_into_dropbox()

# Scraped season files that are deleted from, and then re-uploaded to, Dropbox.
files_to_shift = [
    'game_date_2017.csv',
    'main_players_2017.csv',
    'more_home_away_2017.csv',
    'more_team_stats_2017.csv',
    'more_wins_losses_2017.csv',
    'referees_2017.csv',
    'start_bench_2017.csv',
    'team_2017.csv',
]

**A small check to remind myself to scrape the data first before running this portion of the code.**

In [4]:
# Last-modified time of the scraped game-date file, in seconds since the epoch.
file_time_modified = os.path.getmtime("game_date_2017.csv")
# Current local hour of the day (0-23).
time_now = datetime.datetime.now().hour
time_now

20

### Functions for data interactions with Dropbox 

In [5]:
def delete_dropbox_data(account, files_to_shift):
    """Delete each named file from the "/nba games" Dropbox folder.

    Parameters
    ----------
    account : object exposing ``files_delete(path)``
        The authenticated Dropbox handle.
    files_to_shift : iterable of str
        File names (without folder) to delete.

    The deletion is best effort: a file whose deletion fails is reported with
    a "No <name>" line and the loop carries on with the next file.
    """
    for file_name in files_to_shift:
        try:
            account.files_delete("/nba games/{}".format(file_name))
        except Exception:
            # Catch ordinary errors only, so Ctrl-C / kernel interrupts
            # (KeyboardInterrupt, SystemExit) still stop the loop.
            print("No {}".format(file_name))
        else:
            print("Delete {} from dropbox".format(file_name))

def upload_dropbx_data(account, files_to_shift):
    """Upload each named file through ``load_file_into_my_dropbox``.

    Parameters
    ----------
    account : object
        The authenticated Dropbox handle, passed straight to the upload helper.
    files_to_shift : iterable of str
        File names to upload. An empty iterable is a no-op.

    One "Upload <name>" line is printed per file, after that file's upload call
    has returned.
    """
    for file_name in files_to_shift:
        load_file_into_my_dropbox(account, file_name)
        # Report inside the loop so every file is listed (and an empty list
        # does not touch an undefined loop variable).
        print("Upload {}".format(file_name))

### Scenarios - Just some if-else code to make sure I did some stuff first
1. If the files were not updated today and the hour is past 5 (i.e. 6am or later), update!
1. If the files were not updated today but the hour is 5 or earlier, don't update
1. If the files were already updated today, don't update

In [6]:
# Calendar day the scraped file was last written versus today's calendar day.
file_day = datetime.datetime.fromtimestamp(file_time_modified).strftime('%Y-%m-%d')
current_day = datetime.datetime.today().strftime('%Y-%m-%d')

if file_day != current_day:
    if time_now > 5:
        print("Update the data!")
        # No prediction date can be chosen from stale data. Stop here with an
        # explicit message instead of falling through to an undefined (or, on
        # a reused kernel, stale) date_to_remove.
        raise RuntimeError(
            "game_date_2017.csv was last modified on {}; scrape the latest data "
            "before running the rest of this notebook.".format(file_day))
    else:
        print("You are working hard! The data was updated yesterday, but you are still working hard in the wee hours!")
        # Early-morning run on yesterday's scrape: target the day before yesterday.
        date_to_remove = (datetime.datetime.today() - datetime.timedelta(days=2)).date()
else:
    print("You are early! The data is updated today! Woho!")
    # Fresh scrape: target yesterday.
    date_to_remove = (datetime.datetime.today() - datetime.timedelta(days=1)).date()

print(" ")
print(date_to_remove)

You are early! The data is updated today! Woho!
 
2018-02-05


### Delete data from Dropbox 

In [7]:
# Reuse the helper defined above instead of repeating its loop here; it prints
# one "Delete ..." / "No ..." line per file.
delete_dropbox_data(account, files_to_shift)

Delete game_date_2017.csv from dropbox
Delete main_players_2017.csv from dropbox
Delete more_home_away_2017.csv from dropbox
Delete more_team_stats_2017.csv from dropbox
Delete more_wins_losses_2017.csv from dropbox
Delete referees_2017.csv from dropbox
Delete start_bench_2017.csv from dropbox
Delete team_2017.csv from dropbox


### Upload updated new data 

In [8]:
# Push every refreshed season file to Dropbox, reporting each one as it goes.
for file_name in files_to_shift:
    load_file_into_my_dropbox(account, file_name)
    print("Upload {}".format(file_name))

Upload game_date_2017.csv
Upload main_players_2017.csv
Upload more_home_away_2017.csv
Upload more_team_stats_2017.csv
Upload more_wins_losses_2017.csv
Upload referees_2017.csv
Upload start_bench_2017.csv
Upload team_2017.csv


### Load data from existing season

In [9]:
# Rewrite every "*2017.csv" file in the working directory without duplicate rows.
for csv_path in glob.glob("*2017.csv"):
    pd.read_csv(csv_path).drop_duplicates().to_csv(csv_path, index=False)

In [10]:
# Start from an empty frame; the processing cell that follows adds finals_home to it.
final_table = pd.DataFrame()
# load_all_files is not defined in this notebook (presumably it comes from the
# prod_funs star import); it is called with the season label "2017" and
# returns the five raw tables unpacked here.
home_csv, teams_csv, more_stats_csv, game_date_csv, win_loss_csv = load_all_files("2017")

# Processing this season's games for prediction!

In [11]:
merge_home_away = create_home_away_var(home_csv).drop_duplicates()
merge_date = pd.merge(merge_home_away, create_date_variable(game_date_csv))
merge_team_stats = add_game_stats(teams_csv.drop_duplicates(), merge_date)

merge_game_count = create_game_count_var(merge_team_stats)
merge_previous_date = create_days_from_previous_games_var(merge_game_count)
merge_opp = create_opp_stats(merge_previous_date)
merge_opp = create_win_loss_vars(merge_opp)

# I need to cumulate all numeric stats: everything except the identifier,
# date and bookkeeping columns dropped here.
var_to_accum = merge_opp.drop(["GAME_ID", "TEAM_ID_x", "TEAM_ID_y", "TEAM_NAME_x", "TEAM_NAME_y", "G_x", "G_y", 
                               "Home", "GAME_DATE", "WL_x", 'p_games_x', 'p_games_y', "SEASON", "LIVE_PERIOD"], axis=1).columns

# Potential for change: right now the entire season is used for the stats
# calculations. It might be more accurate to use rolling window calculations instead.

# Create accumulative variables: running totals per team, in date order.
merge_total = merge_opp.sort_values(["TEAM_NAME_x", "GAME_DATE"])
merge_total[var_to_accum] = merge_total.groupby('TEAM_ID_x')[var_to_accum].transform(pd.Series.cumsum)

# Create NBA domain-knowledge variables
merge_total = create_rebs_efficiency_vars(create_fta_to_fga_ratio(create_efg_var(merge_total)))

# Create percentage variables: divide the cumulative stats of these variables
# by the games played thus far to get their averages.
vars_to_average = ["OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PTS", "W", "L"]
merge_total_ave = create_shooting_percentage_vars(create_averages(merge_total, vars_to_average))
# merge_games_shifted = shift_game_stats_down_by_one(merge_total_ave)
final_t_ab_opp = create_team_ab_and_opp_table(merge_total_ave)
finals_home = filter_home_teams(final_t_ab_opp)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked
# result on both old and new pandas.
final_table = pd.concat([final_table, finals_home])

### Note on which day to predict!
- Predictions are produced for today's games and for tomorrow's games.

In [12]:
# Show the target date as YYYY-MM-DD.
format(date_to_remove, '%Y-%m-%d')

'2018-02-05'

### Functions for predictions! 

In [13]:
# game_id plus every final_table column whose name contains "_ta".
home_side_columns = ['game_id'] + [name for name in final_table.columns if '_ta' in name]

# ta_table: that selection as-is.
ta_table = final_table[home_side_columns].copy()

# tb_table: the very same selection, relabelled so each "_ta" column reads "_tb".
tb_table = final_table[home_side_columns].copy()
tb_table.columns = [name.replace("_ta_opp", "_tb_opp").replace("_ta", "_tb") for name in tb_table.columns]

In [14]:
def making_predict_table(predict_table, table, ta_table, tb_table):
    """Attach each scheduled team's most recent stats row to the schedule.

    Parameters
    ----------
    predict_table : DataFrame
        Scheduled games with at least ``home`` and ``away`` full team names.
        It is copied, so the caller's frame is left untouched.
    table : DataFrame
        Only its column names are used: columns containing "_ta"/"_la" are
        taken from ``ta_table`` and columns containing "_tb"/"_lb" from
        ``tb_table``.
    ta_table, tb_table : DataFrame
        Per-game stats keyed by ``team_name_ta`` / ``team_name_tb`` with a
        ``game_id`` column; the row with the highest ``game_id`` per team is used.

    Returns
    -------
    DataFrame
        One row per scheduled game with the merged stats plus the derived
        ``pts_ast_*`` and ``game_win_rates_*`` columns. Teams without a stats
        row get NaN (left merge).
    """
    # Copy first: the input is usually a filtered slice of the schedule, and
    # adding columns to it would modify (or warn about) the caller's data.
    predict_table = predict_table.copy()

    # Create team_names for merging: the nickname is the last word of the full
    # name, except Portland whose nickname is two words.
    predict_table["team_name_ta"] = [i.split(" ")[-1] for i in predict_table['home']]
    predict_table["team_name_ta"] = predict_table["team_name_ta"].replace("Blazers", "Trail Blazers")
    predict_table["team_name_tb"] = [i.split(" ")[-1] for i in predict_table['away']]
    predict_table["team_name_tb"] = predict_table["team_name_tb"].replace("Blazers", "Trail Blazers")

    # Latest row (highest game_id) for every team that appears in the schedule.
    team_a_latest_stats = ta_table[ta_table["team_name_ta"].isin(predict_table['team_name_ta'].unique().tolist())].sort_values(
        ["game_id", 'team_name_ta'], ascending=[False, False]).drop_duplicates(['team_name_ta'])
    team_b_latest_stats = tb_table[tb_table["team_name_tb"].isin(predict_table['team_name_tb'].unique().tolist())].sort_values(
        ["game_id", 'team_name_tb'], ascending=[False, False]).drop_duplicates(['team_name_tb'])

    # Column lists come from the `table` argument rather than a notebook-level
    # global, so the function only depends on what it is given.
    ta_columns = [i for i in table.columns if "_ta" in i] + [i for i in table.columns if "_la" in i]
    tb_columns = [i for i in table.columns if "_tb" in i] + [i for i in table.columns if "_lb" in i]

    predict_table = pd.merge(predict_table, team_a_latest_stats[ta_columns], on='team_name_ta', how='left')
    predict_table = pd.merge(predict_table, team_b_latest_stats[tb_columns], on='team_name_tb', how='left')

    predict_table["pts_ast_ta"] = predict_table['pts_ta'] / predict_table['ast_ta']
    predict_table["pts_ast_tb"] = predict_table['pts_tb'] / predict_table['ast_tb']

    # NOTE(review): the "_opp" ratios are computed from the team's own pts/ast,
    # not from pts_*_opp / ast_*_opp. This looks like a copy-paste slip, but it
    # is left unchanged because the saved model may have been trained on the
    # same definition — confirm against the training notebook before changing.
    predict_table["pts_ast_ta_opp"] = predict_table['pts_ta'] / predict_table['ast_ta']
    predict_table["pts_ast_tb_opp"] = predict_table['pts_tb'] / predict_table['ast_tb']

    predict_table['game_win_rates_ta'] = predict_table["w_rate_ta"] * predict_table['g_ta']
    predict_table['game_win_rates_tb'] = predict_table["w_rate_tb"] * predict_table['g_tb']
    return predict_table

In [15]:
def making_predictions(predictions, model, estimator=None):
    """Score the scheduled games and return them with the home-win call.

    Parameters
    ----------
    predictions : DataFrame
        Output of ``making_predict_table``; must hold the feature columns named
        in ``model`` plus away/home/arena/location/time/d_filter.
    model : str
        Right-hand side of the patsy formula (the feature list). Despite the
        name, this is the formula text, not the fitted estimator.
    estimator : object with ``predict_proba``, optional
        Fitted classifier. Defaults to the notebook-level ``loaded_model`` so
        existing two-argument calls keep working.

    Returns
    -------
    DataFrame
        Game details plus ``phome_w`` (second ``predict_proba`` column) and
        ``home_w`` (1 when ``phome_w`` > 0.5, else 0).
        # NOTE(review): treats predict_proba column 1 as "home team wins" —
        # confirm against the estimator's classes_.
    """
    if estimator is None:
        estimator = loaded_model

    final_predict = for_prediction(predictions, model)

    # Keep the design matrix's index on the probabilities. patsy may drop rows
    # with missing features; attaching the scores by position would then pair
    # them with the wrong games.
    proba = pd.DataFrame(estimator.predict_proba(final_predict), index=final_predict.index)
    proba.columns = ['home_w', "phome_w"]
    proba["home_w"] = (proba["phome_w"] > .5).astype(int)

    game_details = predictions[["away", "home", "arena", "location", "time", "d_filter"]]
    # Index-aligned inner join: only games that were actually scored are returned.
    results = game_details.join(proba, how="inner")
    return results

In [16]:
def today_games_results(table, game_date_result):
    """Return one row per game played on ``game_date_result`` with both scores.

    Parameters
    ----------
    table : DataFrame
        Needs the columns GAME_DATE_EST, GAME_ID, TEAM_NICKNAME and PTS, with
        one row per team per game.
    game_date_result : datetime.date, datetime.datetime, str or Timestamp
        Day to select; converted with ``pd.Timestamp`` before comparing.

    Returns
    -------
    DataFrame
        Columns game_id, game_date, team_name_ta, team_name_tb, pts_ta, pts_tb
        and ``predictions``. "ta" is the first row of each game and "tb" the
        last; ``predictions`` is 1 when pts_ta > pts_tb, else 0 (it holds the
        actual outcome, despite its name).
        # NOTE(review): callers merge team_name_ta against the home team, so
        # this assumes the first row per GAME_ID is the home side — confirm.
    """
    win_loss = table.copy()
    win_loss['game_date'] = pd.to_datetime(win_loss['GAME_DATE_EST'])

    # Compare Timestamp with Timestamp: on current pandas a datetime64 column
    # never compares equal to a plain datetime.date, which would silently
    # select no games.
    target_day = pd.Timestamp(game_date_result)
    win_loss = win_loss.loc[win_loss['game_date'] == target_day,
                            ["GAME_ID", "TEAM_NICKNAME", "PTS", "game_date"]]

    per_game = win_loss.groupby(['GAME_ID'])
    win_loss = pd.merge(per_game.first().reset_index(), per_game.last().reset_index(),
                        on=["GAME_ID", "game_date"])
    # Merge output order: key/left columns first, then the right-hand copies.
    win_loss.columns = ['game_id', 'team_name_ta', 'pts_ta', 'game_date', 'team_name_tb', 'pts_tb']
    win_loss = win_loss[['game_id', 'game_date', 'team_name_ta', 'team_name_tb', 'pts_ta', 'pts_tb']]
    win_loss["predictions"] = 0
    win_loss.loc[(win_loss['pts_ta'] > win_loss['pts_tb']), "predictions"] = 1
    return win_loss

### Execute predictions 

In [17]:
def for_prediction(dataframe, var):
    """Build the design matrix for the formula right-hand side ``var``.

    Parameters
    ----------
    dataframe : DataFrame
        Must contain every column named in ``var``. It is not modified.
    var : str
        Right-hand side of a patsy formula (the feature terms).

    Returns
    -------
    DataFrame
        The right-hand-side design matrix returned by ``dmatrices``.
    """
    # dmatrices needs a left-hand side, so a constant placeholder outcome is
    # supplied. It is added to a copy (DataFrame.assign) so the caller's frame
    # neither gains the column nor has an existing 'wl_ta' overwritten.
    with_placeholder = dataframe.assign(wl_ta=1)
    _, x = dmatrices('wl_ta ~ ' + var, with_placeholder, return_type="dataframe")
    return x

### The parameters of the best model! 

In [18]:
# Feature terms for the production model, written as the right-hand side of a
# patsy formula. This string is passed as the `model` argument of
# making_predictions, which hands it to for_prediction where it is appended to
# 'wl_ta ~ '. Every name listed must be a column of the prediction table.
model_9_ratio_vars = '''
game_win_rates_ta + game_win_rates_tb + g_ta + g_tb + p_games_ta + p_games_tb + 
pts_ast_ta + pts_ast_tb + pts_ast_ta_opp + pts_ast_tb_opp + 

pts_ta + oreb_ta + dreb_ta + ast_ta + stl_ta + blk_ta + to_ta +
pts_tb + oreb_tb + dreb_tb + ast_tb + stl_tb + blk_tb + to_tb + 
pts_ta_opp + oreb_ta_opp + dreb_ta_opp + ast_ta_opp + stl_ta_opp + blk_ta_opp + to_ta_opp +
pts_tb_opp + oreb_tb_opp + dreb_tb_opp + ast_tb_opp + stl_tb_opp + blk_tb_opp + to_tb_opp +

efg_ta + fgp_ta + efg_ta_opp + fgp_ta_opp + fta_fga_ta + fta_fga_ta_opp + fg3p_ta + ftp_ta + 
efg_tb + fgp_tb + efg_tb_opp + fgp_tb_opp + fta_fga_tb + fta_fga_tb_opp + fg3p_tb + ftp_tb
'''

### Predictions for today's games!

In [19]:
# Store game_id as an integer column.
final_table = final_table.astype({'game_id': int})

In [20]:
# Keep only the rows dated on or before date_to_remove.
# NOTE(review): the original comment here spoke of dropping the target day's
# games, but `<=` keeps games dated exactly date_to_remove — confirm which is intended.
today = final_table.loc[final_table['game_date'] <= date_to_remove]

### Load new day games for today's predictions

In [21]:
schedule = pd.read_csv("2017_2018_nba_schedule.csv")
# The schedule's date text has no year (e.g. "Monday, February 5"), so games
# are matched on a "day-month" key instead of a full date.
schedule['d_filter'] = schedule['date'].map(
    lambda date_text: datetime.datetime.strptime(date_text, "%A, %B %d").strftime("%d-%m"))
d_filter = date_to_remove.strftime("%d-%m")
# Games scheduled on the target day.
predictions = schedule.loc[schedule['d_filter'] == d_filter]
predictions

Unnamed: 0,away,home,arena,location,time,date,d_filter
789,Portland Trail Blazers,Detroit Pistons,Little Caesars Arena,"Detroit, MI",7:00 PM ET,"Monday, February 5",05-02
790,Washington Wizards,Indiana Pacers,Bankers Life Fieldhouse,"Indianapolis, IN",7:00 PM ET,"Monday, February 5",05-02
791,Orlando Magic,Miami Heat,AmericanAirlines Arena,"Miami, FL",7:30 PM ET,"Monday, February 5",05-02
792,Utah Jazz,New Orleans Pelicans,Smoothie King Center,"New Orleans, LA",8:00 PM ET,"Monday, February 5",05-02
793,Charlotte Hornets,Denver Nuggets,Pepsi Center,"Denver, CO",9:00 PM ET,"Monday, February 5",05-02
794,Chicago Bulls,Sacramento Kings,Golden 1 Center,"Sacramento, CA",10:00 PM ET,"Monday, February 5",05-02
795,Dallas Mavericks,LA Clippers,Staples Center,"Los Angeles, CA",10:30 PM ET,"Monday, February 5",05-02


In [22]:
# Attach the latest team stats to the target day's schedule, then score it.
today_predict_table = making_predict_table(
    predict_table=predictions, table=today, ta_table=ta_table, tb_table=tb_table)
today_predict = making_predictions(predictions=today_predict_table, model=model_9_ratio_vars)
today_predict

### Today's actual games played!

In [24]:
# Actual scores for the target day, one row per game.
today_games = today_games_results(table=win_loss_csv, game_date_result=date_to_remove)
today_games

Unnamed: 0,game_id,game_date,team_name_ta,team_name_tb,pts_ta,pts_tb,predictions
0,21700790,2018-02-05,Pistons,Trail Blazers,111,91,1
1,21700791,2018-02-05,Pacers,Wizards,102,111,0
2,21700792,2018-02-05,Heat,Magic,109,111,0
3,21700793,2018-02-05,Pelicans,Jazz,109,133,0
4,21700794,2018-02-05,Nuggets,Hornets,121,104,1
5,21700795,2018-02-05,Kings,Bulls,104,98,1
6,21700796,2018-02-05,Clippers,Mavericks,104,101,1


### Comparing final results with output! 

In [25]:
# Team nickname = last word of the full name; Portland's nickname is two words.
# The replacement is assigned back explicitly: calling replace(..., inplace=True)
# on a selected column is chained assignment, which does not update the frame
# under pandas Copy-on-Write.
today_predict['team_name_ta'] = [i.split(" ")[-1] for i in today_predict["home"]]
today_predict['team_name_tb'] = [i.split(" ")[-1] for i in today_predict["away"]]
today_predict['team_name_ta'] = today_predict['team_name_ta'].replace("Blazers", "Trail Blazers")
today_predict['team_name_tb'] = today_predict['team_name_tb'].replace("Blazers", "Trail Blazers")

In [26]:
# Pair every predicted game with its actual result (NaN where no result exists yet).
ffinal = pd.merge(today_predict, today_games, on=['team_name_ta', 'team_name_tb'], how='left')
ffinal = ffinal[["away", 'home', 'location', 'home_w', "phome_w", 'predictions', 'd_filter', 'pts_ta', 'pts_tb']].copy()
# p_score is 1 when the predicted home result equals the actual one, else 0.
ffinal['p_score'] = (ffinal['predictions'] == ffinal['home_w']).astype(int)
ffinal

Unnamed: 0,away,home,location,home_w,phome_w,predictions,d_filter,pts_ta,pts_tb,p_score
0,Portland Trail Blazers,Detroit Pistons,"Detroit, MI",1,0.56,1,05-02,111,91,1
1,Washington Wizards,Indiana Pacers,"Indianapolis, IN",1,0.6,0,05-02,102,111,0
2,Orlando Magic,Miami Heat,"Miami, FL",1,0.76,0,05-02,109,111,0
3,Utah Jazz,New Orleans Pelicans,"New Orleans, LA",1,0.6,0,05-02,109,133,0
4,Charlotte Hornets,Denver Nuggets,"Denver, CO",1,0.68,1,05-02,121,104,1
5,Chicago Bulls,Sacramento Kings,"Sacramento, CA",1,0.5,1,05-02,104,98,1
6,Dallas Mavericks,LA Clippers,"Los Angeles, CA",1,0.69,1,05-02,104,101,1


In [27]:
# Share of games predicted correctly.
ffinal['p_score'].mean()

0.5714285714285714

In [28]:
# Save the scored games under predict/, named after the target date.
predictions_csv_path = "predict/{}_predictions.csv".format(date_to_remove)
ffinal.to_csv(predictions_csv_path, index=False)

### Load prediction into Dropbox 

In [29]:
# The upload helper is given a bare file name, so run it from inside predict/.
os.chdir(base_folder + "/predict")
try:
    load_file_into_my_dropbox(account, "{}_predictions.csv".format(date_to_remove), folder="nba games/predictions/")
    print("Upload {}_predictions.csv".format(date_to_remove))
finally:
    # Always return to the notebook folder, even if the upload fails, so later
    # cells still resolve their relative paths.
    os.chdir(base_folder)

Upload 2018-02-05_predictions.csv


## Next game's predictions!

In [30]:
# "day-month" key for the day after the target date.
following_date = date_to_remove + datetime.timedelta(days=1)
next_day = following_date.strftime("%d-%m")
# Games scheduled on that following day.
next_day_predict = schedule.loc[schedule['d_filter'] == next_day]
next_day_predict

Unnamed: 0,away,home,arena,location,time,date,d_filter
796,Cleveland Cavaliers,Orlando Magic,Amway Center,"Orlando, FL",7:00 PM ET,"Tuesday, February 6",06-02
797,Boston Celtics,Toronto Raptors,Air Canada Centre,"Toronto, ON",7:30 PM ET,"Tuesday, February 6",06-02
798,Memphis Grizzlies,Atlanta Hawks,Philips Arena,"Atlanta, GA",7:30 PM ET,"Tuesday, February 6",06-02
799,Houston Rockets,Brooklyn Nets,Barclays Center,"Brooklyn, NY",7:30 PM ET,"Tuesday, February 6",06-02
800,Milwaukee Bucks,New York Knicks,Madison Square Garden,"New York, NY",7:30 PM ET,"Tuesday, February 6",06-02
801,Washington Wizards,Philadelphia 76ers,Wells Fargo Center,"Philadelphia, PA",8:00 PM ET,"Tuesday, February 6",06-02
802,Oklahoma City Thunder,Golden State Warriors,ORACLE Arena,"Oakland, CA",10:30 PM ET,"Tuesday, February 6",06-02
803,Phoenix Suns,Los Angeles Lakers,Staples Center,"Los Angeles, CA",10:30 PM ET,"Tuesday, February 6",06-02


In [31]:
# Attach the latest team stats (taken from the full final_table) to the next day's schedule.
next_day_predict_table = making_predict_table(
    predict_table=next_day_predict, table=final_table, ta_table=ta_table, tb_table=tb_table)

In [32]:
# Score the next day's games with the production feature list.
tmr_predict = making_predictions(predictions=next_day_predict_table, model=model_9_ratio_vars)
tmr_predict

Unnamed: 0,away,home,arena,location,time,d_filter,home_w,phome_w
0,Cleveland Cavaliers,Orlando Magic,Amway Center,"Orlando, FL",7:00 PM ET,06-02,0,0.37
1,Boston Celtics,Toronto Raptors,Air Canada Centre,"Toronto, ON",7:30 PM ET,06-02,1,0.65
2,Memphis Grizzlies,Atlanta Hawks,Philips Arena,"Atlanta, GA",7:30 PM ET,06-02,0,0.49
3,Houston Rockets,Brooklyn Nets,Barclays Center,"Brooklyn, NY",7:30 PM ET,06-02,0,0.26
4,Milwaukee Bucks,New York Knicks,Madison Square Garden,"New York, NY",7:30 PM ET,06-02,1,0.54
5,Washington Wizards,Philadelphia 76ers,Wells Fargo Center,"Philadelphia, PA",8:00 PM ET,06-02,1,0.6
6,Oklahoma City Thunder,Golden State Warriors,ORACLE Arena,"Oakland, CA",10:30 PM ET,06-02,1,0.82
7,Phoenix Suns,Los Angeles Lakers,Staples Center,"Los Angeles, CA",10:30 PM ET,06-02,1,0.69
