### Challener Model 1 Backfill + Production
- This is useful for creating a backtesting framework. For now, I want to use this as a backfilling framework too!

In [1]:
import os
import glob
import gspread
import warnings
import datetime
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn.externals import joblib

from sklearn.linear_model import LogisticRegression
pd.options.display.float_format = '{:,.2f}'.format
from sklearn import metrics
from prod_funs import *

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

### Load model to be predicted

In [2]:
base_folder = os.path.abspath(os.getcwd())

In [25]:
filename = 'First_Production_model.sav'
loaded_model = joblib.load(filename)

## Dropbox Stuff 
#### Make sure that I have scripted the new data first!

In [4]:
from dropbox_functions import authenticate_into_dropbox
from dropbox_functions import load_file_into_my_dropbox, download_file_from_dropbox
account = authenticate_into_dropbox()
files_to_shift = ['game_date_2017.csv', 'main_players_2017.csv', 'more_home_away_2017.csv', 'more_team_stats_2017.csv',
                  'more_wins_losses_2017.csv', 'referees_2017.csv', 'start_bench_2017.csv', 'team_2017.csv']

### Load data from existing season

In [5]:
# Check and reload data to remove duplicates
for i in glob.glob("*2017.csv"):
    tmp = pd.read_csv(i)
    tmp.drop_duplicates().to_csv(i, index=False)

In [6]:
final_table = pd.DataFrame()
home_csv, teams_csv, more_stats_csv, game_date_csv, win_loss_csv = load_all_files("2017")

# Processing this season's games for prediction!

In [7]:
merge_home_away = create_home_away_var(home_csv).drop_duplicates()
merge_date = pd.merge(merge_home_away, create_date_variable(game_date_csv))
merge_team_stats = add_game_stats(teams_csv.drop_duplicates(), merge_date)

merge_game_count = create_game_count_var(merge_team_stats)
merge_previous_date = create_days_from_previous_games_var(merge_game_count)
merge_opp = create_opp_stats(merge_previous_date)
merge_opp = create_win_loss_vars(merge_opp)
""" I need to cumulate all numeric stats """
var_to_accum = merge_opp.drop(["GAME_ID", "TEAM_ID_x", "TEAM_ID_y", "TEAM_NAME_x", "TEAM_NAME_y", "G_x", "G_y", 
                               "Home", "GAME_DATE", "WL_x", 'p_games_x', 'p_games_y', "SEASON", "LIVE_PERIOD"], axis=1).columns
    
""" Potential for change: Right now, I am using the entire season for the stats calculations. 
    However, it might be more accurate to use rolling window calculations instead."""
# Create accumulative variables
merge_total = merge_opp.sort_values(["TEAM_NAME_x", "GAME_DATE"])
merge_total[var_to_accum] = merge_total.groupby('TEAM_ID_x')[var_to_accum].transform(pd.Series.cumsum)

# Create NBA domain-knowledge variables
merge_total = create_rebs_efficiency_vars(create_fta_to_fga_ratio(create_efg_var(merge_total)))

# Create percentage variables 
"""Dividing the cumulative stats of these variables by the games_played thus far to get their averages."""
vars_to_average = ["OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PTS", "W", "L"]
merge_total_ave = create_shooting_percentage_vars(create_averages(merge_total, vars_to_average))
# merge_games_shifted = shift_game_stats_down_by_one(merge_total_ave)
final_t_ab_opp = create_team_ab_and_opp_table(merge_total_ave)
finals_home = filter_home_teams(final_t_ab_opp)
final_table = final_table.append(finals_home)

### Add player statistics inside!
- **Good model for future data inclusion!**

In [8]:
players = pd.read_csv("main_players_2017.csv")
players_past_stats = pd.read_csv("player_season_statistics.csv")
players_past_stats = players_past_stats[players_past_stats.season==2016][[
    "PLAYER_ID", "PLAYER_NAME", "defenders", "facilitator", "game_winners", "inside_gamers", "pure_scorers"]]

In [9]:
team_players = pd.merge(players, players_past_stats, on='PLAYER_ID', how='left').groupby(['GAME_ID', "TEAM_ID"])[
    "defenders", "facilitator", "game_winners", "inside_gamers", "pure_scorers"].sum().reset_index()

In [10]:
final_table = pd.merge(pd.merge(final_table, team_players.rename(columns={
    "GAME_ID":"game_id", 'TEAM_ID':'team_id_ta', 'defenders':'defenders_a', 'facilitator':'facilitator_a', 
    'game_winners':'game_winners_a', 'inside_gamers':'insider_gamers_a', 'pure_scorers':'pure_scorers_a'}), 
                  on=['game_id', 'team_id_ta'], how='left'), team_players.rename(columns={
    "GAME_ID":"game_id", 'TEAM_ID':'team_id_tb', 'defenders':'defenders_b', 'facilitator':'facilitator_b', 
    'game_winners':'game_winners_b', 'inside_gamers':'insider_gamers_b', 'pure_scorers':'pure_scorers_b'}), 
         on=['game_id', 'team_id_tb'], how='left')

### Functions for predictions! 

In [11]:
ta_table = final_table.copy()
ta_table = ta_table[['game_id', ] + [i for i in final_table.columns if '_ta' in i] + 
                    ['defenders_a', 'facilitator_a', 'game_winners_a', 'insider_gamers_a', 'pure_scorers_a']]

tb_table = final_table.copy()
tb_table = tb_table[['game_id', ] + [i for i in final_table.columns if '_ta' in i] + 
                    ['defenders_b', 'facilitator_b', 'game_winners_b', 'insider_gamers_b', 'pure_scorers_b']]
tb_table.columns = [i.replace("_ta_opp", "_tb_opp").replace("_ta", "_tb") for i in tb_table.columns]

In [12]:
def making_predict_table(predict_table, table, ta_table, tb_table):
    # Create team_names for merging
    predict_table["team_name_ta"] = [i.split(" ")[-1] for i in predict_table['home']]
    predict_table['team_name_ta'].replace("Blazers", "Trail Blazers", inplace=True)
    predict_table["team_name_tb"] = [i.split(" ")[-1] for i in predict_table['away']]
    predict_table['team_name_tb'].replace("Blazers", "Trail Blazers", inplace=True)

    team_a_latest_stats = ta_table[ta_table["team_name_ta"].isin(predict_table['team_name_ta'].unique().tolist())].sort_values(
        ["game_id", 'team_name_ta'], ascending=[False, False]).drop_duplicates(['team_name_ta'])
    team_b_latest_stats = tb_table[tb_table["team_name_tb"].isin(predict_table['team_name_tb'].unique().tolist())].sort_values(
        ["game_id", 'team_name_tb'], ascending=[False, False]).drop_duplicates(['team_name_tb'])

    predict_table = pd.merge(
        predict_table, 
        team_a_latest_stats[[i for i in final_table.columns if "_ta" in i] + [i for i in final_table.columns if "_la" in i]], 
        on='team_name_ta', how='left')

    predict_table = pd.merge(
        predict_table, 
        team_b_latest_stats[[i for i in final_table.columns if "_tb" in i] + [i for i in final_table.columns if "_lb" in i]], 
        on='team_name_tb', how='left')

    predict_table["pts_ast_ta"] = predict_table['pts_ta'] / predict_table['ast_ta']
    predict_table["pts_ast_tb"] = predict_table['pts_tb'] / predict_table['ast_tb']

    predict_table["pts_ast_ta_opp"] = predict_table['pts_ta'] / predict_table['ast_ta']
    predict_table["pts_ast_tb_opp"] = predict_table['pts_tb'] / predict_table['ast_tb']

    predict_table['game_win_rates_ta'] = predict_table["w_rate_ta"] * predict_table['g_ta'] 
    predict_table['game_win_rates_tb'] = predict_table["w_rate_tb"] * predict_table['g_tb']
    return predict_table

In [13]:
def making_predictions(predictions, model):
    final_predict = for_prediction(predictions, model)

    proba = pd.DataFrame(loaded_model.predict_proba(final_predict))
    proba.columns = ['home_w', "phome_w"]
    proba.loc[proba["phome_w"] > .5, "home_w"] = 1
    proba.loc[proba["phome_w"] <= .5, "home_w"] = 0

    results = pd.concat([predictions[["away", "home", "arena", "location", "time", "d_filter"]], proba], axis=1)
    results['home_w'] = results['home_w'].astype(int)
    return results

In [14]:
def today_games_results(table, game_date_result):
    win_loss = table.copy()
    win_loss['game_date'] = pd.to_datetime(win_loss['GAME_DATE_EST'])
    win_loss = win_loss[win_loss.game_date == game_date_result][["GAME_ID", "TEAM_NICKNAME", "PTS", "game_date"]]
    win_loss = pd.merge(win_loss.groupby(['GAME_ID']).first().reset_index(), win_loss.groupby(['GAME_ID']).last().reset_index(), 
             on=["GAME_ID", "game_date"])
    win_loss.columns = ['game_id', 'team_name_ta', 'pts_ta', 'game_date', 'team_name_tb', 'pts_tb']
    win_loss = win_loss[['game_id', 'game_date', 'team_name_ta', 'team_name_tb', 'pts_ta', 'pts_tb']]
    win_loss["predictions"] = 0
    win_loss.loc[(win_loss['pts_ta'] > win_loss['pts_tb']), "predictions"] = 1
    return win_loss

### Execute predictions 

In [15]:
def for_prediction(dataframe, var):
    """This functions selects the variables required and make them into sklearn-ready formats! """
    # hackish way to create up the dataframe, haha... 
    dataframe['wl_ta'] = 1
    y, x = dmatrices('wl_ta ~ ' + var, dataframe, return_type="dataframe")
    return x

### The parameters of the best model! 

In [31]:
model_9_ad_players = '''
game_win_rates_ta + game_win_rates_tb + g_ta + g_tb + p_games_ta + p_games_tb + 
pts_ast_ta + pts_ast_tb + pts_ast_ta_opp + pts_ast_tb_opp + 

pts_ta + oreb_ta + dreb_ta + ast_ta + stl_ta + blk_ta + to_ta +
pts_tb + oreb_tb + dreb_tb + ast_tb + stl_tb + blk_tb + to_tb + 
pts_ta_opp + oreb_ta_opp + dreb_ta_opp + ast_ta_opp + stl_ta_opp + blk_ta_opp + to_ta_opp +
pts_tb_opp + oreb_tb_opp + dreb_tb_opp + ast_tb_opp + stl_tb_opp + blk_tb_opp + to_tb_opp +

efg_ta + fgp_ta + efg_ta_opp + fgp_ta_opp + fta_fga_ta + fta_fga_ta_opp + fg3p_ta + ftp_ta + 
efg_tb + fgp_tb + efg_tb_opp + fgp_tb_opp + fta_fga_tb + fta_fga_tb_opp + fg3p_tb + ftp_tb
'''

### Preparing for predictions run through

In [27]:
final_table['game_id'] = final_table['game_id'].astype(int)
schedule = pd.read_csv("2017_2018_nba_schedule.csv")
schedule['d_filter'] = [datetime.datetime.strptime(i, "%A, %B %d").strftime("%d-%m") for i in schedule['date']]

### Create list of dates to backfill! 

In [41]:
start_date = datetime.date(year=2017, month=10, day=23)
end_date = (datetime.datetime.now() - datetime.timedelta(days=1)).date()
start_date, end_date
date_list = pd.date_range(start=start_date, end=end_date)
len(date_list)

60

### Load new day games for today's predictions
- **Looping through the different days start from here!**

In [40]:
for date2predict in date_list:
    try:
        d_filter = date2predict.strftime("%d-%m")
        predictions = schedule[schedule['d_filter'] == d_filter]

        today = final_table[final_table.game_date == date2predict]
        today_predict_table = making_predict_table(predictions, today, ta_table, tb_table)
        today_predict = making_predictions(today_predict_table, model_9_ad_players)

        today_games = today_games_results(win_loss_csv, date2predict)
        today_predict['team_name_ta'] = [i.split(" ")[-1] for i in today_predict["home"]]
        today_predict['team_name_tb'] = [i.split(" ")[-1] for i in today_predict["away"]]
        today_predict['team_name_ta'].replace("Blazers", "Trail Blazers", inplace=True)
        today_predict['team_name_tb'].replace("Blazers", "Trail Blazers", inplace=True)

        ffinal = pd.merge(today_predict, today_games, on=['team_name_ta', 'team_name_tb'], how='left')
        ffinal = ffinal[["away", 'home', 'location', 'home_w', "phome_w", 'predictions', 'd_filter', 'pts_ta', 'pts_tb']]
        ffinal['p_score'] = 1
        ffinal.loc[ffinal['predictions'] != ffinal['home_w'], 'p_score'] = 0
        print("Prediction Accuracy for {}: {}".format(date2predict.date(), ffinal['p_score'].sum() / float(len(ffinal))))
        # ffinal.to_csv("predict/cm1/cm1_{}_predictions.csv".format(date2predict), index=False)
    except:
        print("{} had no games!".format(date2predict.date()))
        pass

Prediction Accuracy for 2017-10-23: 0.625
Prediction Accuracy for 2017-10-24: 0.833333333333
Prediction Accuracy for 2017-10-25: 0.8
Prediction Accuracy for 2017-10-26: 0.8
Prediction Accuracy for 2017-10-27: 0.857142857143
Prediction Accuracy for 2017-10-28: 0.5
Prediction Accuracy for 2017-10-29: 0.428571428571
Prediction Accuracy for 2017-10-30: 0.555555555556
Prediction Accuracy for 2017-10-31: 0.5
Prediction Accuracy for 2017-11-01: 0.5
Prediction Accuracy for 2017-11-02: 1.0
Prediction Accuracy for 2017-11-03: 0.833333333333
Prediction Accuracy for 2017-11-04: 0.8
Prediction Accuracy for 2017-11-05: 0.7
Prediction Accuracy for 2017-11-06: 0.666666666667
Prediction Accuracy for 2017-11-07: 0.5
Prediction Accuracy for 2017-11-08: 0.8
Prediction Accuracy for 2017-11-09: 0.8
Prediction Accuracy for 2017-11-10: 0.5
Prediction Accuracy for 2017-11-11: 0.909090909091
Prediction Accuracy for 2017-11-12: 1.0
Prediction Accuracy for 2017-11-13: 0.666666666667
Prediction Accuracy for 2017-1