In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier
from load import dataset

In [None]:
# TODO
#

In [59]:
# Load
df = dataset(predict_season=2020, predict_week=1, window_size=4, update_data=True, update_seasons=[pd.to_datetime('today').year])

In [60]:
# Clean Up
df[df.columns[df.columns.str.contains('home|Home')]] = df[df.columns[df.columns.str.contains(
    'home|Home')]].groupby('home_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df[df.columns[df.columns.str.contains('away|Away')]] = df[df.columns[df.columns.str.contains(
    'away|Away')]].groupby('away_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df = df.dropna(axis=1, thresh=int(len(df) * .9))

df = df.fillna(0)

df[df.select_dtypes('bool').columns] = df.select_dtypes('bool').astype('int')

In [61]:
# Model
ct = make_column_transformer(
    (OrdinalEncoder(), ['season_type', 'home_conference', 'away_conference']),
    remainder='passthrough'
)

drop_columns =  ['game_id', 'season', 'venue', 'start_date', 'home_team', 'away_team', 'start_date', 'spread_target']

X_train = df[df.start_date.dt.date < pd.to_datetime('today')].drop(drop_columns, axis=1)  
y_train = df[df.start_date.dt.date < pd.to_datetime('today')][['spread_target']] 

X_pred = df[df.start_date.dt.date >= pd.to_datetime('today')].drop(drop_columns, axis=1) 
# y_pred = y[X.start_date.dt.date >= pd.to_datetime('today')]

spread_model = XGBRegressor(learning_rate=0.1, 
                                colsample_bytree=1.0,
                                gamma=0.5,
                                max_depth=5,
                                min_child_weight=8,
                                n_estimators=100,
                                subsample=1.0)

spread_model.fit(ct.fit_transform(X_train), y_train['spread_target'])
y_spread_pred = np.around(spread_model.predict(ct.fit_transform(X_pred))/.5, decimals=0)*.5

final_columns = ['game_id', 'season', 'week', 'season_type', 'home_team',
                'home_conference', 'away_team', 'away_conference',  'pre_game_spread',
                'predicted_spread']

X_pred = pd.concat([
                    X_pred, 
                    df[df.start_date.dt.date >= pd.to_datetime('today')].drop([col for col in df.columns if col not in drop_columns], axis=1) 
                    ], 
                    axis=1)

predict_df = pd.concat([
        X_pred.reset_index(drop=True), 
        pd.Series(y_spread_pred, name='predicted_spread'),
     ], axis=1)

predict_df = predict_df[final_columns]

predict_df['spread_pick'] = np.where(predict_df['predicted_spread'] <= predict_df['pre_game_spread'], predict_df['home_team'], predict_df['away_team'])
predict_df['straight_pick'] = np.where(predict_df['predicted_spread'] < 0, predict_df['home_team'], predict_df['away_team'])

predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick
0,401206213,2020,1,regular,Arizona,Pac-12,Hawai'i,Mountain West,-10.5,-22.0,Arizona,Arizona
1,401207204,2020,1,regular,Navy,American Athletic,Notre Dame,FBS Independents,16.5,13.5,Navy,Notre Dame
2,401207805,2020,1,regular,Georgia Tech,ACC,Clemson,ACC,26.0,33.5,Clemson,Clemson
3,401206217,2020,1,regular,Utah,Pac-12,BYU,FBS Independents,-6.5,-16.5,Utah,Utah
4,401207807,2020,1,regular,Boston College,ACC,Syracuse,ACC,0.0,-6.0,Boston College,Boston College
5,401207808,2020,1,regular,UCF,American Athletic,North Carolina,ACC,-3.0,-6.0,UCF,UCF
6,401206985,2020,1,regular,Wisconsin,Big Ten,Indiana,Big Ten,-12.5,-15.5,Wisconsin,Wisconsin
7,401205855,2020,1,regular,Baylor,Big 12,Ole Miss,SEC,-1.0,-1.5,Baylor,Baylor
8,401206225,2020,1,regular,Washington,Pac-12,Michigan,Big Ten,1.5,2.5,Michigan,Michigan
9,401206958,2020,1,regular,West Virginia,Big 12,Florida State,ACC,6.5,21.0,Florida State,Florida State


In [62]:
# Add spread target in case the game is complete
predict_df = predict_df.merge(df[['game_id', 'spread_target', 'start_date']], on='game_id', how='left')

In [63]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,spread_target,start_date
0,401206213,2020,1,regular,Arizona,Pac-12,Hawai'i,Mountain West,-10.5,-22.0,Arizona,Arizona,0.0,2020-08-29 04:00:00+00:00
1,401207204,2020,1,regular,Navy,American Athletic,Notre Dame,FBS Independents,16.5,13.5,Navy,Notre Dame,0.0,2020-08-29 04:00:00+00:00
2,401207805,2020,1,regular,Georgia Tech,ACC,Clemson,ACC,26.0,33.5,Clemson,Clemson,0.0,2020-09-03 04:00:00+00:00
3,401206217,2020,1,regular,Utah,Pac-12,BYU,FBS Independents,-6.5,-16.5,Utah,Utah,0.0,2020-09-03 04:00:00+00:00
4,401207807,2020,1,regular,Boston College,ACC,Syracuse,ACC,0.0,-6.0,Boston College,Boston College,0.0,2020-09-04 04:00:00+00:00
5,401207808,2020,1,regular,UCF,American Athletic,North Carolina,ACC,-3.0,-6.0,UCF,UCF,0.0,2020-09-04 04:00:00+00:00
6,401206985,2020,1,regular,Wisconsin,Big Ten,Indiana,Big Ten,-12.5,-15.5,Wisconsin,Wisconsin,0.0,2020-09-04 04:00:00+00:00
7,401205855,2020,1,regular,Baylor,Big 12,Ole Miss,SEC,-1.0,-1.5,Baylor,Baylor,0.0,2020-09-05 04:00:00+00:00
8,401206225,2020,1,regular,Washington,Pac-12,Michigan,Big Ten,1.5,2.5,Michigan,Michigan,0.0,2020-09-05 04:00:00+00:00
9,401206958,2020,1,regular,West Virginia,Big 12,Florida State,ACC,6.5,21.0,Florida State,Florida State,0.0,2020-09-05 04:00:00+00:00


In [64]:
# Convert spread_target games that have not occurred to Null
predict_df['spread_target']  = np.where(pd.to_datetime(predict_df['start_date']).dt.date > pd.to_datetime('today').date(), np.nan, predict_df['spread_target'] )

In [65]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,spread_target,start_date
0,401206213,2020,1,regular,Arizona,Pac-12,Hawai'i,Mountain West,-10.5,-22.0,Arizona,Arizona,,2020-08-29 04:00:00+00:00
1,401207204,2020,1,regular,Navy,American Athletic,Notre Dame,FBS Independents,16.5,13.5,Navy,Notre Dame,,2020-08-29 04:00:00+00:00
2,401207805,2020,1,regular,Georgia Tech,ACC,Clemson,ACC,26.0,33.5,Clemson,Clemson,,2020-09-03 04:00:00+00:00
3,401206217,2020,1,regular,Utah,Pac-12,BYU,FBS Independents,-6.5,-16.5,Utah,Utah,,2020-09-03 04:00:00+00:00
4,401207807,2020,1,regular,Boston College,ACC,Syracuse,ACC,0.0,-6.0,Boston College,Boston College,,2020-09-04 04:00:00+00:00
5,401207808,2020,1,regular,UCF,American Athletic,North Carolina,ACC,-3.0,-6.0,UCF,UCF,,2020-09-04 04:00:00+00:00
6,401206985,2020,1,regular,Wisconsin,Big Ten,Indiana,Big Ten,-12.5,-15.5,Wisconsin,Wisconsin,,2020-09-04 04:00:00+00:00
7,401205855,2020,1,regular,Baylor,Big 12,Ole Miss,SEC,-1.0,-1.5,Baylor,Baylor,,2020-09-05 04:00:00+00:00
8,401206225,2020,1,regular,Washington,Pac-12,Michigan,Big Ten,1.5,2.5,Michigan,Michigan,,2020-09-05 04:00:00+00:00
9,401206958,2020,1,regular,West Virginia,Big 12,Florida State,ACC,6.5,21.0,Florida State,Florida State,,2020-09-05 04:00:00+00:00


In [66]:
# Spread Result Calculation
won = ((predict_df['predicted_spread'] > predict_df['pre_game_spread']) & (predict_df['spread_target'].fillna(0) > predict_df['pre_game_spread'])) | (
    (predict_df['predicted_spread'] < predict_df['pre_game_spread']) & (predict_df['spread_target'].fillna(0) < predict_df['pre_game_spread']))

push = predict_df['spread_target'].fillna(0) == predict_df['pre_game_spread']

conditions = [won, push]
choices = ['Won', 'Push']

predict_df['spread_result'] = np.select(condlist=conditions, choicelist=choices, default='Lost')
predict_df['spread_result'] = np.where(predict_df['spread_target'].isnull(), np.nan, predict_df['spread_result'])

In [67]:
# Straight Up Result Calculation
result = predict_df['spread_target'].fillna(1) * predict_df['predicted_spread']

predict_df['straight_result'] = np.where(result > 0, 'Won', 'Lost')
predict_df['straight_result'] = np.where(predict_df['spread_target'].isnull(), np.nan, predict_df['straight_result'])

In [68]:
# Rename Column
predict_df = predict_df.rename(columns={'spread_target':'actual_spread'})

In [69]:
# Export
filepath = Path('../zillion_picks/picks')

for key, group in predict_df.groupby(['season', 'week']):
    if group.season_type.unique() == 'postseason':
        group.to_csv(filepath/f'premium/{key[0]}_postseason.csv', index=False)
    else:
        group.to_csv(filepath/f'premium/{key[0]}_{str(key[1]).rjust(2, "0")}.csv', index=False)