In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier
from load import dataset

In [2]:
# setup
predict_week = 2
predict_season = 2020

In [41]:
# Load
df = dataset(predict_season=predict_season, predict_week=predict_week, window_size=4, update_data=True, update_seasons=[pd.to_datetime('today').year])

In [42]:
# Clean Up
df[df.columns[df.columns.str.contains('home|Home')]] = df[df.columns[df.columns.str.contains(
    'home|Home')]].groupby('home_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df[df.columns[df.columns.str.contains('away|Away')]] = df[df.columns[df.columns.str.contains(
    'away|Away')]].groupby('away_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df = df.dropna(axis=1, thresh=int(len(df) * .9))

df = df.fillna(0)

df[df.select_dtypes('bool').columns] = df.select_dtypes('bool').astype('int')

In [54]:
df[(df['season'] == 2020) & (df['week'] == 2) & (df['spread_target'] == 0.0)]

Unnamed: 0,game_id,season,week,season_type,start_date,neutral_site,conference_game,attendance,venue_id,venue,...,year PKHome,year TEHome,weight LBHome,weight LSHome,height LBHome,height LSHome,year LBHome,year LSHome,talentHome,talentAway
5592,401240296,2020,2,regular,2020-09-12 16:00:00+00:00,0,0.0,0.0,4727.0,McLane Stadium,...,2.5,2.0,219.909091,211.0,73.272727,74.0,2.454545,4.0,654.59,454.499143
5593,401240113,2020,2,regular,2020-09-12 17:30:00+00:00,0,0.0,0.0,3841.0,Blaik Field at Michie Stadium,...,2.25,1.777778,236.324374,228.0,73.423077,73.4,1.923077,2.6,182.456071,317.354848
5594,401234574,2020,2,regular,2020-09-12 18:30:00+00:00,0,1.0,0.0,3855.0,Notre Dame Stadium,...,3.333333,2.625,223.666667,211.0,72.916667,73.0,3.0,2.333333,845.804706,611.357931
5595,401207113,2020,2,regular,2020-09-12 19:30:00+00:00,0,0.0,0.0,3644.0,Jim Wacker Field at Bobcat Stadium,...,2.25,2.0,214.9,195.0,72.366667,69.5,2.2,1.5,328.451379,384.438
5596,401234563,2020,2,regular,2020-09-12 20:00:00+00:00,0,1.0,0.0,3697.0,Bobby Bowden Field at Doak Campbell Stadium,...,2.0,1.6,229.666667,225.5,73.133333,73.5,2.133333,2.5,897.702973,591.955385
5597,401240233,2020,2,regular,2020-09-12 20:00:00+00:00,0,0.0,0.0,3752.0,Heinz Field,...,2.0,2.333333,225.416667,235.0,73.083333,74.0,2.25,2.0,641.079412,57.344
5598,401215298,2020,2,regular,2020-09-12 20:14:00+00:00,0,0.0,0.0,3608.0,Paulson Stadium,...,1.0,2.0,226.470588,226.666667,73.058824,72.666667,2.470588,3.0,381.134483,96.7
5599,401236054,2020,2,regular,2020-09-12 23:00:00+00:00,0,0.0,0.0,3835.0,Gaylord Family Oklahoma Memorial Stadium,...,2.0,3.056788,224.533333,231.0,73.733333,70.0,2.266667,3.0,816.041081,36.79
5600,401240039,2020,2,regular,2020-09-12 23:00:00+00:00,0,0.0,0.0,3886.0,Raymond James Stadium,...,3.0,2.5,221.083333,223.0,72.833333,69.0,2.5,2.5,551.950303,40.2225
5601,401234562,2020,2,regular,2020-09-12 23:30:00+00:00,0,1.0,0.0,3630.0,BB&T Field,...,1.833333,1.777778,222.222222,226.363636,73.444444,71.090909,2.555556,3.240642,528.613143,855.895172


In [67]:
# Model
ct = make_column_transformer(
    (OrdinalEncoder(), ['season_type', 'home_conference', 'away_conference']),
    remainder='passthrough'
)

drop_columns =  ['game_id', 'season', 'venue', 'start_date', 'home_team', 'away_team', 'start_date', 'spread_target']

X_train = df[(df.start_date.dt.tz_convert('US/Central').dt.date < pd.to_datetime('today')) | (df['spread_target'] != 0.0)].drop(drop_columns, axis=1)  
y_train = df[(df.start_date.dt.tz_convert('US/Central').dt.date < pd.to_datetime('today')) | (df['spread_target'] != 0.0)][['spread_target']] 

X_pred = df[(df.start_date.dt.tz_convert('US/Central').dt.date >= pd.to_datetime('today')) & (df['spread_target'] == 0.0)].drop(drop_columns, axis=1) 
# y_pred = y[X.start_date.dt.date >= pd.to_datetime('today')]

if len(X_pred) > 0:

    spread_model = XGBRegressor(learning_rate=0.1, 
                                    colsample_bytree=1.0,
                                    gamma=0.5,
                                    max_depth=5,
                                    min_child_weight=8,
                                    n_estimators=100,
                                    subsample=1.0)

    spread_model.fit(ct.fit_transform(X_train), y_train['spread_target'])
    y_spread_pred = np.around(spread_model.predict(ct.fit_transform(X_pred))/.5, decimals=0)*.5

    final_columns = ['game_id', 'season', 'week', 'season_type', 'home_team',
                    'home_conference', 'away_team', 'away_conference',  'pre_game_spread',
                    'predicted_spread']

    X_pred = pd.concat([
                        X_pred, 
                        df[df.start_date.dt.tz_convert('US/Central').dt.date >= pd.to_datetime('today')].drop([col for col in df.columns if col not in drop_columns], axis=1) 
                        ], 
                        axis=1)

    predict_df = pd.concat([
            X_pred.reset_index(drop=True), 
            pd.Series(y_spread_pred, name='predicted_spread'),
        ], axis=1)

    predict_df = predict_df[final_columns]

    predict_df['spread_pick'] = np.where(predict_df['predicted_spread'] <= predict_df['pre_game_spread'], predict_df['home_team'], predict_df['away_team'])
    predict_df['straight_pick'] = np.where(predict_df['predicted_spread'] < 0, predict_df['home_team'], predict_df['away_team'])

    predict_df = predict_df[predict_df['week'] == predict_week]
    predict_df

else:
    print('Nothing to Predict')
    pass


In [68]:
# Pull in games that have occurred within the current week to append to the predictions
filepath = Path('../zillion_picks/picks')

result_dfs = []

for file in filepath.rglob('*.csv'):
    result_df = pd.read_csv(file)


    result_dfs.append(result_df)

result_df = pd.concat(result_dfs)

result_df = result_df[(result_df['week'] == predict_week) & (result_df['season'] == predict_season)]

try:
    result_df = result_df[~result_df.game_id.isin(predict_df.game_id)]
    result_df = result_df.drop(['actual_spread', 'spread_result', 'straight_result', 'start_date'], axis=1)

except:
        result_df = result_df.drop(['actual_spread', 'spread_result', 'straight_result', 'start_date'], axis=1)

In [69]:
result_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick
0,401234559,2020,2,regular,Miami,ACC,UAB,Conference USA,-14.5,-20.5,Miami,Miami
1,401236081,2020,2,regular,Appalachian State,Sun Belt,Charlotte,Conference USA,-17.0,-24.5,Appalachian State,Appalachian State
2,401236053,2020,2,regular,Kansas State,Big 12,Arkansas State,Sun Belt,-13.0,-21.5,Kansas State,Kansas State
3,401236038,2020,2,regular,West Virginia,Big 12,Eastern Kentucky,MISSING,-44.5,-45.0,West Virginia,West Virginia
4,401236221,2020,2,regular,Iowa State,Big 12,Louisiana,Sun Belt,-11.0,-20.0,Iowa State,Iowa State
5,401234565,2020,2,regular,North Carolina,ACC,Syracuse,ACC,-24.0,-24.0,North Carolina,North Carolina


In [70]:
# Merge Predit and Result Dataframes
try:
    predict_df = pd.concat([result_df, predict_df])
except:
    predict_df = result_df

In [71]:
# Add spread target in case the game is complete
predict_df = predict_df.merge(df[['game_id', 'spread_target', 'start_date']], on='game_id', how='left')

In [72]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,spread_target,start_date
0,401234559,2020,2.0,regular,Miami,ACC,UAB,Conference USA,-14.5,-20.5,Miami,Miami,-17.0,2020-09-11 00:00:00+00:00
1,401236081,2020,2.0,regular,Appalachian State,Sun Belt,Charlotte,Conference USA,-17.0,-24.5,Appalachian State,Appalachian State,-15.0,2020-09-12 16:00:00+00:00
2,401236053,2020,2.0,regular,Kansas State,Big 12,Arkansas State,Sun Belt,-13.0,-21.5,Kansas State,Kansas State,4.0,2020-09-12 16:00:00+00:00
3,401236038,2020,2.0,regular,West Virginia,Big 12,Eastern Kentucky,MISSING,-44.5,-45.0,West Virginia,West Virginia,-46.0,2020-09-12 16:00:00+00:00
4,401236221,2020,2.0,regular,Iowa State,Big 12,Louisiana,Sun Belt,-11.0,-20.0,Iowa State,Iowa State,17.0,2020-09-12 16:00:00+00:00
5,401234565,2020,2.0,regular,North Carolina,ACC,Syracuse,ACC,-24.0,-24.0,North Carolina,North Carolina,-25.0,2020-09-12 16:00:00+00:00
6,401240296,2020,2.0,regular,Baylor,Big 12,Louisiana Tech,Conference USA,-18.5,-33.5,Baylor,Baylor,0.0,2020-09-12 16:00:00+00:00
7,401240113,2020,2.0,regular,Army,FBS Independents,Louisiana Monroe,Sun Belt,-25.0,-34.0,Army,Army,0.0,2020-09-12 17:30:00+00:00
8,401234574,2020,2.0,regular,Notre Dame,FBS Independents,Duke,ACC,-22.5,-51.5,Notre Dame,Notre Dame,0.0,2020-09-12 18:30:00+00:00
9,401207113,2020,2.0,regular,Texas State,Sun Belt,UT San Antonio,Conference USA,-6.0,-23.5,Texas State,Texas State,0.0,2020-09-12 19:30:00+00:00


In [87]:
# Convert spread_target games that have not occurred to Null
predict_df['spread_target']  = np.where((pd.to_datetime(predict_df['start_date']).dt.date >= pd.to_datetime('today').date()) & (predict_df['spread_target'] == 0.0), np.nan, predict_df['spread_target'] )

In [88]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,spread_target,start_date
0,401234559,2020,2.0,regular,Miami,ACC,UAB,Conference USA,-14.5,-20.5,Miami,Miami,-17.0,2020-09-11 00:00:00+00:00
1,401236081,2020,2.0,regular,Appalachian State,Sun Belt,Charlotte,Conference USA,-17.0,-24.5,Appalachian State,Appalachian State,-15.0,2020-09-12 16:00:00+00:00
2,401236053,2020,2.0,regular,Kansas State,Big 12,Arkansas State,Sun Belt,-13.0,-21.5,Kansas State,Kansas State,4.0,2020-09-12 16:00:00+00:00
3,401236038,2020,2.0,regular,West Virginia,Big 12,Eastern Kentucky,MISSING,-44.5,-45.0,West Virginia,West Virginia,-46.0,2020-09-12 16:00:00+00:00
4,401236221,2020,2.0,regular,Iowa State,Big 12,Louisiana,Sun Belt,-11.0,-20.0,Iowa State,Iowa State,17.0,2020-09-12 16:00:00+00:00
5,401234565,2020,2.0,regular,North Carolina,ACC,Syracuse,ACC,-24.0,-24.0,North Carolina,North Carolina,-25.0,2020-09-12 16:00:00+00:00
6,401240296,2020,2.0,regular,Baylor,Big 12,Louisiana Tech,Conference USA,-18.5,-33.5,Baylor,Baylor,,2020-09-12 16:00:00+00:00
7,401240113,2020,2.0,regular,Army,FBS Independents,Louisiana Monroe,Sun Belt,-25.0,-34.0,Army,Army,,2020-09-12 17:30:00+00:00
8,401234574,2020,2.0,regular,Notre Dame,FBS Independents,Duke,ACC,-22.5,-51.5,Notre Dame,Notre Dame,,2020-09-12 18:30:00+00:00
9,401207113,2020,2.0,regular,Texas State,Sun Belt,UT San Antonio,Conference USA,-6.0,-23.5,Texas State,Texas State,,2020-09-12 19:30:00+00:00


In [89]:
# Spread Result Calculation
won = ((predict_df['predicted_spread'] > predict_df['pre_game_spread']) & (predict_df['spread_target'].fillna(0) > predict_df['pre_game_spread'])) | (
    (predict_df['predicted_spread'] < predict_df['pre_game_spread']) & (predict_df['spread_target'].fillna(0) < predict_df['pre_game_spread']))

push = predict_df['spread_target'].fillna(0) == predict_df['pre_game_spread']

conditions = [won, push]
choices = ['Won', 'Push']

predict_df['spread_result'] = np.select(condlist=conditions, choicelist=choices, default='Lost')
predict_df['spread_result'] = np.where(predict_df['spread_target'].isnull(), np.nan, predict_df['spread_result'])

In [90]:
# Straight Up Result Calculation
result = predict_df['spread_target'].fillna(1) * predict_df['predicted_spread']

predict_df['straight_result'] = np.where(result > 0, 'Won', 'Lost')
predict_df['straight_result'] = np.where(predict_df['spread_target'].isnull(), np.nan, predict_df['straight_result'])

In [91]:
# Rename Column
predict_df = predict_df.rename(columns={'spread_target':'actual_spread'})

In [92]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,actual_spread,start_date,spread_result,straight_result
0,401234559,2020,2.0,regular,Miami,ACC,UAB,Conference USA,-14.5,-20.5,Miami,Miami,-17.0,2020-09-11 00:00:00+00:00,Won,Won
1,401236081,2020,2.0,regular,Appalachian State,Sun Belt,Charlotte,Conference USA,-17.0,-24.5,Appalachian State,Appalachian State,-15.0,2020-09-12 16:00:00+00:00,Lost,Won
2,401236053,2020,2.0,regular,Kansas State,Big 12,Arkansas State,Sun Belt,-13.0,-21.5,Kansas State,Kansas State,4.0,2020-09-12 16:00:00+00:00,Lost,Lost
3,401236038,2020,2.0,regular,West Virginia,Big 12,Eastern Kentucky,MISSING,-44.5,-45.0,West Virginia,West Virginia,-46.0,2020-09-12 16:00:00+00:00,Won,Won
4,401236221,2020,2.0,regular,Iowa State,Big 12,Louisiana,Sun Belt,-11.0,-20.0,Iowa State,Iowa State,17.0,2020-09-12 16:00:00+00:00,Lost,Lost
5,401234565,2020,2.0,regular,North Carolina,ACC,Syracuse,ACC,-24.0,-24.0,North Carolina,North Carolina,-25.0,2020-09-12 16:00:00+00:00,Lost,Won
6,401240296,2020,2.0,regular,Baylor,Big 12,Louisiana Tech,Conference USA,-18.5,-33.5,Baylor,Baylor,,2020-09-12 16:00:00+00:00,,
7,401240113,2020,2.0,regular,Army,FBS Independents,Louisiana Monroe,Sun Belt,-25.0,-34.0,Army,Army,,2020-09-12 17:30:00+00:00,,
8,401234574,2020,2.0,regular,Notre Dame,FBS Independents,Duke,ACC,-22.5,-51.5,Notre Dame,Notre Dame,,2020-09-12 18:30:00+00:00,,
9,401207113,2020,2.0,regular,Texas State,Sun Belt,UT San Antonio,Conference USA,-6.0,-23.5,Texas State,Texas State,,2020-09-12 19:30:00+00:00,,


In [93]:
# Export
filepath = Path('../zillion_picks/picks')

for key, group in predict_df.groupby(['season', 'week', 'season_type']):
    if group.season_type.unique() == 'postseason':
        group.to_csv(filepath/f'premium/{key[0]}_postseason.csv', index=False)
    else:
        group.to_csv(filepath/f'premium/{key[0]}_{str(key[1]).rjust(2, "0")}.csv', index=False)