In [10]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from xgboost import XGBRegressor, XGBClassifier
from load import dataset

In [11]:
# Initialize Variables
predict_week = 3
predict_season = 2020
today_localize = pd.to_datetime('today').tz_localize('US/Central') - pd.Timedelta(hours=3)

final_columns = ['game_id', 'season', 'week', 'season_type', 'home_team',
                'home_conference', 'away_team', 'away_conference',  'pre_game_spread',
                'predicted_spread', 'regression_spread_pred', 'classification_spread_pred', 'classification_confidence']

In [105]:
# Load Dataset
df = dataset(predict_season=predict_season, predict_week=predict_week, window_size=4, update_data=True, update_seasons=[pd.to_datetime('today').year])

# Prep for ML Models
df[df.columns[df.columns.str.contains('home|Home')]] = df[df.columns[df.columns.str.contains(
    'home|Home')]].groupby('home_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df[df.columns[df.columns.str.contains('away|Away')]] = df[df.columns[df.columns.str.contains(
    'away|Away')]].groupby('away_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df = df.dropna(axis=1, thresh=int(len(df) * .9))

df = df.fillna(0)

df[df.select_dtypes('bool').columns] = df.select_dtypes('bool').astype('int')

ct = make_column_transformer(
    (OrdinalEncoder(), ['season_type', 'home_conference', 'away_conference']),
    remainder='passthrough'
)

drop_columns =  ['game_id', 'season', 'venue', 'start_date', 'home_team', 'away_team', 
                'start_date', 'spread_target', 'pre_game_spread' , 'pre_game_home_win_prob', 'pre_game_away_win_prob'
                ]


In [106]:
# Use in production
X_train = df[(df.start_date.dt.tz_convert('US/Central') < today_localize)]
X_test = df[(df.start_date.dt.tz_convert('US/Central') >= today_localize)] 

# Use in development
# X_train = df[(df['week'] < predict_week) | (df['season'] < predict_season)] 
# X_test = df[(df['week'] == predict_week) & (df['season'] == predict_season)] 

model_regressor = XGBRegressor(learning_rate=0.1, 
                                colsample_bytree=0.9,
                                gamma=0.5,
                                max_depth=2,
                                min_child_weight=4,
                                n_estimators=100,
                                subsample=0.8)

model_regressor.fit(
    ct.fit_transform(
        X_train.drop(drop_columns, axis=1)), 
        X_train['spread_target']
        )

X_train['predicted_spread'] = model_regressor.predict(ct.fit_transform(X_train.drop(drop_columns, axis=1)))
X_test['predicted_spread'] = model_regressor.predict(ct.fit_transform(X_test.drop(drop_columns, axis=1)))

# Clean Up
X_train['predicted_spread'] = np.around(X_train['predicted_spread'] / .5, decimals=0) * .5
X_test['predicted_spread'] = np.around(X_test['predicted_spread'] / .5, decimals=0) * .5

X_train['regression_spread_pred'] = np.where(X_train['predicted_spread'] <= X_train['pre_game_spread'], 1, 0)
X_test['regression_spread_pred'] = np.where(X_test['predicted_spread'] <= X_test['pre_game_spread'], 1, 0)

X_train['spread_result'] = np.where(X_train['spread_target'] <= X_train['pre_game_spread'], 1, 0)
X_test['spread_result'] = np.where(X_test['spread_target'] <= X_test['pre_game_spread'], 1, 0)

model_classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
            colsample_bynode=1, colsample_bytree=0.8, gamma=5, gpu_id=-1,
            importance_type='gain', interaction_constraints='',
            learning_rate=0.1, max_delta_step=0, max_depth=5,
            min_child_weight=10, monotone_constraints='()',
            n_estimators=100, n_jobs=0, num_parallel_tree=1,
            objective='binary:logistic', random_state=0, reg_alpha=0,
            reg_lambda=1, scale_pos_weight=1, subsample=1.0,
            tree_method='exact', validate_parameters=1, verbosity=None)

# Update drop columns list for Classifier Model
drop_columns.append('spread_result')
drop_columns.remove('pre_game_spread')   

model_classifier.fit(ct.fit_transform(X_train.drop(drop_columns, axis=1)), X_train['spread_result'])

X_train['classification_spread_pred'] = model_classifier.predict(ct.fit_transform(X_train.drop(drop_columns, axis=1)))
X_test['classification_spread_pred'] = model_classifier.predict(ct.fit_transform(X_test.drop(drop_columns, axis=1)))

# Add so the Predict Probability is Aligned
drop_columns.append('classification_spread_pred')

X_train['classification_confidence'] = model_classifier.predict_proba(ct.fit_transform(X_train.drop(drop_columns, axis=1)))[:,0]
X_test['classification_confidence'] = model_classifier.predict_proba(ct.fit_transform(X_test.drop(drop_columns, axis=1)))[:,0] 


In [107]:
predict_df = X_test[final_columns]
predict_df.head(10)

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,regression_spread_pred,classification_spread_pred,classification_confidence
5607,401212522,2020,3,regular,Tulane,American Athletic,Navy,American Athletic,-6.5,-8.0,1,1,0.418361
5608,401207124,2020,3,regular,Western Kentucky,Conference USA,Liberty,FBS Independents,-14.5,-5.0,0,0,0.638876
5609,401234571,2020,3,regular,Pittsburgh,ACC,Syracuse,ACC,-21.5,-15.5,0,0,0.645665
5610,401234567,2020,3,regular,Duke,ACC,Boston College,ACC,-6.0,0.5,0,1,0.298071
5611,401236041,2020,2,regular,Oklahoma State,Big 12,Tulsa,American Athletic,-23.5,-9.5,0,1,0.356871
5612,401241282,2020,3,regular,Notre Dame,FBS Independents,South Florida,American Athletic,-25.5,-20.5,0,0,0.675738
5613,401236939,2020,3,regular,Marshall,Conference USA,Appalachian State,Sun Belt,5.0,-1.5,1,1,0.348884
5614,401234568,2020,3,regular,Georgia Tech,ACC,UCF,American Athletic,7.5,7.0,1,1,0.448184
5615,401207116,2020,3,regular,Georgia Southern,Sun Belt,Florida Atlantic,Conference USA,-3.0,10.5,0,1,0.491248
5616,401235433,2020,3,regular,North Carolina,ACC,Charlotte,Conference USA,-29.5,-22.0,0,0,0.559142


In [108]:
# Pull in games that have occurred within the current week to append to the predictions
filepath = Path('../zillion_picks/picks')

result_dfs = []

for file in filepath.rglob('*.csv'):
    result_df = pd.read_csv(file)


    result_dfs.append(result_df)

result_df = pd.concat(result_dfs)

result_df = result_df[(result_df['week'] == predict_week) & (result_df['season'] == predict_season)]

try:
    result_df = result_df[~result_df.game_id.isin(predict_df.game_id)]
    result_df = result_df.drop(['actual_spread', 'spread_result', 'start_date'], axis=1)

except:
        result_df = result_df.drop(['actual_spread', 'spread_result', 'start_date'], axis=1)

result_df.head()

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,regression_spread_pred,classification_spread_pred,classification_confidence,regression_spread_result,classification_spread_result


In [109]:
# Merge Predit and Result Dataframes
try:
    predict_df = pd.concat([result_df, predict_df])
except:
    predict_df = result_df

In [110]:
# Add spread target in case the game is complete
predict_df = predict_df.merge(df[['game_id', 'spread_target', 'start_date']], on='game_id', how='left')
predict_df.head()

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,regression_spread_pred,classification_spread_pred,classification_confidence,regression_spread_result,classification_spread_result,spread_target,start_date
0,401212522,2020,3,regular,Tulane,American Athletic,Navy,American Athletic,-6.5,-8.0,1,1,0.418361,,,0.0,2020-09-19 16:00:00+00:00
1,401207124,2020,3,regular,Western Kentucky,Conference USA,Liberty,FBS Independents,-14.5,-5.0,0,0,0.638876,,,0.0,2020-09-19 16:00:00+00:00
2,401234571,2020,3,regular,Pittsburgh,ACC,Syracuse,ACC,-21.5,-15.5,0,0,0.645665,,,0.0,2020-09-19 16:00:00+00:00
3,401234567,2020,3,regular,Duke,ACC,Boston College,ACC,-6.0,0.5,0,1,0.298071,,,0.0,2020-09-19 16:00:00+00:00
4,401236041,2020,2,regular,Oklahoma State,Big 12,Tulsa,American Athletic,-23.5,-9.5,0,1,0.356871,,,0.0,2020-09-19 16:00:00+00:00


In [111]:
predict_df['spread_result'] = np.where(predict_df['spread_target'] <= predict_df['pre_game_spread'], 1, 0)
predict_df.head()

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,regression_spread_pred,classification_spread_pred,classification_confidence,regression_spread_result,classification_spread_result,spread_target,start_date,spread_result
0,401212522,2020,3,regular,Tulane,American Athletic,Navy,American Athletic,-6.5,-8.0,1,1,0.418361,,,0.0,2020-09-19 16:00:00+00:00,0
1,401207124,2020,3,regular,Western Kentucky,Conference USA,Liberty,FBS Independents,-14.5,-5.0,0,0,0.638876,,,0.0,2020-09-19 16:00:00+00:00,0
2,401234571,2020,3,regular,Pittsburgh,ACC,Syracuse,ACC,-21.5,-15.5,0,0,0.645665,,,0.0,2020-09-19 16:00:00+00:00,0
3,401234567,2020,3,regular,Duke,ACC,Boston College,ACC,-6.0,0.5,0,1,0.298071,,,0.0,2020-09-19 16:00:00+00:00,0
4,401236041,2020,2,regular,Oklahoma State,Big 12,Tulsa,American Athletic,-23.5,-9.5,0,1,0.356871,,,0.0,2020-09-19 16:00:00+00:00,0


In [112]:
# Spread Result Calculation
predict_df['regression_spread_result'] = np.where(predict_df['spread_result'] == predict_df['regression_spread_pred'], 'Won', 'Lost')
predict_df['regression_spread_result'] = np.where(predict_df['spread_target'] == predict_df['pre_game_spread'], 'Push', predict_df['regression_spread_result'])

predict_df['classification_spread_result'] = np.where(predict_df['spread_result'] == predict_df['classification_spread_pred'], 'Won', 'Lost')
predict_df['classification_spread_result'] = np.where(predict_df['spread_target'] == predict_df['pre_game_spread'], 'Push', predict_df['classification_spread_result'])

# Remove Results from Games that haven't been played
predict_df[['regression_spread_result', 'classification_spread_result', 'spread_result']] = predict_df[['regression_spread_result', 'classification_spread_result', 'spread_result']].where(predict_df.spread_target.ne(0), np.nan)

# Rename Column
predict_df = predict_df.rename(columns={'spread_target':'actual_spread'})
predict_df['week'] = predict_df['week'].mode()[0]

predict_df.head()

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,regression_spread_pred,classification_spread_pred,classification_confidence,regression_spread_result,classification_spread_result,actual_spread,start_date,spread_result
0,401212522,2020,3,regular,Tulane,American Athletic,Navy,American Athletic,-6.5,-8.0,1,1,0.418361,,,0.0,2020-09-19 16:00:00+00:00,
1,401207124,2020,3,regular,Western Kentucky,Conference USA,Liberty,FBS Independents,-14.5,-5.0,0,0,0.638876,,,0.0,2020-09-19 16:00:00+00:00,
2,401234571,2020,3,regular,Pittsburgh,ACC,Syracuse,ACC,-21.5,-15.5,0,0,0.645665,,,0.0,2020-09-19 16:00:00+00:00,
3,401234567,2020,3,regular,Duke,ACC,Boston College,ACC,-6.0,0.5,0,1,0.298071,,,0.0,2020-09-19 16:00:00+00:00,
4,401236041,2020,3,regular,Oklahoma State,Big 12,Tulsa,American Athletic,-23.5,-9.5,0,1,0.356871,,,0.0,2020-09-19 16:00:00+00:00,


In [30]:
# Export
filepath = Path('../zillion_picks/picks')

for key, group in predict_df.groupby(['season', 'week', 'season_type']):
    if group.season_type.unique() == 'postseason':
        group.to_csv(filepath/f'premium/{key[0]}_postseason.csv', index=False)
    else:
        group.to_csv(filepath/f'premium/{key[0]}_{str(key[1]).rjust(2, "0")}.csv', index=False)

In [7]:
# REGRESSION AND CLASSIFIER MODEL FOR BACK TESTING

# Load
predict_season = 2019

final_columns = ['game_id', 'season', 'week', 'season_type', 'home_team',
                'home_conference', 'away_team', 'away_conference',  'pre_game_spread',
                'predicted_spread', 'regression_spread_pred', 'classification_spread_pred', 'classification_confidence']

for predict_week in range(1,17):

    df = dataset(predict_season=predict_season, predict_week=predict_week, window_size=4, update_data=False)

    # Prep Dataset
    df[df.columns[df.columns.str.contains('home|Home')]] = df[df.columns[df.columns.str.contains(
        'home|Home')]].groupby('home_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

    df[df.columns[df.columns.str.contains('away|Away')]] = df[df.columns[df.columns.str.contains(
        'away|Away')]].groupby('away_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

    df = df.dropna(axis=1, thresh=int(len(df) * .9))
    df = df.fillna(0)
    df[df.select_dtypes('bool').columns] = df.select_dtypes('bool').astype('int')

    # Model
    ct = make_column_transformer(
        (OrdinalEncoder(), ['season_type', 'home_conference', 'away_conference']),
        remainder='passthrough'
    )

    drop_columns =  ['game_id', 'season', 'venue', 'start_date', 'home_team', 'away_team', 
                    'start_date', 'spread_target', 'pre_game_spread', 'pre_game_home_win_prob', 'pre_game_away_win_prob'
                    ]

    # Use in production
    # X_train = df[(df.start_date.dt.tz_convert('US/Central') < today_localize)]
    # X_test = df[(df.start_date.dt.tz_convert('US/Central') >= today_localize)] 

    # Use in development
    X_train = df[(df['week'] < predict_week) | (df['season'] < predict_season)] 
    X_test = df[(df['week'] == predict_week) & (df['season'] == predict_season)] 

    model_regressor = XGBRegressor(learning_rate=0.1, 
                                    colsample_bytree=0.9,
                                    gamma=0.5,
                                    max_depth=2,
                                    min_child_weight=4,
                                    n_estimators=100,
                                    subsample=0.8)

    model_regressor.fit(
        ct.fit_transform(
            X_train.drop(drop_columns, axis=1)), 
            X_train['spread_target']
            )
    
    # Use this for later to determine feature importance
    model_regressor_cols = X_train.drop(drop_columns, axis=1).columns

    X_train['predicted_spread'] = model_regressor.predict(ct.fit_transform(X_train.drop(drop_columns, axis=1)))
    X_test['predicted_spread'] = model_regressor.predict(ct.fit_transform(X_test.drop(drop_columns, axis=1)))

    # Clean Up
    X_train['predicted_spread'] = np.around(X_train['predicted_spread'] / .5, decimals=0) * .5
    X_test['predicted_spread'] = np.around(X_test['predicted_spread'] / .5, decimals=0) * .5

    X_train['regression_spread_pred'] = np.where(X_train['predicted_spread'] <= X_train['pre_game_spread'], 1, 0)
    X_test['regression_spread_pred'] = np.where(X_test['predicted_spread'] <= X_test['pre_game_spread'], 1, 0)

    X_train['spread_result'] = np.where(X_train['spread_target'] <= X_train['pre_game_spread'], 1, 0)
    X_test['spread_result'] = np.where(X_test['spread_target'] <= X_test['pre_game_spread'], 1, 0)

    model_classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=0.8, gamma=5, gpu_id=-1,
                importance_type='gain', interaction_constraints='',
                learning_rate=0.1, max_delta_step=0, max_depth=5,
                min_child_weight=10, monotone_constraints='()',
                n_estimators=100, n_jobs=0, num_parallel_tree=1,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                reg_lambda=1, scale_pos_weight=1, subsample=1.0,
                tree_method='exact', validate_parameters=1, verbosity=None)

    # Update drop columns list for Classifier Model
    drop_columns.append('spread_result')
    drop_columns.remove('pre_game_spread')   

    model_classifier.fit(ct.fit_transform(X_train.drop(drop_columns, axis=1)), X_train['spread_result'])
    
    # Use this for later to determine feature importance
    model_classifier_cols = X_train.drop(drop_columns, axis=1).columns

    X_train['classification_spread_pred'] = model_classifier.predict(ct.fit_transform(X_train.drop(drop_columns, axis=1)))
    X_test['classification_spread_pred'] = model_classifier.predict(ct.fit_transform(X_test.drop(drop_columns, axis=1)))

    # Add so the Predict Probability is Aligned
    drop_columns.append('classification_spread_pred')

    X_train['classification_confidence'] = model_classifier.predict_proba(ct.fit_transform(X_train.drop(drop_columns, axis=1)))[:,0]
    X_test['classification_confidence'] = model_classifier.predict_proba(ct.fit_transform(X_test.drop(drop_columns, axis=1)))[:,0] 

    predict_df = X_test[final_columns]

    predict_df = predict_df.merge(df[['game_id', 'spread_target', 'start_date']], on='game_id', how='left')
    predict_df['spread_result'] = np.where(predict_df['spread_target'] <= predict_df['pre_game_spread'], 1, 0)

    predict_df['regression_spread_result'] = np.where(predict_df['spread_result'] == predict_df['regression_spread_pred'], 'Won', 'Lost')
    predict_df['regression_spread_result'] = np.where(predict_df['spread_target'] == predict_df['pre_game_spread'], 'Push', predict_df['regression_spread_result'])

    predict_df['classification_spread_result'] = np.where(predict_df['spread_result'] == predict_df['classification_spread_pred'], 'Won', 'Lost')
    predict_df['classification_spread_result'] = np.where(predict_df['spread_target'] == predict_df['pre_game_spread'], 'Push', predict_df['classification_spread_result'])

    # Remove Results from Games that haven't been played
    predict_df[['regression_spread_result', 'classification_spread_result', 'spread_result']] = predict_df[['regression_spread_result', 'classification_spread_result', 'spread_result']].where(predict_df.spread_target.ne(0), np.nan)

    # Rename Column
    predict_df = predict_df.rename(columns={'spread_target':'actual_spread'})
    predict_df['week'] = predict_df['week'].mode()[0]

    # Export
    filepath = Path('../zillion_picks/picks')

    for key, group in predict_df.groupby(['season', 'week', 'season_type']):
        if group.season_type.unique() == 'postseason':
            group.to_csv(filepath/f'premium/{key[0]}_postseason.csv', index=False)
        else:
            group.to_csv(filepath/f'premium/{key[0]}_{str(key[1]).rjust(2, "0")}.csv', index=False)

    print(f"{predict_season} Week {predict_week} Regression Accuracy: {round(accuracy_score(X_test['spread_result'], X_test['regression_spread_pred']) * 100,2)}%" )
    print(f"{predict_season} Week {predict_week} Classification Accuracy: {round(accuracy_score(X_test['spread_result'], X_test['classification_spread_pred']) * 100,2)}%" )


2019 Week 1 Regression Accuracy: 55.42%
2019 Week 1 Classification Accuracy: 57.83%
2019 Week 2 Regression Accuracy: 54.79%
2019 Week 2 Classification Accuracy: 49.32%
2019 Week 3 Regression Accuracy: 41.79%
2019 Week 3 Classification Accuracy: 53.73%
2019 Week 4 Regression Accuracy: 46.55%
2019 Week 4 Classification Accuracy: 55.17%
2019 Week 5 Regression Accuracy: 60.0%
2019 Week 5 Classification Accuracy: 47.27%
2019 Week 6 Regression Accuracy: 57.45%
2019 Week 6 Classification Accuracy: 65.96%
2019 Week 7 Regression Accuracy: 49.06%
2019 Week 7 Classification Accuracy: 66.04%
2019 Week 8 Regression Accuracy: 50.82%
2019 Week 8 Classification Accuracy: 49.18%
2019 Week 9 Regression Accuracy: 54.55%
2019 Week 9 Classification Accuracy: 45.45%
2019 Week 10 Regression Accuracy: 52.08%
2019 Week 10 Classification Accuracy: 52.08%
2019 Week 11 Regression Accuracy: 41.67%
2019 Week 11 Classification Accuracy: 45.83%
2019 Week 12 Regression Accuracy: 50.94%
2019 Week 12 Classification Accu

In [8]:
# Test Feature Importance

regression_important_features = pd.Series(data=model_regressor.feature_importances_, index=model_regressor_cols).sort_values(ascending=False)
classification_important_features = pd.Series(data=model_classifier.feature_importances_, index=model_classifier_cols).sort_values(ascending=False)

print(regression_important_features.head(10))
print('---------------------------------------')
print(classification_important_features.head(10))


pointsAway                          0.058916
away_points                         0.040906
offense_successRateHome             0.037539
averageRating_All PositionsHome     0.031255
rankHome                            0.029669
averageRating_Defensive LineHome    0.025599
home_conference                     0.024919
averageStars_All PositionsHome      0.023821
talentHome                          0.021426
home_points                         0.021078
dtype: float32
---------------------------------------
regression_spread_pred                    0.010872
defense_rushingPlays_totalPPAHome         0.007458
offense_passingPlays_explosivenessAway    0.005569
puntReturnsAway                           0.005525
season_type                               0.005365
weight RBHome                             0.005301
averageRating_LinebackerHome              0.005273
defense_totalPPAAway                      0.005220
averageStars_ReceiverHome                 0.005053
netPassingYardsHome                 

In [9]:
    print(f"{predict_season} Week {predict_week} Classification Accuracy: {round(accuracy_score(X_train['spread_result'], X_train['classification_spread_pred']) * 100,2)}%" )

2019 Week 16 Classification Accuracy: 92.34%


In [535]:
#################### WORK ON THIS ###############################

# CHECK CONFIDENCE TO SEE IF THERE IS A WIN ADVANTAGE
dfs=[]

for file in (filepath/'free').rglob('*'):
    df = pd.read_csv(file)
    dfs.append(df)

df = pd.concat(dfs)
df.head()

df['bin'] = pd.cut(df.classification_confidence, [0,.20,.35,.40,.45,.50,.55,.60,.65,.70,.80,1])
df.groupby('bin')['classification_spread_result'].agg(win=(lambda df: df.eq('Won').sum() / len(df)), count=( 'count'))

Unnamed: 0_level_0,win,count
bin,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.0, 0.2]",0.75,4
"(0.2, 0.35]",0.466667,90
"(0.35, 0.4]",0.511364,88
"(0.4, 0.45]",0.521739,115
"(0.45, 0.5]",0.535088,114
"(0.5, 0.55]",0.513333,150
"(0.55, 0.6]",0.507692,130
"(0.6, 0.65]",0.518072,83
"(0.65, 0.7]",0.561404,57
"(0.7, 0.8]",0.44186,43


In [532]:
df.head()

# FINISH THE CLEAN UP AND FILE EXPORT

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,regression_spread_pred,classification_spread_pred,classification_confidence,actual_spread,start_date,spread_result,regression_spread_result,classification_spread_result,bin
0,401110723,2019,1,regular,Florida,SEC,Miami,ACC,-7.0,-11.0,1,1,0.373211,-4.0,2019-08-24 23:00:00+00:00,0,Lost,Lost,"(0.3, 0.4]"
1,401114164,2019,1,regular,Hawai'i,Mountain West,Arizona,Pac-12,10.5,11.5,0,0,0.594086,-7.0,2019-08-25 02:30:00+00:00,1,Lost,Lost,"(0.5, 0.6]"
2,401117855,2019,1,regular,Connecticut,American Athletic,Wagner,MISSING,-23.0,-22.5,0,0,0.59645,-3.0,2019-08-29 23:00:00+00:00,0,Won,Won,"(0.5, 0.6]"
3,401117854,2019,1,regular,Cincinnati,American Athletic,UCLA,Pac-12,-2.5,-4.0,1,1,0.450243,-10.0,2019-08-29 23:00:00+00:00,1,Won,Won,"(0.4, 0.5]"
4,401119254,2019,1,regular,Bowling Green,Mid-American,Morgan State,MISSING,-24.0,-20.5,0,1,0.331717,-43.0,2019-08-29 23:00:00+00:00,1,Lost,Won,"(0.3, 0.4]"


In [342]:
df['spread_result'].eq('Won').sum()

456

In [215]:
# setup
predict_week = 1
predict_season = 2020
today_localize = pd.to_datetime('today').tz_localize('US/Central') - pd.Timedelta(hours=3)

In [295]:
# Load
df = dataset(predict_season=predict_season, predict_week=predict_week, window_size=4, update_data=False, update_seasons=[pd.to_datetime('today').year])

In [296]:
# Clean Up
df[df.columns[df.columns.str.contains('home|Home')]] = df[df.columns[df.columns.str.contains(
    'home|Home')]].groupby('home_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df[df.columns[df.columns.str.contains('away|Away')]] = df[df.columns[df.columns.str.contains(
    'away|Away')]].groupby('away_team', as_index=False, group_keys=False).apply(lambda x: x.fillna(x.mean()))

df = df.dropna(axis=1, thresh=int(len(df) * .9))

df = df.fillna(0)

df[df.select_dtypes('bool').columns] = df.select_dtypes('bool').astype('int')

In [218]:
df.tail()

Unnamed: 0,game_id,season,week,season_type,start_date,neutral_site,conference_game,attendance,venue_id,venue,...,year PKHome,year TEHome,weight LBHome,weight LSHome,height LBHome,height LSHome,year LBHome,year LSHome,talentHome,talentAway
5581,401212484,2020,1,regular,2020-09-05 20:30:00+00:00,0,0.0,0.0,3644.0,Jim Wacker Field at Bobcat Stadium,...,2.25,2.0,214.9,195.0,72.366667,69.5,2.2,1.5,328.451379,497.315357
5582,401207098,2020,1,regular,2020-09-05 23:30:00+00:00,0,0.0,0.0,3825.0,Apogee Stadium,...,3.0,1.571429,218.909091,221.944444,72.909091,73.740741,2.090909,2.612933,369.06,26.8
5583,401212553,2020,1,regular,2020-09-06 00:00:00+00:00,0,0.0,0.0,3805.0,Liberty Bowl Memorial Stadium,...,1.75,2.428571,222.0,223.666667,73.357143,70.0,2.714286,2.333333,477.170937,439.28303
5584,401239884,2020,1,regular,2020-09-06 01:00:00+00:00,0,0.0,0.0,3946.0,Sun Bowl Stadium,...,1.5,1.545455,222.115385,192.142857,73.730769,71.0,2.038462,1.0,307.035,80.5125
5585,401234576,2020,1,regular,2020-09-08 00:00:00+00:00,0,0.0,0.0,3852.0,Navy-Marine Corps Memorial Stadium,...,1.5,4.0,216.727273,213.0,73.333333,73.0,2.181818,2.5,341.698529,549.447576


In [219]:
# Model
ct = make_column_transformer(
    (OrdinalEncoder(), ['season_type', 'home_conference', 'away_conference']),
    remainder='passthrough'
)

drop_columns =  ['game_id', 'season', 'venue', 'start_date', 'home_team', 'away_team', 'start_date', 'spread_target'
,'pre_game_spread', 'pre_game_home_win_prob', 'pre_game_away_win_prob'
]

# use in production
# X_train = df[(df.start_date.dt.tz_convert('US/Central') < today_localize)].drop(drop_columns, axis=1)  
# y_train = df[(df.start_date.dt.tz_convert('US/Central') < today_localize)][['spread_target']] 

# X_pred = df[(df.start_date.dt.tz_convert('US/Central') >= today_localize)].drop(drop_columns, axis=1) 
# # y_pred = y[X.start_date.dt.date >= pd.to_datetime('today')]

# use in development
X_train = df[(df['week'] < predict_week) | (df['season'] < predict_season)].drop(drop_columns, axis=1)  
y_train = df[(df['week'] < predict_week) | (df['season'] < predict_season)][['spread_target']] 
X_pred = df[(df['week'] == predict_week) & (df['season'] == predict_season)].drop(drop_columns, axis=1) 

if len(X_pred) > 0:

    spread_model = XGBRegressor(learning_rate=0.1, 
                                    colsample_bytree=1.0,
                                    gamma=0.5,
                                    max_depth=5,
                                    min_child_weight=8,
                                    n_estimators=100,
                                    subsample=1.0)

    spread_model.fit(ct.fit_transform(X_train), y_train['spread_target'])
    y_spread_pred = np.around(spread_model.predict(ct.fit_transform(X_pred))/.5, decimals=0)*.5

    final_columns = ['game_id', 'season', 'week', 'season_type', 'home_team',
                    'home_conference', 'away_team', 'away_conference',  'pre_game_spread',
                    'predicted_spread']

    # use in production
    # X_pred = pd.concat([
    #                     X_pred, 
    #                     df[df.start_date.dt.tz_convert('US/Central') >= today_localize].drop([col for col in df.columns if col not in drop_columns], axis=1) 
    #                     ], 
    #                     axis=1)

    # use in development
    X_pred = pd.concat([
                        X_pred, 
                        df[(df['week'] == predict_week) & (df['season'] == predict_season)].drop([col for col in df.columns if col not in drop_columns], axis=1) 
                        ], 
                        axis=1)    

    predict_df = pd.concat([
            X_pred.reset_index(drop=True), 
            pd.Series(y_spread_pred, name='predicted_spread'),
        ], axis=1)

    predict_df = predict_df[final_columns]

    predict_df['spread_pick'] = np.where(predict_df['predicted_spread'] <= predict_df['pre_game_spread'], predict_df['home_team'], predict_df['away_team'])
    predict_df['straight_pick'] = np.where(predict_df['predicted_spread'] < 0, predict_df['home_team'], predict_df['away_team'])

    predict_df = predict_df[predict_df['week'] == predict_week]
    predict_df

else:
    print('Nothing to Predict')
    pass


In [220]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick
0,401238035,2020,1,regular,UAB,Conference USA,Central Arkansas,MISSING,-21.0,-14.0,Central Arkansas,UAB
1,401207101,2020,1,regular,Southern Mississippi,Conference USA,South Alabama,Sun Belt,-12.5,-18.0,Southern Mississippi,Southern Mississippi
2,401237353,2020,1,regular,Marshall,Conference USA,Eastern Kentucky,MISSING,-25.5,-18.0,Eastern Kentucky,Marshall
3,401235700,2020,1,regular,Army,FBS Independents,Middle Tennessee,Conference USA,-3.5,-12.5,Army,Army
4,401212484,2020,1,regular,Texas State,Sun Belt,SMU,American Athletic,24.5,5.0,Texas State,SMU
5,401207098,2020,1,regular,North Texas,Conference USA,Houston Baptist,MISSING,-23.0,-27.5,North Texas,North Texas
6,401212553,2020,1,regular,Memphis,American Athletic,Arkansas State,Sun Belt,-18.5,-4.5,Arkansas State,Memphis
7,401239884,2020,1,regular,UTEP,Conference USA,Stephen F. Austin,MISSING,-4.0,-20.5,UTEP,UTEP
8,401234576,2020,1,regular,Navy,American Athletic,BYU,FBS Independents,1.0,2.5,BYU,BYU


In [221]:
# Pull in games that have occurred within the current week to append to the predictions
filepath = Path('../zillion_picks/picks')

result_dfs = []

for file in filepath.rglob('*.csv'):
    result_df = pd.read_csv(file)


    result_dfs.append(result_df)

result_df = pd.concat(result_dfs)

result_df = result_df[(result_df['week'] == predict_week) & (result_df['season'] == predict_season)]

try:
    result_df = result_df[~result_df.game_id.isin(predict_df.game_id)]
    result_df = result_df.drop(['actual_spread', 'spread_result', 'straight_result', 'start_date'], axis=1)

except:
        result_df = result_df.drop(['actual_spread', 'spread_result', 'straight_result', 'start_date'], axis=1)

In [222]:
result_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick


In [223]:
# Merge Predit and Result Dataframes
try:
    predict_df = pd.concat([result_df, predict_df])
except:
    predict_df = result_df

In [224]:
# Add spread target in case the game is complete
predict_df = predict_df.merge(df[['game_id', 'spread_target', 'start_date']], on='game_id', how='left')

In [225]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,spread_target,start_date
0,401238035,2020,1,regular,UAB,Conference USA,Central Arkansas,MISSING,-21.0,-14.0,Central Arkansas,UAB,-10.0,2020-09-04 00:00:00+00:00
1,401207101,2020,1,regular,Southern Mississippi,Conference USA,South Alabama,Sun Belt,-12.5,-18.0,Southern Mississippi,Southern Mississippi,11.0,2020-09-04 01:00:00+00:00
2,401237353,2020,1,regular,Marshall,Conference USA,Eastern Kentucky,MISSING,-25.5,-18.0,Eastern Kentucky,Marshall,-59.0,2020-09-05 17:00:00+00:00
3,401235700,2020,1,regular,Army,FBS Independents,Middle Tennessee,Conference USA,-3.5,-12.5,Army,Army,-42.0,2020-09-05 17:30:00+00:00
4,401212484,2020,1,regular,Texas State,Sun Belt,SMU,American Athletic,24.5,5.0,Texas State,SMU,7.0,2020-09-05 20:30:00+00:00
5,401207098,2020,1,regular,North Texas,Conference USA,Houston Baptist,MISSING,-23.0,-27.5,North Texas,North Texas,-26.0,2020-09-05 23:30:00+00:00
6,401212553,2020,1,regular,Memphis,American Athletic,Arkansas State,Sun Belt,-18.5,-4.5,Arkansas State,Memphis,-13.0,2020-09-06 00:00:00+00:00
7,401239884,2020,1,regular,UTEP,Conference USA,Stephen F. Austin,MISSING,-4.0,-20.5,UTEP,UTEP,-10.0,2020-09-06 01:00:00+00:00
8,401234576,2020,1,regular,Navy,American Athletic,BYU,FBS Independents,1.0,2.5,BYU,BYU,52.0,2020-09-08 00:00:00+00:00


In [226]:
# Convert spread_target games that have not occurred to Null
predict_df['spread_target']  = np.where(predict_df['spread_target'] == 0.0, np.nan, predict_df['spread_target'] )

In [227]:
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,spread_target,start_date
0,401238035,2020,1,regular,UAB,Conference USA,Central Arkansas,MISSING,-21.0,-14.0,Central Arkansas,UAB,-10.0,2020-09-04 00:00:00+00:00
1,401207101,2020,1,regular,Southern Mississippi,Conference USA,South Alabama,Sun Belt,-12.5,-18.0,Southern Mississippi,Southern Mississippi,11.0,2020-09-04 01:00:00+00:00
2,401237353,2020,1,regular,Marshall,Conference USA,Eastern Kentucky,MISSING,-25.5,-18.0,Eastern Kentucky,Marshall,-59.0,2020-09-05 17:00:00+00:00
3,401235700,2020,1,regular,Army,FBS Independents,Middle Tennessee,Conference USA,-3.5,-12.5,Army,Army,-42.0,2020-09-05 17:30:00+00:00
4,401212484,2020,1,regular,Texas State,Sun Belt,SMU,American Athletic,24.5,5.0,Texas State,SMU,7.0,2020-09-05 20:30:00+00:00
5,401207098,2020,1,regular,North Texas,Conference USA,Houston Baptist,MISSING,-23.0,-27.5,North Texas,North Texas,-26.0,2020-09-05 23:30:00+00:00
6,401212553,2020,1,regular,Memphis,American Athletic,Arkansas State,Sun Belt,-18.5,-4.5,Arkansas State,Memphis,-13.0,2020-09-06 00:00:00+00:00
7,401239884,2020,1,regular,UTEP,Conference USA,Stephen F. Austin,MISSING,-4.0,-20.5,UTEP,UTEP,-10.0,2020-09-06 01:00:00+00:00
8,401234576,2020,1,regular,Navy,American Athletic,BYU,FBS Independents,1.0,2.5,BYU,BYU,52.0,2020-09-08 00:00:00+00:00


In [228]:
# Spread Result Calculation
won = ((predict_df['predicted_spread'] > predict_df['pre_game_spread']) & (predict_df['spread_target'].fillna(0) > predict_df['pre_game_spread'])) | (
    (predict_df['predicted_spread'] <= predict_df['pre_game_spread']) & (predict_df['spread_target'].fillna(0) < predict_df['pre_game_spread']))

push = predict_df['spread_target'].fillna(0) == predict_df['pre_game_spread']

conditions = [won, push]
choices = ['Won', 'Push']

predict_df['spread_result'] = np.select(condlist=conditions, choicelist=choices, default='Lost')
predict_df['spread_result'] = np.where(predict_df['spread_target'].isnull(), np.nan, predict_df['spread_result'])

In [229]:
# Straight Up Result Calculation
result = predict_df['spread_target'].fillna(1) * predict_df['predicted_spread']

predict_df['straight_result'] = np.where(result > 0, 'Won', 'Lost')
predict_df['straight_result'] = np.where(predict_df['spread_target'].isnull(), np.nan, predict_df['straight_result'])

In [230]:
# Rename Column
predict_df = predict_df.rename(columns={'spread_target':'actual_spread'})

In [231]:
# predict_df['week'] = predict_df.week.astype(int)
predict_df

Unnamed: 0,game_id,season,week,season_type,home_team,home_conference,away_team,away_conference,pre_game_spread,predicted_spread,spread_pick,straight_pick,actual_spread,start_date,spread_result,straight_result
0,401238035,2020,1,regular,UAB,Conference USA,Central Arkansas,MISSING,-21.0,-14.0,Central Arkansas,UAB,-10.0,2020-09-04 00:00:00+00:00,Won,Won
1,401207101,2020,1,regular,Southern Mississippi,Conference USA,South Alabama,Sun Belt,-12.5,-18.0,Southern Mississippi,Southern Mississippi,11.0,2020-09-04 01:00:00+00:00,Lost,Lost
2,401237353,2020,1,regular,Marshall,Conference USA,Eastern Kentucky,MISSING,-25.5,-18.0,Eastern Kentucky,Marshall,-59.0,2020-09-05 17:00:00+00:00,Lost,Won
3,401235700,2020,1,regular,Army,FBS Independents,Middle Tennessee,Conference USA,-3.5,-12.5,Army,Army,-42.0,2020-09-05 17:30:00+00:00,Won,Won
4,401212484,2020,1,regular,Texas State,Sun Belt,SMU,American Athletic,24.5,5.0,Texas State,SMU,7.0,2020-09-05 20:30:00+00:00,Won,Won
5,401207098,2020,1,regular,North Texas,Conference USA,Houston Baptist,MISSING,-23.0,-27.5,North Texas,North Texas,-26.0,2020-09-05 23:30:00+00:00,Won,Won
6,401212553,2020,1,regular,Memphis,American Athletic,Arkansas State,Sun Belt,-18.5,-4.5,Arkansas State,Memphis,-13.0,2020-09-06 00:00:00+00:00,Won,Won
7,401239884,2020,1,regular,UTEP,Conference USA,Stephen F. Austin,MISSING,-4.0,-20.5,UTEP,UTEP,-10.0,2020-09-06 01:00:00+00:00,Won,Won
8,401234576,2020,1,regular,Navy,American Athletic,BYU,FBS Independents,1.0,2.5,BYU,BYU,52.0,2020-09-08 00:00:00+00:00,Won,Won


In [232]:
# Export
filepath = Path('../zillion_picks/picks')

for key, group in predict_df.groupby(['season', 'week', 'season_type']):
    if group.season_type.unique() == 'postseason':
        group.to_csv(filepath/f'premium/{key[0]}_postseason.csv', index=False)
    else:
        group.to_csv(filepath/f'premium/{key[0]}_{str(key[1]).rjust(2, "0")}.csv', index=False)

In [233]:
important_features = pd.Series(data=spread_model.feature_importances_, index=X_train.columns)
important_features.sort_values(ascending=False,inplace=True)
important_features

pointsAway                         0.060405
averageStars_All PositionsHome     0.033760
away_points                        0.019487
home_points                        0.019050
rankAway                           0.017545
                                     ...   
commits_Special TeamsHome          0.000000
averageStars_Defensive BackHome    0.000000
averageStars_QuarterbackHome       0.000000
defensiveTDsHome                   0.000000
week                               0.000000
Length: 295, dtype: float32