In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn import preprocessing
from NFLUtils import NFLUtils
import optuna
%matplotlib inline
# Shared instance of the project-local helper class; its `backtest_model`
# method is called from the hyperparameter-tuning loop later in the notebook.
nfl_utils = NFLUtils()

import sklearn as sk
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso



In [2]:
def set_all_seeds(seed=42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global RNG and PyTorch
    (CPU generator plus every CUDA device), then switches cuDNN to
    deterministic mode.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    # Function-scope import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe without a GPU: PyTorch documents this call as silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic kernels so repeated
    # runs select the same algorithms.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

### Load CSV
cp Combined.csv ~/drive/Notes/ML/Pytorch/footballData/n

In [3]:
# Load the sliding-window feature table from the local data folder.
df = pd.read_csv(
    "./footballData/CombinedSlidingWindow4.csv",
    index_col=False,
    low_memory=False,
)

# Rows stay in file order (shuffling is deliberately left disabled): the
# hold-out set below is carved from the tail of the frame.
df.info()

# Reserve the last `test_performance_size` rows as the performance set and
# keep everything before them for training.
test_performance_size = 200
test_performance_df = df.iloc[len(df) - test_performance_size:]
df = df.iloc[:len(df) - test_performance_size]
print(f'df after perf set removed: {df.shape}')
print(f'df perf set size {test_performance_df.shape}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5479 entries, 0 to 5478
Columns: 209 entries, Unnamed: 0 to D_datediff
dtypes: float64(133), int64(67), object(9)
memory usage: 8.7+ MB
df after perf set removed: (5279, 209)
df perf set size (200, 209)


### Remove performance-set rows that have no odds data

In [4]:
# Keep only hold-out rows whose 'D_start_odds' is non-zero (per the section
# heading, a zero is treated as "no odds data"), then refresh the row count.
test_performance_df = test_performance_df[test_performance_df['D_start_odds'].ne(0)]
test_performance_size = len(test_performance_df)

In [5]:

# Continuous feature columns fed to the models, grouped by stat family.
# Order matters: it fixes the column order of the arrays built from this list.
# NOTE(review): the 'D_' prefix presumably marks a home-minus-visitor
# differential, as stated for 'D_datediff' — confirm against the data prep.
cont_cols = (
    # Days since last game (Home - visitor)
    ['D_datediff']

    # First downs
    + ['D_First_Downs']

    # Basic stats
    + [
        'D_Rush', 'D_Yds', 'D_TDs',
        'D_Cmp', 'D_Att', 'D_Yd', 'D_TD', 'D_INT',
        'D_Sacked', 'D_Yards', 'D_Net_Pass_Yards', 'D_Total_Yards',
        'D_Fumbles', 'D_Lost', 'D_Turnovers', 'D_Penalties',
    ]

    # Passing (detailed)
    + [
        'D_passing_att', 'D_passing_cmp', 'D_passing_int', 'D_passing_lng',
        'D_passing_sk', 'D_passing_td', 'D_passing_yds',
    ]

    # Receiving
    + ['D_receiving_lng', 'D_receiving_td', 'D_receiving_yds']

    # Rushing (detailed)
    + ['D_rushing_att', 'D_rushing_lng', 'D_rushing_td', 'D_rushing_yds']

    # Defensive interceptions ('D_def_interceptions_pd' is intentionally left out)
    + [
        'D_def_interceptions_int', 'D_def_interceptions_lng',
        'D_def_interceptions_td', 'D_def_interceptions_yds',
    ]

    # Defensive fumbles
    + ['D_fumbles_ff', 'D_fumbles_fr', 'D_fumbles_td', 'D_fumbles_yds']

    # Defensive tackles ('D_tackles_qbhits' and 'D_tackles_tfl' are intentionally left out)
    + ['D_sk', 'D_tackles_ast', 'D_tackles_comb', 'D_tackles_solo']

    # Kick returns
    + [
        'D_kick_returns_lng', 'D_kick_returns_rt',
        'D_kick_returns_td', 'D_kick_returns_yds',
    ]

    # Punt returns
    + [
        'D_punt_returns_lng', 'D_punt_returns_ret',
        'D_punt_returns_td', 'D_punt_returns_yds',
    ]

    # Punting / scoring
    + [
        'D_punting_lng', 'D_punting_pnt', 'D_punting_yds',
        'D_scoring_fga', 'D_scoring_fgm', 'D_scoring_xpa', 'D_scoring_xpm',
    ]
)


# Target column for training, and the wider label set used on the performance
# set (outcome followed by the start/half-time odds columns).
y_col = ['H_Won']
y_col_perf = y_col + ['H_start_odds', 'V_start_odds', 'H_halftime_odds', 'V_halftime_odds']

# Training frames: features and target taken from the training portion of df.
print(df.shape)
cont_df, y_df = df[cont_cols], df[y_col]

# Performance-set frames: features, outcome + odds, game dates, outcome only.
perf_conts_df = test_performance_df[cont_cols]
perf_y_df = test_performance_df[y_col_perf]
perf_date_df = test_performance_df[['Date']]
perf_y_simple = test_performance_df[y_col]

# Sanity output: shapes first, then the last rows of the performance labels/features.
print(
    cont_df.shape,
    perf_y_df.shape,
    perf_y_df.tail(),
    perf_conts_df.tail(),
    sep='\n',
)

(5279, 209)
(5279, 59)
(155, 5)
      H_Won  H_start_odds  V_start_odds  H_halftime_odds  V_halftime_odds
5429    0.0      1.820000      2.015000         2.585714         1.532857
5430    0.0      3.325000      1.348000         7.592857         1.082857
5431    1.0      5.405556      1.164444         3.908333         1.255000
5432    1.0      1.616000      2.374000         1.288333         3.883333
5433    1.0      1.625000      2.359000         1.546667         2.425000
      D_datediff  D_First_Downs  D_Rush  D_Yds  D_TDs  D_Cmp  D_Att  D_Yd  \
5429        14.0              1     -54    -10     74     -8    123   -13   
5430         0.0              1     -30     -2     20      7     92    -7   
5431        -4.0             -4       0     -7      8      2     49    -2   
5432         3.0              1     -13     -1     24     -8     50   -13   
5433         0.0              4     -24      0    -18     15    -32     6   

      D_TD  D_INT  ...  D_punt_returns_ret  D_punt_returns_td

### 3. Create an array of continuous values
Build the NumPy array `conts` by stacking the continuous columns side by side (one array column per feature).

In [6]:
def _to_matrix(frame):
    """Stack every column of `frame` side by side into a 2-D array (rows x columns)."""
    return np.stack([frame[col].values for col in frame.columns], 1)


# Training arrays.
conts = _to_matrix(cont_df)

# `y_col` is rebound from the list of target names to the target array because
# `y_train` below is built from it. The array is derived from `y_df` only, so
# re-running this cell does not iterate over an already-converted array.
y_col = _to_matrix(y_df)

# test performance set
perf_conts = _to_matrix(perf_conts_df)
perf_y_col = _to_matrix(perf_y_df)
# Rebuilt from the performance frame (same columns as `y_df`) rather than from
# `perf_y_simple` itself, which keeps the cell re-runnable.
perf_y_simple = _to_matrix(test_performance_df[list(y_df.columns)])

perf_date_col = _to_matrix(perf_date_df)


# The whole training portion is used for fitting; cross-validation happens
# inside the search objects.
conts_train = conts
y_train = y_col

https://www.kaggle.com/code/arefehsjd/lasso-xgboost

In [12]:
# Hyperparameter tuning: grid-search each candidate regressor with 5-fold CV on
# the training arrays, then score the refit winner on the held-out performance
# set (MSE, R²) and run the betting backtest.
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# One seed for every stochastic estimator so reruns select the same winners.
TUNING_SEED = 42

# Models named here keep their grid definition but are not tuned.
SKIPPED_MODELS = {"Gradient Boosting"}

models_and_params = {
    "Decision Tree": {
        # Seeded: with max_features below the feature count, the candidate
        # features at each split are drawn at random.
        "model": DecisionTreeRegressor(random_state=TUNING_SEED),
        "params": {
            'max_depth': [6, 8, 10, 12],  # Reduced depth due to sample size
            'min_samples_split': [20, 30, 40],  # Increased to ensure enough samples per split
            'min_samples_leaf': [10, 15, 20],  # Increased to prevent leaf nodes with too few samples
            'max_features': ['sqrt', 'log2', 0.4]  # Kept feature selection options
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=TUNING_SEED),
        "params": {
            'n_estimators': [100, 150, 200],  # Reduced due to dataset size
            'max_depth': [8, 10, 12],  # More conservative depth
            'min_samples_split': [15, 25, 35],  # Increased for better split quality
            'min_samples_leaf': [5, 10, 15],  # Increased minimum samples per leaf
            'max_features': ['sqrt', 'log2'],  # Simplified feature sampling
            'bootstrap': [True]  # Kept bootstrap sampling
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=TUNING_SEED),
        "params": {
            'n_estimators': [100, 150, 200],  # Reduced estimators
            'learning_rate': [0.05, 0.1, 0.15],  # Slightly higher learning rates
            'max_depth': [3, 4, 5],  # More conservative depth
            'subsample': [0.7, 0.8, 0.9],  # Added more aggressive subsampling
            'min_samples_split': [20, 30],  # Increased for better splits
            'min_samples_leaf': [10, 15]  # Added leaf constraint
        }
    },
    "Lasso": {
        # Seeded: selection='random' updates a randomly chosen coefficient
        # each iteration.
        # NOTE(review): the saved output of this cell shows a long run of
        # coordinate-descent warnings — presumably non-convergence for part of
        # the grid; consider scaling the features or raising max_iter.
        "model": Lasso(max_iter=2000, random_state=TUNING_SEED),
        "params": {
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
            'selection': ['cyclic', 'random'],
            'tol': [1e-4, 1e-3, 1e-2],
            'fit_intercept': [True, False],  # Added parameter
            'positive': [True, False]  # Added parameter for non-negative coefficients
        }
    }
}


# Loop through models for hyperparameter tuning
best_estimators = {}
results = []

for model_name, model_info in models_and_params.items():
    # Decide whether to skip before announcing any work, so the log never
    # claims a model was tuned when it was not.
    if model_name in SKIPPED_MODELS:
        print(f"Skipping {model_name}...")
        continue
    print(f"Tuning {model_name}...")

    # Exhaustive search over the grid, ranked by cross-validated negative MSE.
    search = GridSearchCV(
        estimator=model_info["model"],
        param_grid=model_info["params"],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1
    )
    # Fit the model (targets flattened to 1-D, as scikit-learn expects)
    search.fit(conts_train, y_train.ravel())

    # Best estimator and parameters
    best_estimators[model_name] = search.best_estimator_
    best_params = search.best_params_

    # Evaluate on the performance set
    y_pred = search.best_estimator_.predict(perf_conts)
    mse = mean_squared_error(perf_y_simple, y_pred)
    r2 = r2_score(perf_y_simple, y_pred)

    # Betting backtest on the same hold-out rows; the fitted search object is
    # passed through as the model.
    backtest_res = nfl_utils.backtest_model(search, perf_conts, perf_y_col, perf_date_col, initial_capital=1000,
                             position_size=0.05, confidence_threshold=0.0, show_plot=False)
    print(backtest_res)

    # Store results
    results.append({
        "Model": model_name,
        "Best Parameters": best_params,
        "MSE": mse,
        "R² Score": r2
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display results
print(results_df)

Tuning Decision Tree...
{'final_value': 524.98276804098, 'roi': -0.47501723195902, 'win_rate': 0.5855263157894737, 'max_drawdown': 0.4907035622419675, 'total_bets': 152}
Tuning Random Forest...
{'final_value': 1239.5967715447623, 'roi': 0.2395967715447623, 'win_rate': 0.6623376623376623, 'max_drawdown': 0.30009841734738635, 'total_bets': 154}
Tuning Gradient Boosting...
Tuning Lasso...
{'final_value': 1029.4029204078847, 'roi': 0.02940292040788472, 'win_rate': 0.6493506493506493, 'max_drawdown': 0.3917239753929464, 'total_bets': 154}
           Model                                    Best Parameters       MSE  \
0  Decision Tree  {'max_depth': 6, 'max_features': 'sqrt', 'min_...  0.245830   
1  Random Forest  {'bootstrap': True, 'max_depth': 10, 'max_feat...  0.232560   
2          Lasso  {'alpha': 0.01, 'fit_intercept': True, 'positi...  0.231122   

   R² Score  
0  0.011701  
1  0.065052  
2  0.070832  


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
