In [13]:

import pickle
import pandas as pd
import copy

# import sklearn
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error

# feature selection
from sklearn.feature_selection import SequentialFeatureSelector

In [70]:
# COMMIT EVERY DAY
#add log statements

class Rb_GradientBooster:
    def __init__(self, hist = True):
        print("Initializing...")
        self.__train_years = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 1010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
        self.__test_years = [2021, 2022]
        self.__hist = hist
        fantasy_data = pd.read_pickle("fantasy_data.pkl")

        self.__features = [
            'pacr', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 
            'rushing_fumbles_lost', 'rushing_epa', 'receptions', 'targets', 
            'receiving_yards', 'receiving_tds', 'receiving_fumbles', 
            'receiving_air_yards', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 
            'racr', 'target_share', 'air_yards_share', 'wopr_x', 
            'tgt_sh', 'ay_sh', 
            'wopr_y', 'ry_sh', 'rtd_sh', 'rfd_sh', 'rtdfd_sh', 'w8dom', 
            'fantasy_points_half_ppr', 'ir_games', 'out_games',
            'carries/game', 
            'yards/carry', 'rushing_tds/game', 'receiving_tds/game', 'turnovers/game', 'adot', 'targets_game', 
            'height', 'weight', 'years_exp', 'age', 'depth_team', 'win_total_adj_line', 
            'draft_value', 'forty', 'bench', 'vertical', 'broad_jump', 'cone', 
            'shuttle',
            'ppr/game', 'half_ppr/game', 'receiving_epa_team_tes', 'rushing_epa_team_qbs',  
            'receiving_epa_team_wrs', 'receiving_epa_team_rbs', 'rushing_epa_team_rbs', 'dakota_team_qbs'
            ]

        rb_data = fantasy_data.loc[fantasy_data['position'] == 'RB']
        rb_data = rb_data.dropna(subset = ['fantasy_points_half_ppr_future'])

        # Imputation is required for gradient boosting class
        # iterative imputer is a bayesian ridge regression model
        # random state ON to control seeding/ control variables for feature evaluation
        if hist == False:
            print("Imputing missing values...")
            imputer =IterativeImputer(max_iter=500, n_nearest_features=5, 
                initial_strategy='median', random_state=42, 
                add_indicator=False)
            imputer.set_output(transform = 'pandas')
            rb_data[self.__features] = imputer.fit_transform(rb_data[self.__features])
       

        print("Preparing cross-validation sets")
        self.__rb_test = rb_data.loc[rb_data['season'].isin(self.__train_years)]
        self.__rb_train = rb_data.loc[rb_data['season'].isin(self.__test_years)]
        self.__rb_data = rb_data
        print("Model is ready to train")
        
    def set_model(self):
        # what is friedman mse and why is it better?
        # min samples split --> less overfitting?
        # random state on?
        # n estimators is number of trees in the ensemble
        # use max leaf nodes instead of max depth??
        # validation_fraction is 0.0 because we are using our own cross validation methods
        # can use impurity decreate, max depth, or max leaf nodes. impurity decrease measures the MSE loss of a node
        # can use a combo of size and impurity based limits
        # n_iter_no_change, validation fraction focus on early stopping (validation fraction only used is n is integer)
        print(self.__hist)
        if self.__hist == True:
            # can insert 'college' into categorical features
            self.model = HistGradientBoostingRegressor(loss='squared_error', quantile=None,
                learning_rate=0.1, max_iter=100, max_leaf_nodes=31, max_depth=None, min_samples_leaf=20, 
                l2_regularization=0.0, max_features=1.0, max_bins=255, categorical_features='from_dtype', 
                monotonic_cst=None, interaction_cst=None, warm_start=False, early_stopping='auto', scoring='loss', 
                validation_fraction=0.1, n_iter_no_change=10, tol=0.1, verbose=0, random_state=42)
        elif self.__hist == False:
            self.model = GradientBoostingRegressor(loss='squared_error', learning_rate=0.1, n_estimators=100,
                criterion='friedman_mse', min_samples_split=250, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                max_depth=None, min_impurity_decrease=64.0, init=None, random_state=42, max_features=None, alpha=0.9, 
                verbose=0, max_leaf_nodes=15, warm_start=False, validation_fraction=0.1, n_iter_no_change=None,
                tol=0.1, ccp_alpha=0.0)
            
    def train_model(self):
        self.model.fit(self.__rb_train[self.__features], self.__rb_train['fantasy_points_half_ppr_future'])


    def test(self):
        # staged predict: returns each stage of the prediction of the test set, vs just the final
        self.__rb_test['predictions'] = self.model.predict(self.__rb_test[self.__features])
        self.__rb_train['predictions'] = self.model.predict(self.__rb_train[self.__features])
                
        # stage_errors = []
        # for i, y_pred_stage in enumerate(self.model.staged_predict(self.__rb_test)):
        #     mse = mean_squared_error(self.__rb_test['fantasy_points_half_ppr'], y_pred_stage)
        #     stage_errors.append(mse)
        #     print(f"Iteration {i+1}: MSE = {mse}")

    def set_features(self):
        print("Setting features...")
        # make feature evaluation robust to outliers
        # fantasy point evaluation is based on absolute points - risk is not part of strategy
        model = copy.deepcopy(self.model)
        selector = SequentialFeatureSelector(model, n_features_to_select='auto', tol=0.4, direction='forward',
                                             scoring="neg_median_absolute_error", cv=5, n_jobs=-1)
        selector.fit(self.__rb_train[self.__features], self.__rb_train['fantasy_points_half_ppr_future'] )
        indices_selected_arr = selector.get_support(indices = True).tolist()
        features = []
        for i in indices_selected_arr:
            features.append(self.__features[i])
        self.__features = features
        print("Features set.")

    def cross_validate(self):
        self.test_mse = mean_squared_error(self.__rb_test['fantasy_points_half_ppr_future'],self.__rb_test['predictions'])
        self.test_mae = mean_absolute_error(self.__rb_test['fantasy_points_half_ppr_future'],self.__rb_test['predictions'])
        self.train_mse = mean_squared_error(self.__rb_train['fantasy_points_half_ppr_future'],self.__rb_train['predictions'])
        self.train_mae = mean_absolute_error(self.__rb_train['fantasy_points_half_ppr_future'],self.__rb_train['predictions'])
        
    def __str__(self):
        model_string = "Features: \n"
        model_string = model_string + str(self.__features) + "\n"
        # intentional side effect
        print(self.__rb_test[['player_name', 'predictions','fantasy_points_half_ppr_future', 'season']])
        self.__rb_test[['player_name', 'predictions', 'fantasy_points_half_ppr_future', 'season']].to_csv('predictions.csv')
        self.cross_validate()
        model_string += "Test MSE: " + str(self.test_mse) + "\n"
        model_string += "Test MAE: " + str(self.test_mae) + "\n"
        model_string += "Train MSE: " + str(self.train_mse) + "\n"
        model_string += "Test MSE: " + str(self.train_mae) + "\n"
        return model_string
        
        
    

In [71]:
model = Rb_GradientBooster(hist = True)

Initializing...
Preparing cross-validation sets
Model is ready to train


In [72]:
model.set_model()

True


In [73]:
model.set_features()

Setting features...
Features set.


In [74]:
model.train_model()

In [75]:
model.test()

print(model)

         player_name  predictions  fantasy_points_half_ppr_future  season
75    Ricky Williams   127.600413                           230.0    2008
76    Ricky Williams   175.857682                           104.9    2009
79      Sammy Morris    55.387898                           111.7    2003
80      Sammy Morris   136.705379                            21.2    2004
82      Sammy Morris    46.888602                           137.3    2007
...              ...          ...                             ...     ...
5583      Nick Chubb   220.909913                           239.1    2018
5599   Kalen Ballage    50.672365                            44.8    2018
5625   Royce Freeman    68.938820                           120.7    2018
5628  Saquon Barkley   175.857682                           218.1    2018
5633     Sony Michel    98.789644                           146.6    2018

[914 rows x 4 columns]
Features: 
['fantasy_points_half_ppr']
Test MSE: 4892.761263544685
Test MAE: 52.31164569

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.__rb_test['predictions'] = self.model.predict(self.__rb_test[self.__features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.__rb_train['predictions'] = self.model.predict(self.__rb_train[self.__features])


In [40]:
# def shap(self):

    # def permutation_importance

        


# use sklearn gradient booster
# set up corr matrix
# todo: set on only top k
# set up permutation importance
#calculate other rbs_epa
# todo: set up shap values

## output:
# corr matrix
# outputs predictions
# outputs feature imp
# outputs MSE, MAE