In [1]:
import pandas as pd
import numpy as np
import sys
 
# Put the local fpl-model checkout first on sys.path so the `fpl_model`
# imports below can be resolved.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# not run elsewhere without editing it (consider installing the package or
# using a configurable/relative path).
sys.path.insert(0, '/Users/dominicbates/Documents/Github/fpl-model/')
from fpl_model.load_data import load_data
from fpl_model.process_data import do_all_processing_steps

### Load and process data

In [None]:
# Load the raw data and feed it straight into the processing pipeline;
# history_size=5 is forwarded unchanged to the processing step.
df = do_all_processing_steps(load_data(), history_size=5)


Loading data...
Loading data for years: ['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17']
... Processing year: 2022-23
... Processing year: 2021-22
... Processing year: 2020-21
... Processing year: 2019-20
... Processing year: 2018-19
... Processing year: 2017-18
... Processing year: 2016-17

Dropping Nulls
... Size: 166653
... New Size: 165873
... 780 rows dropped

Data loaded!

Processing opponent feature (goals conceded over last N weeks)...
... 0 / 165873 rows complete
... 10000 / 165873 rows complete
... 20000 / 165873 rows complete
... 30000 / 165873 rows complete
... 40000 / 165873 rows complete
... 50000 / 165873 rows complete
... 60000 / 165873 rows complete
... 70000 / 165873 rows complete
... 80000 / 165873 rows complete
... 90000 / 165873 rows complete
... 100000 / 165873 rows complete
... 110000 / 165873 rows complete
... 120000 / 165873 rows complete
... 130000 / 165873 rows complete
... 140000 / 165873 rows complete
... 150000 / 165873 rows c

In [None]:
def get_features_from_groups(feature_groups, columns=None):
    '''
    Select feature columns ('f|...|...') whose name contains any of the given keys.

    Parameters
    ----------
    feature_groups : iterable of str
        Keys to look for. Matching is by substring, so a key such as
        'current' also matches a column containing 'current_season'.
    columns : iterable of str, optional
        Column names to search. When omitted, the columns of the
        notebook-level dataframe `df` are used (original behaviour).

    Returns
    -------
    list of str
        Matching column names, in the order they appear in `columns`.
    '''
    if columns is None:
        # Fall back to the global dataframe so existing calls keep working.
        columns = list(df)
    return [col for col in columns
            if 'f|' in col and any(group in col for group in feature_groups)]


# Column the model is trained to predict.
target = 'total_points'

# Keys used to pick feature columns. They are matched as substrings of the
# 'f|...' column names by get_features_from_groups().
feature_groups = [
    'total_points',
    'minutes',
    'was_home',
    'opponent_gc_history',
    'opponent_gc_history_available',
    'current_season',
    'player_exists',
    'current',
]

features = get_features_from_groups(feature_groups)

### Create model

In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor


# Default hyper-parameters, keyed by model name. The 'XGBoost' entry holds the
# keyword values handed to XGBRegressor; 'LinearRegression' has no defaults.
# 'min_child_weight' (1) and 'lambda' (1) are currently disabled.
default_configs = dict(
    XGBoost=dict(
        max_depth=6,
        eta=0.3,
        n_estimators=10,
        subsample=1,
        num_parallel_tree=1,
    ),
    LinearRegression=None,
)



class Classifier:
    '''
    Small wrapper around a regression model ('XGBoost' or 'LinearRegression')
    that stores the feature list, fits the model, reports MAE/MSE and can be
    pickled to disk.
    '''

    def __init__(self,
                 features,
                 classifier='XGBoost',
                 config=None):
        '''
        Build the underlying (unfitted) model.

        Parameters
        ----------
        features : list
            Feature names; stored on the instance as `self.features`.
        classifier : str
            Either 'XGBoost' or 'LinearRegression'.
        config : dict or None
            Model parameters. When None, `default_configs[classifier]` is used.

        Raises
        ------
        ValueError
            If `classifier` is not one of the supported names.
        '''
        # Check input first
        classifiers_list = ['LinearRegression',
                            'XGBoost']
        if classifier not in classifiers_list:
            raise ValueError('Classifier "' + str(classifier)
                             + '" not in list. Try one of: ' + str(classifiers_list))

        # Set config params
        if config is None:
            self.config = default_configs[classifier]
        else:
            self.config = config

        # Get relevant model
        self.classifier = classifier
        if classifier == 'LinearRegression':
            # The default config for this model is None, i.e. no keyword arguments.
            self.model = LinearRegression(**(self.config or {}))
        elif classifier == 'XGBoost':
            self.model = XGBRegressor(max_depth=self.config['max_depth'],
                                      eta=self.config['eta'],
                                      n_estimators=self.config['n_estimators'],
                                      subsample=self.config['subsample'],
                                      num_parallel_tree=self.config['num_parallel_tree'])

        # Flipped to True by fit(); apply() refuses to predict before that.
        self.model_fit = False

        # Store features in model
        self.features = features

    def fit(self, vals_X, vals_y, weights=None):
        '''
        Fit the model on (vals_X, vals_y) and print the in-sample MAE and MSE.

        `weights` is passed to the model as `sample_weight` and is also used
        to weight the reported errors.
        '''
        self.model.fit(vals_X, vals_y, sample_weight=weights)
        self.model_fit = True
        pred_y = self.apply(vals_X)
        self.print_performance(vals_y, pred_y, weights=weights, print_output=True)

    def apply(self, vals_X):
        '''
        Return the model's predictions for vals_X.

        Raises
        ------
        ValueError
            If fit() has not been run on this instance yet.
        '''
        if not self.model_fit:
            raise ValueError('Model not trained yet. Run fit() first')
        return self.model.predict(vals_X)

    def print_performance(self, vals_y, pred_y, weights=None, print_output=True):
        '''
        Compute the (optionally weighted) mean absolute and mean squared error.

        The two values are printed when `print_output` is True and are always
        returned as the tuple (mae, mse).
        '''
        mae_val = mean_absolute_error(vals_y, pred_y, sample_weight=weights)
        mse_val = mean_squared_error(vals_y, pred_y, sample_weight=weights)

        if print_output:
            print('\nMean Absolute Error:')
            print(mae_val)

            print('\nMean Squared Error:')
            print(mse_val)

        return mae_val, mse_val

    def save_model(self, fpath):
        '''
        Pickle this whole object (wrapper and underlying model) to `fpath`.
        '''
        print('Saving model to:', fpath)
        with open(fpath, 'wb') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Model saved!')

    # def print_top_words(self, vectorizer, n = 100):

    #     for c in range(0,len(self.model.classes_)):
    #         top_ind = np.argsort(self.model.coef_[c])[-1*n:][::-1]
    #         top_vals = ([self.model.coef_[c][i] for i in top_ind])

    #         inverse_vocabulary = dict((v,k) for k,v in vectorizer.vocabulary_.items())
    #         print('\nTop words for class:',self.model.classes_[c])
    #         words = [inverse_vocabulary[n]+', ' for n in top_ind]
    #         print(''.join(words)[:-2])
    #         print('')

    


def load_model(fpath):
    '''
    Unpickle and return the object stored at `fpath`.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    '''
    print('Loading model from:', fpath)
    with open(fpath, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
        print('Model loaded!')
    return loaded


### Test fit

In [None]:
# Unfitted XGBoost model using the default config.
model = Classifier(features=features, classifier='XGBoost')

In [None]:
# Boolean row masks: the 2022-23 season is held out, everything else is training data.
m_train = df['season'].ne('2022-23')
m_test = df['season'].eq('2022-23')

In [None]:
# Fit on the training rows only (feature columns as X, target column as y).
model.fit(df.loc[m_train, features], df.loc[m_train, target])

In [None]:
# Predict for every row of df (no train/test mask is applied here) and store
# the result in a new 'p|total_points' column.
df['p|total_points'] = model.apply(df[features])

In [None]:
# Spot-check: show all rows for a single player.
df.loc[df['name_cleaned'] == 'erling haaland']