In [8]:
import pandas as pd
import numpy as np
import sys
import copy
 
sys.path.insert(0, '/Users/dominicbates/Documents/Github/fpl-model/')
from fpl_model.load_data import load_data
from fpl_model.process_data import do_all_processing_steps

### Load and process data

In [2]:
df = load_data()
df = do_all_processing_steps(df, history_size = 5)


Loading data...
Loading data for years: ['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17']
... Processing year: 2022-23
... Processing year: 2021-22
... Processing year: 2020-21
... Processing year: 2019-20
... Processing year: 2018-19
... Processing year: 2017-18
... Processing year: 2016-17

Dropping Nulls
... Size: 166653
... New Size: 165873
... 780 rows dropped

Data loaded!

Processing opponent feature (goals conceded over last N weeks)...
... 0 / 165873 rows complete
... 10000 / 165873 rows complete
... 20000 / 165873 rows complete
... 30000 / 165873 rows complete
... 40000 / 165873 rows complete
... 50000 / 165873 rows complete
... 60000 / 165873 rows complete
... 70000 / 165873 rows complete
... 80000 / 165873 rows complete
... 90000 / 165873 rows complete
... 100000 / 165873 rows complete
... 110000 / 165873 rows complete
... 120000 / 165873 rows complete
... 130000 / 165873 rows complete
... 140000 / 165873 rows complete
... 150000 / 165873 rows c

In [3]:
def get_features_from_groups(feature_groups):
    '''
    Selects all features ('f|...|...') in dataframe containing any of these keys 
    '''
    return [col for col in list(df) if np.sum([f in col for f in feature_groups])>0 and ('f|' in col)]


feature_groups = ['total_points', 
                  'minutes', 
                  'was_home', 
                  'opponent_gc_history', 
                  'opponent_gc_history_available',
                  'current_season',
                  'player_exists',
                  'current']

feature_groups_ext = ['total_points', 
                  'minutes', 
                  'was_home', 
                  'opponent_gc_history', 
                  'opponent_gc_history_available',
                  'current_season',
                  'player_exists',
                  'current',
                  'goals_conceded',
                  'goals_scored',
                  'assists']

features = get_features_from_groups(feature_groups)
features_ext = get_features_from_groups(feature_groups_ext)
target = 'total_points'

### Create model

In [4]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
import time


default_configs = {'XGBoost':{'max_depth':6,
                              'eta':0.3,
                              'n_estimators':10,
#                               'min_child_weight':1,
                              'subsample':1,
#                               'lambda':1,
                              'num_parallel_tree':1
                              },
                   'LinearRegression':None,
                    }



class Classifier:
    def __init__(self, 
                 features,
#                  target,
                 classifier='XGBoost',
                 config=None):
        
        '''
        Model object
        '''
        
        # Check input first
        classifiers_list = ['LinearRegression',
                            'XGBoost']
        if classifier not in classifiers_list:
            raise ValueError('Classifier "'+str(classifier)+'" not in list. Try one of:',classifiers_list)
        
        # Set config params
        if config is None:
            self.config = default_configs[classifier]
        else:
            self.config = config

        # Get relevant model
        self.classifier = classifier
        if classifier == 'LogisticRegression':
            self.model = LogisticRegression(C=2, penalty = 'l2', max_iter=1000, random_state=0, fit_intercept=True)
        elif classifier == 'XGBoost':
            self.model = XGBRegressor(max_depth=self.config['max_depth'],
                                      eta=self.config['eta'],
                                      n_estimators = self.config['n_estimators'],
#                                        min_child_weight=self.config['min_child_weight'],
                                      subsample=self.config['subsample'],
#                                        reg_lambda=self.config['lambda'],
                                      num_parallel_tree=self.config['num_parallel_tree'])
        self.model_fit = False

        # Store features in model
        self.features = features
#         self.taget = target
            

            
    def fit(self, vals_X, vals_y, weights = None):
        self.model.fit(vals_X, vals_y, sample_weight= weights)
        pred_y = self.apply(vals_X)
        self.performance_metrics(vals_y, pred_y, weights = weights, print_output=True)
        
        
    def apply(self, vals_X):
        if self.model_fit is None:
            raise ValueError('Model not trained yet. Run fit() first')
        else:
            return self.model.predict(vals_X)

#     def apply_proba(self, vals_X):
#         if self.model_fit is None:
#             raise ValueError('Model not trained yet. Run fit() first')
#         else:
#             return self.model.predict_proba(vals_X)
    
    
    def performance_metrics(self, vals_y, pred_y, weights = None, print_output=True):
        mae_val = mean_absolute_error(vals_y, pred_y, sample_weight = weights)
        mse_val = mean_squared_error(vals_y, pred_y, sample_weight = weights)

        if print==True:
            print('\nMean Absolute Error:')
            print(mae_val)
            
            print('\nMean Squared Error:')
            print(mse_val)
        else:
            return mae_val, mse_val
        
    def save_model(self, fpath):
        print('Saving model to:',fpath)
        with open(fpath, 'wb') as handle:
            pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print('Model saved!')

    # def print_top_words(self, vectorizer, n = 100):

    #     for c in range(0,len(self.model.classes_)):
    #         top_ind = np.argsort(self.model.coef_[c])[-1*n:][::-1]
    #         top_vals = ([self.model.coef_[c][i] for i in top_ind])

    #         inverse_vocabulary = dict((v,k) for k,v in vectorizer.vocabulary_.items())
    #         print('\nTop words for class:',self.model.classes_[c])
    #         words = [inverse_vocabulary[n]+', ' for n in top_ind]
    #         print(''.join(words)[:-2])
    #         print('')

    


def load_model(fpath):
    print('Loading model from:',fpath)
    with open(fpath, 'rb') as handle:
        model = pickle.load(handle)
        print('Model loaded!')
    return model


### Crossvalidation

In [113]:
def do_cross_validation(df, features, config):
    model = Classifier(features, classifier='XGBoost', config=config)
    mae_sqrt_vals = []
    mse_sqrt_vals = []
    mae_full_vals = []
    mse_full_vals = []
    all_seasons = ['2016-17','2017-18','2018-19','2019-20','2020-21','2021-22','2022-23']
    # Train model missing each seson info\
    count  = 1
    t1 = time.time()
    for season in all_seasons:
        # Split dataset
        tloop = time.time()
        df_train = df[df['season'] != season].reset_index(drop=True)
        df_test = df[df['season'] == season].reset_index(drop=True)
        
        # Fit model
        model.fit(df_train[features], df_train[target], weights = df_train['selected_weight']**0.5)
        
        # Apply to test data
        df_test['preds'] = model.apply(df_test[features])
        mae_sqrt, mse_sqrt = model.performance_metrics(df_test[target], 
                                             df_test['preds'], 
                                             weights = df_test['selected_weight']**0.5, 
                                             print_output = False)
        mae_full, mse_full = model.performance_metrics(df_test[target], 
                                             df_test['preds'], 
                                             weights = df_test['selected_weight'], 
                                             print_output = False)
        # Store in list
        mae_sqrt_vals.append(mae_sqrt)
        mse_sqrt_vals.append(mse_sqrt)
        mae_full_vals.append(mae_full)
        mse_full_vals.append(mse_full)
#         print('...',count,'/',len(all_seasons),'of cross validations done - took',np.round((time.time()-tloop)/60, 2),'mins')
        count+=1
    print('... crossvalidation done - took',np.round((time.time()-t1)/60, 2),'mins total')
        
    # Get mean and standard error for metrics
    mae_sqrt_mean = np.mean(mae_sqrt_vals)
    mae_sqrt_ste = np.std(mae_sqrt_vals) / (len(all_seasons)**0.5)
    mse_sqrt_mean = np.mean(mse_sqrt_vals)
    mse_sqrt_ste = np.std(mse_sqrt_vals) / (len(all_seasons)**0.5)
    
    mae_full_mean = np.mean(mae_full_vals)
    mae_full_ste = np.std(mae_full_vals) / (len(all_seasons)**0.5)
    mse_full_mean = np.mean(mse_full_vals)
    mse_full_ste = np.std(mse_full_vals) / (len(all_seasons)**0.5)
    
    return mae_sqrt_mean, mae_sqrt_ste, mse_sqrt_mean, mse_sqrt_ste, mae_full_mean, mae_full_ste, mse_full_mean, mse_full_ste, 
    




### Test 1

In [107]:
config_ranges = {'max_depth':[3,5,10,20],
                 'eta':[0.02,0.1,0.25,0.5,0.75,1],
                 'n_estimators:':[5,10,20,50]} # Only affects things if using bagging

temp_config = {'max_depth':None,
               'eta':None,
               'n_estimators':None,
               'subsample':1,
               'num_parallel_tree':1}

fname_results = '/Users/dominicbates/Documents/Github/fpl-model/hyperparam_fitting/test.csv'

df_results = pd.DataFrame()
# Loop through parameter combinations
for n_estimators in [5,10,20,50]:
    for max_depth in [3,5,10,20]:
        for eta in [0.05,0.1,0.25,0.5,1]:
            print('Doing model...','n_estimators:',n_estimators,'max_depth:',max_depth,'eta:',eta)
            # Set config
            temp_config['n_estimators'] = n_estimators
            temp_config['max_depth'] = max_depth
            temp_config['eta'] = eta
            # Do crosssvalidation
            mae_mean, mae_ste, mse_mean, mse_ste = do_cross_validation(df, features, temp_config)
            # 
            results = pd.DataFrame({'mae_mean':[mae_mean],
                                    'mae_ste':[mae_ste],
                                    'mse_mean':[mse_mean],
                                    'mse_ste':[mae_ste],
                                    'param|n_estimators':[n_estimators],
                                    'param|max_depth':[max_depth],
                                    'param|eta':[eta]})
            df_results = pd.concat([df_results,results]).reset_index(drop=True)
            
            df_results.to_csv(fname_results,index=False)


Doing model... n_estimators: 5 max_depth: 3 eta: 0.05
... crossvalidation done - took 0.09 mins total
Doing model... n_estimators: 5 max_depth: 3 eta: 0.1
... crossvalidation done - took 0.09 mins total
Doing model... n_estimators: 5 max_depth: 3 eta: 0.25
... crossvalidation done - took 0.09 mins total
Doing model... n_estimators: 5 max_depth: 3 eta: 0.5
... crossvalidation done - took 0.09 mins total
Doing model... n_estimators: 5 max_depth: 3 eta: 1
... crossvalidation done - took 0.09 mins total
Doing model... n_estimators: 5 max_depth: 5 eta: 0.05
... crossvalidation done - took 0.11 mins total
Doing model... n_estimators: 5 max_depth: 5 eta: 0.1
... crossvalidation done - took 0.11 mins total
Doing model... n_estimators: 5 max_depth: 5 eta: 0.25
... crossvalidation done - took 0.11 mins total
Doing model... n_estimators: 5 max_depth: 5 eta: 0.5
... crossvalidation done - took 0.11 mins total
Doing model... n_estimators: 5 max_depth: 5 eta: 1
... crossvalidation done - took 0.11 m

In [None]:
df = load_data()
df = do_all_processing_steps(df, history_size = 5)

### Test 2

Testing data processing hyperparameters too

In [9]:

temp_config = {'max_depth':None,
               'eta':None,
               'n_estimators':None,
               'subsample':1,
               'num_parallel_tree':1}

fname_results = '/Users/dominicbates/Documents/Github/fpl-model/hyperparam_fitting/test2.csv'

df_results = pd.DataFrame()
# Loop through parameter combinations
for history_size in [5,10]:
    df = load_data()
    df = do_all_processing_steps(df, history_size = history_size)
    for feature_set in ['normal','extended']:
        if feature_set == 'normal':
            f = copy.copy(features)
        else:
            f = copy.copy(features_ext)
        for n_estimators in [10,20,50]:
            for max_depth in [3, 5, 10]:
                for eta in [0.1,0.05,0.025]:
                    print('Doing model...','n_estimators:',n_estimators,'max_depth:',max_depth,'eta:',eta)
                    # Set config
                    temp_config['n_estimators'] = n_estimators
                    temp_config['max_depth'] = max_depth
                    temp_config['eta'] = eta
                    # Do crosssvalidation
                    mae_mean, mae_ste, mse_mean, mse_ste = do_cross_validation(df, f, temp_config)
                    # 
                    results = pd.DataFrame({'mae_mean':[mae_mean],
                                            'mae_ste':[mae_ste],
                                            'mse_mean':[mse_mean],
                                            'mse_ste':[mae_ste],
                                            'param|n_estimators':[n_estimators],
                                            'param|max_depth':[max_depth],
                                            'param|eta':[eta],
                                            'param|history_size':[history_size],
                                            'param|feature_set':[feature_set]})
                    df_results = pd.concat([df_results,results]).reset_index(drop=True)

                    df_results.to_csv(fname_results,index=False)



Loading data...
Loading data for years: ['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17']
... Processing year: 2022-23
... Processing year: 2021-22
... Processing year: 2020-21
... Processing year: 2019-20
... Processing year: 2018-19
... Processing year: 2017-18
... Processing year: 2016-17

Dropping Nulls
... Size: 166653
... New Size: 165873
... 780 rows dropped

Data loaded!

Processing opponent feature (goals conceded over last N weeks)...
... 0 / 165873 rows complete
... 10000 / 165873 rows complete
... 20000 / 165873 rows complete
... 30000 / 165873 rows complete
... 40000 / 165873 rows complete
... 50000 / 165873 rows complete
... 60000 / 165873 rows complete
... 70000 / 165873 rows complete
... 80000 / 165873 rows complete
... 90000 / 165873 rows complete
... 100000 / 165873 rows complete
... 110000 / 165873 rows complete
... 120000 / 165873 rows complete
... 130000 / 165873 rows complete
... 140000 / 165873 rows complete
... 150000 / 165873 rows c

... 120000 / 165873 rows complete
... 130000 / 165873 rows complete
... 140000 / 165873 rows complete
... 150000 / 165873 rows complete
... 160000 / 165873 rows complete

Feature processed!

Processing features...
Processing dataframe binned features
... 0 / 165873 rows complete
... 10000 / 165873 rows complete
... 20000 / 165873 rows complete
... 30000 / 165873 rows complete
... 40000 / 165873 rows complete
... 50000 / 165873 rows complete
... 60000 / 165873 rows complete
... 70000 / 165873 rows complete
... 80000 / 165873 rows complete
... 90000 / 165873 rows complete
... 100000 / 165873 rows complete
... 110000 / 165873 rows complete
... 120000 / 165873 rows complete
... 130000 / 165873 rows complete
... 140000 / 165873 rows complete
... 150000 / 165873 rows complete
... 160000 / 165873 rows complete

Features processed!

One-hot encoding all columns ending in "|"...
Finished!
Doing model... n_estimators: 10 max_depth: 3 eta: 0.1
... crossvalidation done - took 0.11 mins total
Doing

### Test 4

In [114]:

temp_config = {'max_depth':None,
               'eta':None,
               'n_estimators':None,
               'subsample':1,
               'num_parallel_tree':1}

fname_results = '/Users/dominicbates/Documents/Github/fpl-model/hyperparam_fitting/test3_sqrt_weights.csv'

df_results = pd.DataFrame()
# Loop through parameter combinations
for history_size in [10]:
#     df = load_data()
#     df = do_all_processing_steps(df, history_size = history_size)
    for feature_set in ['normal']:
        if feature_set == 'normal':
            f = copy.copy(features)
        else:
            f = copy.copy(features_ext)
        for n_estimators in [10,20,50,100]:
            for max_depth in [3, 5, 10]:
                for eta in [0.2,0.1,0.05,0.025]:
                    print('Doing model...','n_estimators:',n_estimators,'max_depth:',max_depth,'eta:',eta)
                    # Set config
                    temp_config['n_estimators'] = n_estimators
                    temp_config['max_depth'] = max_depth
                    temp_config['eta'] = eta
                    # Do crosssvalidation
                    mae_sqrt_mean, mae_sqrt_ste, mse_sqrt_mean, mse_sqrt_ste, mae_full_mean, mae_full_ste, mse_full_mean, mse_full_ste = do_cross_validation(df, f, temp_config)
                    # 
                    results = pd.DataFrame({'mae_sqrt_mean':[mae_sqrt_mean],
                                            'mae_sqrt_ste':[mae_sqrt_ste],
                                            'mse_sqrt_mean':[mse_sqrt_mean],
                                            'mse_sqrt_ste':[mae_sqrt_ste],
                                            'mae_full_mean':[mae_full_mean],
                                            'mae_full_ste':[mae_full_ste],
                                            'mse_full_mean':[mse_full_mean],
                                            'mse_full_ste':[mae_full_ste],
                                            'param|n_estimators':[n_estimators],
                                            'param|max_depth':[max_depth],
                                            'param|eta':[eta],
                                            'param|history_size':[history_size],
                                            'param|feature_set':[feature_set]})
                    df_results = pd.concat([df_results,results]).reset_index(drop=True)

                    df_results.to_csv(fname_results,index=False)


Doing model... n_estimators: 10 max_depth: 3 eta: 0.2
... crossvalidation done - took 0.11 mins total
Doing model... n_estimators: 10 max_depth: 3 eta: 0.1
... crossvalidation done - took 0.11 mins total
Doing model... n_estimators: 10 max_depth: 3 eta: 0.05
... crossvalidation done - took 0.1 mins total
Doing model... n_estimators: 10 max_depth: 3 eta: 0.025
... crossvalidation done - took 0.11 mins total
Doing model... n_estimators: 10 max_depth: 5 eta: 0.2
... crossvalidation done - took 0.13 mins total
Doing model... n_estimators: 10 max_depth: 5 eta: 0.1
... crossvalidation done - took 0.13 mins total
Doing model... n_estimators: 10 max_depth: 5 eta: 0.05
... crossvalidation done - took 0.13 mins total
Doing model... n_estimators: 10 max_depth: 5 eta: 0.025
... crossvalidation done - took 0.13 mins total
Doing model... n_estimators: 10 max_depth: 10 eta: 0.2
... crossvalidation done - took 0.21 mins total
Doing model... n_estimators: 10 max_depth: 10 eta: 0.1
... crossvalidation d

KeyboardInterrupt: 

### Test best model on recent season

In [37]:
df = load_data()
df = do_all_processing_steps(df, history_size = 10)


Loading data...
Loading data for years: ['2022-23', '2021-22', '2020-21', '2019-20', '2018-19', '2017-18', '2016-17']
... Processing year: 2022-23
... Processing year: 2021-22
... Processing year: 2020-21
... Processing year: 2019-20
... Processing year: 2018-19
... Processing year: 2017-18
... Processing year: 2016-17

Dropping Nulls
... Size: 166653
... New Size: 165873
... 780 rows dropped

Data loaded!

Processing opponent feature (goals conceded over last N weeks)...
... 0 / 165873 rows complete
... 10000 / 165873 rows complete
... 20000 / 165873 rows complete
... 30000 / 165873 rows complete
... 40000 / 165873 rows complete
... 50000 / 165873 rows complete
... 60000 / 165873 rows complete
... 70000 / 165873 rows complete
... 80000 / 165873 rows complete
... 90000 / 165873 rows complete
... 100000 / 165873 rows complete
... 110000 / 165873 rows complete
... 120000 / 165873 rows complete
... 130000 / 165873 rows complete
... 140000 / 165873 rows complete
... 150000 / 165873 rows c

In [203]:
# config = {'max_depth':10,
#           'eta':0.05,
#           'n_estimators':20,
#           'subsample':1,
#           'num_parallel_tree':1}

config = {'max_depth':3,
          'eta':0.1,
          'n_estimators':50,
          'subsample':1,
          'num_parallel_tree':1}
          
df_train = df[df['season'] != '2022-23'].reset_index(drop=True)
df_test = df[df['season'] == '2022-23'].reset_index(drop=True)
        
# Fit model
model = Classifier(features, classifier='XGBoost', config=config)
model.fit(df_train[features], df_train[target], weights=(df_train['selected_weight']**0.5))#, weights = df_train['selected_weight']) #
df_test['preds'] = model.apply(df_test[features]) * 1.175 # To account for underestimation for top 75 players

# Best is probably n_estimators = 20, max_depth=10 eta=0.05, normal features, history_size = 10 (this is fine cause will learn prompted value)


In [239]:
df_test[df_test['name_cleaned'] == 'kevin de bruyne'][['season','GW','name_cleaned','opponent_name','was_home','total_points','preds']]


df_test[(df_test['season'] == '2022-23')&(df_test['GW'] == 35.0)][['season','GW','name_cleaned','opponent_name','was_home','total_points','preds']].sort_values(by='preds',ascending=False).iloc[0:20]



Unnamed: 0,season,GW,name_cleaned,opponent_name,was_home,total_points,preds
7096,2022-23,35.0,erling haaland,Leeds,True,2,6.524767
8527,2022-23,35.0,harry kane,Crystal Palace,True,8,6.306718
16402,2022-23,35.0,marcus rashford,West Ham,False,2,6.093743
3228,2022-23,35.0,bukayo saka,Newcastle,False,3,5.901248
13196,2022-23,35.0,kaoru mitoma,Everton,True,5,5.777907
17890,2022-23,35.0,mohamed salah,Brentford,True,10,5.755095
16635,2022-23,35.0,martin ødegaard,Newcastle,False,10,5.543327
9418,2022-23,35.0,ivan toney,Liverpool,False,2,5.534726
842,2022-23,35.0,alexander isak,Arsenal,True,2,5.0938
13721,2022-23,35.0,kieran trippier,Arsenal,True,1,5.027551


### Create fake output dataset

#### GW 36 Missing from data!!!
#### Will have to train model only up to GW35 for best results

In [207]:
test = df_test[(df_test['season'] == '2022-23')&(df_test['GW'] >= 34.0)][['season','GW','name_cleaned','opponent_name','was_home','total_points','preds']]


In [255]:

fake_output = pd.DataFrame()

gw = 34
for player in list(set(test['name_cleaned'])):
    preds = []
    for n in range(5):
        m_rows = (test['name_cleaned'] == player) & (test['GW'] == gw+n)
        try:
            val = np.mean(test[m_rows]['preds'])
            if pd.isnull(val):
                preds.append(0.0)
            else:
                preds.append(val)
        except:
            print('LOLOL')
            preds.append(0.0)
#     print(player)
    row = pd.DataFrame({'name':[player],
                        'gw_'+str(gw+0):[preds[0]],
                        'gw_'+str(gw+1):[preds[1]],
                        'gw_'+str(gw+2):[preds[2]],
                        'gw_'+str(gw+3):[preds[3]],
                        'gw_'+str(gw+4):[preds[4]],
                        'next_5':np.sum(preds)})
    fake_output = pd.concat([fake_output,row]).reset_index(drop=True)
        
    


In [261]:
fake_output = fake_output.sort_values(by='next_5',ascending=False).reset_index(drop=True)
# fake_output.to_csv('/Users/dominicbates/Documents/Github/fpl-model/fake_output.csv')
fake_output

Unnamed: 0,name,gw_34,gw_35,gw_36,gw_37,gw_38,next_5
0,erling haaland,6.167412,6.524767,0.0,5.941116,6.019530,24.652826
1,mohamed salah,6.357909,5.755095,0.0,6.378763,5.588407,24.080175
2,harry kane,5.710858,6.306718,0.0,6.408712,5.540221,23.966509
3,marcus rashford,6.441618,6.093743,0.0,5.172080,6.022097,23.729538
4,bukayo saka,6.323845,5.901248,0.0,5.304022,5.178668,22.707784
...,...,...,...,...,...,...,...
772,leo fuhr hjelde,0.149921,0.149921,0.0,0.168162,0.250511,0.718515
773,ethan ampadu,0.149921,0.168162,0.0,0.149921,0.250511,0.718515
774,juan larios lópez,0.149921,0.149921,0.0,0.168162,0.250511,0.718515
775,robert kenedy nunes do nascimento,0.149921,0.168162,0.0,0.149921,0.250511,0.718515


In [265]:
fake_output.sort_values(by='gw_38',ascending=False).reset_index(drop=True)

Unnamed: 0,name,gw_34,gw_35,gw_36,gw_37,gw_38,next_5
0,ollie watkins,5.865438,4.977207,0.0,5.404459,6.036258,22.283360
1,marcus rashford,6.441618,6.093743,0.0,5.172080,6.022097,23.729538
2,erling haaland,6.167412,6.524767,0.0,5.941116,6.019530,24.652826
3,mohamed salah,6.357909,5.755095,0.0,6.378763,5.588407,24.080175
4,harry kane,5.710858,6.306718,0.0,6.408712,5.540221,23.966509
...,...,...,...,...,...,...,...
772,sepp van den berg,0.250511,0.250511,0.0,0.279205,0.149921,0.930148
773,nathaniel phillips,0.250511,0.250511,0.0,0.279205,0.149921,0.930148
774,luke chambers,0.250511,0.250511,0.0,0.279205,0.149921,0.930148
775,rhys williams,0.250511,0.250511,0.0,0.279205,0.149921,0.930148


### Testing overestimation

In [201]:
top_names = list(df_test.groupby('name_cleaned')['total_points'].sum().sort_values(ascending=False)[0:50].index)
m_select = (df_test['name_cleaned'] == 'lolololol')
for name in top_names:
    m_select = m_select | (df_test['name_cleaned'] == name)
df_test[m_select][['total_points','preds']].mean()

total_points    4.230435
preds           4.102989
dtype: float64

In [237]:
df[(df['season']=='2022-23')&(df['GW'] == 37)]

Unnamed: 0,season,GW,name,position,opponent_name,opponent_team,kickoff_time,was_home,selected,selected_weight,...,f|player_exists|2_to_3,f|player_exists|3_to_4,f|player_exists|4_to_5,f|player_exists|5_to_10,f|player_exists|10_to_20,f|current|is_home,f|current|position|GKP,f|current|position|MID,f|current|position|FWD,f|current|position|DEF
374,2022-23,37.0,Aaron Cresswell,DEF,Leeds,11,2023-05-21T12:30:00Z,True,177114,0.752259,...,1.0,1.0,1.0,1.0,1.0,True,0,0,0,1
411,2022-23,37.0,Aaron Hickey,DEF,Spurs,18,2023-05-20T11:30:00Z,False,36006,0.152929,...,1.0,1.0,1.0,1.0,1.0,False,0,0,0,1
935,2022-23,37.0,Aaron Ramsdale,GKP,Nott'm Forest,16,2023-05-20T16:30:00Z,False,1232576,5.235138,...,1.0,1.0,1.0,1.0,1.0,False,1,0,0,0
1328,2022-23,37.0,Aaron Wan-Bissaka,DEF,Bournemouth,3,2023-05-20T14:00:00Z,False,185904,0.789593,...,1.0,1.0,1.0,1.0,1.0,False,0,0,0,1
1329,2022-23,37.0,Aaron Wan-Bissaka,DEF,Chelsea,6,2023-05-25T19:00:00Z,True,185904,0.789593,...,1.0,1.0,1.0,1.0,1.0,True,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165187,2022-23,37.0,Zack Steffen,GKP,Chelsea,6,2023-05-21T15:00:00Z,True,1241,0.005271,...,1.0,1.0,1.0,1.0,1.0,True,1,0,0,0
165188,2022-23,37.0,Zack Steffen,GKP,Brighton,5,2023-05-24T19:00:00Z,False,1241,0.005271,...,1.0,1.0,1.0,1.0,1.0,False,1,0,0,0
165366,2022-23,37.0,Zidane Iqbal,MID,Bournemouth,3,2023-05-20T14:00:00Z,False,22375,0.095034,...,1.0,1.0,1.0,1.0,1.0,False,0,1,0,0
165367,2022-23,37.0,Zidane Iqbal,MID,Chelsea,6,2023-05-25T19:00:00Z,True,22375,0.095034,...,1.0,1.0,1.0,1.0,1.0,True,0,1,0,0


In [202]:
1.15806*4.230/4.10298

1.1939112060014918