In [171]:
import nbformat
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
import time
import lightgbm
import optuna
from sklearn.model_selection import cross_val_score

FUNCTIONS

In [172]:
def kfold_cross_validation(model, splits, X, y):
    mae = []
    mse = []
    r2 = []
    rms = []

    kf = KFold(5, shuffle=True, random_state=42)
    for train_ind, test_ind in kf.split(X):

        X_train, Y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, Y_test = X.iloc[test_ind], y.iloc[test_ind]

        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model.fit(X_train_scaled, Y_train)

        y_pred = model.predict(X_test_scaled)

            # Calculate metrics
        mae_r = mean_absolute_error(Y_test, y_pred)
        mse_r = mean_squared_error(Y_test, y_pred)
        rmse_r = np.sqrt(mse)
        r2_r = r2_score(Y_test, y_pred)

        # Output metrics
        mae.append(mae_r)
        mse.append(mse_r)
        rms.append(rmse_r)
        r2.append(r2_r)

    print("MAE = ", mae)
    print("MSE = ", mse)
    print("rms = ", rms)
    print("r2 = ", r2)


DATA DOWNLOAD

In [173]:
%run data_download_from_fpl.ipynb

{ 'chips': [...],
  'element_stats': [...],
  'element_types': [...],
  'elements': [...],
  'events': [...],
  'game_config': {...},
  'game_settings': {...},
  'phases': [...],
  'teams': [...],
  'total_players': 11154687}
{'fixtures': [...], 'history': [...], 'history_past': [...]}


100%|██████████| 718/718 [00:48<00:00, 14.79it/s]


       id_player      web_name  element  fixture  opponent_team  total_points  \
0              1  Fábio Vieira        1        2             20             0   
11             1  Fábio Vieira        1      111             16             0   
3              1  Fábio Vieira        1       39             18             0   
5              1  Fábio Vieira        1       51             11             0   
17             1  Fábio Vieira        1      171             10             0   
...          ...           ...      ...      ...            ...           ...   
7488         708  Wilson-Brown      708      191              2             0   
8077         709         Danns      709      190             19             0   
8078         709         Danns      709      196             14             0   
1376         710         Jimoh      710      191             11             0   
13453        711       Okoduwa      711      200             16             0   

       was_home          ki

TRAINING DATA PREPROCESSING

In [174]:
past_data_2 = past_data

In [175]:
 # Convert all object columns to float
for col in past_data_2.select_dtypes(include=['object']).columns:
    past_data_2[col] = pd.to_numeric(past_data_2[col], errors='coerce')  

ML MODEL PREP

In [176]:
X = past_data_2.drop(columns = ['total_points', 'event'])
y = past_data_2[['total_points']]

In [177]:
from sklearn.linear_model import ElasticNet
model = lightgbm.LGBMRegressor(alpha=0.1, l1_ratio=0.7)

In [178]:
kfold_cross_validation(model, 5, X, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 10763, number of used features: 12
[LightGBM] [Info] Start training from score 1.210629


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 10763, number of used features: 12
[LightGBM] [Info] Start training from score 1.233392
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 10763, number of used features: 12
[LightGBM] [Info] Start training from score 1.188237
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1066
[LightGBM] [Info] Number of data points in the train set: 10764, number of used features: 12
[LightGBM] [Info] Start training from score 1.206522
MAE =  [0.20885688183227638, 0.19962158829197685, 0.22071634075895186, 0.21135924887363572, 0.21035797246637636]
MSE =  [0.2512492251480871, 0.20692805865314798, 0.2607190289634553, 0.22046580062514967, 0.21600149185513917]
rms =  [array([], dtype=float64), array([0.50124767]), array([0.50124767, 0.45489346]), array([0.50124767, 0.45489346, 0.51060653]), array([0.50124767, 0.45489346, 0.51060653, 0.46953786])]
r2 =  [0.9550037776755026, 0.9578598225200124, 0.9589986921964901, 0.9610855184068962, 0.9600713715874369]


TRAIN ML MODEl

In [179]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [180]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [181]:
X_trial = X_train_scaled
y_trial = y_train

# Define the objective function and suggest hyperparameters values
def objective(trial):

    svc_num_leaves = trial.suggest_int('num_leaves', 2, 31)
    svc_learning_rate = trial.suggest_loguniform('learning_rate', 0.1, 1)
    svc_n_estimators = trial.suggest_int('n_estimators', 1, 100)
    svc_lambda_l1 = trial.suggest_loguniform('lambda_l1', 0.1, 1) 
    clf = lightgbm.LGBMRegressor (num_leaves = svc_num_leaves, learning_rate =svc_learning_rate, n_estimators = svc_n_estimators, lambda_l1 = svc_lambda_l1)
    return cross_val_score(clf, X_trial, y_trial, n_jobs=-1, cv=3).mean()

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print the best set of hyperparameters
print('Best hyperparameters: ', study.best_params)
# Print the corresponding performance
print('Best performance: ', study.best_value)

[I 2025-01-13 23:19:43,140] A new study created in memory with name: no-name-9e121fe2-3ff6-4f49-ac38-389ad89474f2
  svc_learning_rate = trial.suggest_loguniform('learning_rate', 0.1, 1)
  svc_lambda_l1 = trial.suggest_loguniform('lambda_l1', 0.1, 1)
[I 2025-01-13 23:19:45,557] Trial 0 finished with value: 0.9525795831139376 and parameters: {'num_leaves': 29, 'learning_rate': 0.21466723459783427, 'n_estimators': 84, 'lambda_l1': 0.4366729921483321}. Best is trial 0 with value: 0.9525795831139376.
[I 2025-01-13 23:19:47,106] Trial 1 finished with value: 0.9463295044814096 and parameters: {'num_leaves': 27, 'learning_rate': 0.46417096715830475, 'n_estimators': 97, 'lambda_l1': 0.14513982187244884}. Best is trial 0 with value: 0.9525795831139376.
[I 2025-01-13 23:19:48,701] Trial 2 finished with value: 0.9528152716625812 and parameters: {'num_leaves': 31, 'learning_rate': 0.18511068193990834, 'n_estimators': 80, 'lambda_l1': 0.37453715134231497}. Best is trial 2 with value: 0.9528152716625

Best hyperparameters:  {'num_leaves': 17, 'learning_rate': 0.11562996573396564, 'n_estimators': 82, 'lambda_l1': 0.22607386545691477}
Best performance:  0.9552771751521227


In [182]:
best_params = study.best_params

In [183]:
model = lightgbm.LGBMRegressor(num_leaves = best_params['num_leaves'], learning_rate=best_params['learning_rate'], n_estimators = best_params['n_estimators'], lambda_l1 = best_params['lambda_l1'])

model = model.fit(X_train_scaled, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 10763, number of used features: 12
[LightGBM] [Info] Start training from score 1.210629


  y = column_or_1d(y, warn=True)


FUTURE DATA PREPROCESSING

In [184]:
future_data_2 = future_data

In [185]:
# Convert all object columns to float
for col in future_data_2.select_dtypes(include=['object']).columns:
    future_data_2[col] = pd.to_numeric(future_data_2[col], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2[col] = pd.to_numeric(future_data_2[col], errors='coerce')


In [186]:
future_data_2['event'] = future_data_2['event'].fillna(0)
future_data_2['event'] = future_data_2['event'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['event'] = future_data_2['event'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['event'] = future_data_2['event'].astype('int64')


In [187]:
future_data_2['is_home'] = future_data_2['is_home'].astype('bool')

future_data_2['id_player'] = future_data_2['id_player'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['is_home'] = future_data_2['is_home'].astype('bool')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['id_player'] = future_data_2['id_player'].astype('int64')


In [188]:
data_to_predict_on = future_data_2.drop(columns = ['total_points','event'])

In [189]:
scaler = RobustScaler()
data_to_predict_on_scaled = scaler.fit_transform(data_to_predict_on)


MODEL PREDICTIONS

In [190]:
y_future = model.predict(data_to_predict_on_scaled)



In [191]:
future_points = pd.DataFrame(y_future, columns=(['predicted_points']))

In [192]:
predicted_data = future_data_2.merge(future_points, left_index=True, right_index=True)

In [193]:
# base url for all FPL API endpoints
base_url = 'https://fantasy.premierleague.com/api/'

# get data from bootstrap-static endpoint
r = requests.get(base_url+'bootstrap-static/').json()

# show the top level fields
pprint(r, indent=2, depth=1, compact=True)

{ 'chips': [...],
  'element_stats': [...],
  'element_types': [...],
  'elements': [...],
  'events': [...],
  'game_config': {...},
  'game_settings': {...},
  'phases': [...],
  'teams': [...],
  'total_players': 11154704}


In [194]:
pd.set_option('display.max_columns', None)
# create players dataframe
players = pd.json_normalize(r['elements'])

In [195]:
predicted_data = predicted_data.merge(players[['id', 'first_name', 'second_name']], left_on= 'id_player', right_on = 'id' )

In [196]:
predicted_data = predicted_data[['id_player', 'first_name', 'second_name', 'opponent_team', 'is_home', 'cumulative_points','bps', 'total_points', 'event', 'predicted_points']]

In [197]:
import sys
print(sys.executable)

c:\Users\fitzm\anaconda3\envs\fantasy\python.exe


In [198]:
# Get feature coefficients
coefficients = model.coef_

# Get feature names (if available, e.g., if X_train is a DataFrame)
try:
    feature_names = X_train.columns
except AttributeError:
    feature_names = [f"Feature {i}" for i in range(X_train.shape[1])]

# Combine feature names with coefficients
feature_importance = list(zip(feature_names, coefficients))

AttributeError: 'LGBMRegressor' object has no attribute 'coef_'