In [86]:
import nbformat
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
import time

FUNCTIONS

In [87]:
def kfold_cross_validation(model, splits, X, y):
    mae = []
    mse = []
    r2 = []
    rms = []

    kf = KFold(5, shuffle=True, random_state=42)
    for train_ind, test_ind in kf.split(X):

        X_train, Y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, Y_test = X.iloc[test_ind], y.iloc[test_ind]

        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model.fit(X_train_scaled, Y_train)

        y_pred = model.predict(X_test_scaled)

            # Calculate metrics
        mae_r = mean_absolute_error(Y_test, y_pred)
        mse_r = mean_squared_error(Y_test, y_pred)
        rmse_r = np.sqrt(mse)
        r2_r = r2_score(Y_test, y_pred)

        # Output metrics
        mae.append(mae_r)
        mse.append(mse_r)
        rms.append(rmse_r)
        r2.append(r2_r)

    print("MAE = ", mae)
    print("MSE = ", mse)
    print("rms = ", rms)
    print("r2 = ", r2)


DATA DOWNLOAD

In [88]:
%run data_download_from_fpl.ipynb

{ 'chips': [...],
  'element_stats': [...],
  'element_types': [...],
  'elements': [...],
  'events': [...],
  'game_config': {...},
  'game_settings': {...},
  'phases': [...],
  'teams': [...],
  'total_players': 11145556}
{'fixtures': [...], 'history': [...], 'history_past': [...]}


 47%|████▋     | 338/718 [00:15<00:17, 21.14it/s]


KeyboardInterrupt: 

KeyboardInterrupt: 

TRAINING DATA PREPROCESSING

In [63]:
past_data_2 = past_data

In [64]:
 # Convert all object columns to float
for col in past_data_2.select_dtypes(include=['object']).columns:
    past_data_2[col] = pd.to_numeric(past_data_2[col], errors='coerce')  

ML MODEL PREP

In [65]:
X = past_data_2.drop(columns = ['total_points', 'event'])
y = past_data_2[['total_points']]

In [66]:
from sklearn.linear_model import ElasticNet
model = ElasticNet(alpha=0.1, l1_ratio=0.7)

In [None]:
kfold_cross_validation(model, 5, X, y)

MAE =  [0.41720584567887925, 0.41582587241098884, 0.40592709119828185, 0.38692245595308505, 0.4086459909825426]
MSE =  [0.5996421029413346, 0.62765546445985, 0.5606538766343675, 0.5310640458008318, 0.577417677012279]
rms =  [array([], dtype=float64), array([0.77436561]), array([0.77436561, 0.7922471 ]), array([0.77436561, 0.7922471 , 0.74876824]), array([0.77436561, 0.7922471 , 0.74876824, 0.72874141])]
r2 =  [0.8887967122327464, 0.8840846334258369, 0.9023132757176123, 0.9061510382920491, 0.8993494203960708]


TRAIN ML MODEl

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [71]:
model = model.fit(X_train_scaled, y_train)

FUTURE DATA PREPROCESSING

In [72]:
future_data_2 = future_data

In [73]:
# Convert all object columns to float
for col in future_data_2.select_dtypes(include=['object']).columns:
    future_data_2[col] = pd.to_numeric(future_data_2[col], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2[col] = pd.to_numeric(future_data_2[col], errors='coerce')


In [74]:
future_data_2['event'] = future_data_2['event'].fillna(0)
future_data_2['event'] = future_data_2['event'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['event'] = future_data_2['event'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['event'] = future_data_2['event'].astype('int64')


In [75]:
future_data_2['is_home'] = future_data_2['is_home'].astype('bool')

future_data_2['id_player'] = future_data_2['id_player'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['is_home'] = future_data_2['is_home'].astype('bool')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['id_player'] = future_data_2['id_player'].astype('int64')


In [76]:
data_to_predict_on = future_data_2.drop(columns = ['total_points','event'])

In [77]:
scaler = RobustScaler()
data_to_predict_on_scaled = scaler.fit_transform(data_to_predict_on)


MODEL PREDICTIONS

In [79]:
y_future = model.predict(data_to_predict_on_scaled)

In [80]:
future_points = pd.DataFrame(y_future, columns=(['predicted_points']))

In [81]:
predicted_data = future_data_2.merge(future_points, left_index=True, right_index=True)

In [82]:
# base url for all FPL API endpoints
base_url = 'https://fantasy.premierleague.com/api/'

# get data from bootstrap-static endpoint
r = requests.get(base_url+'bootstrap-static/').json()

# show the top level fields
pprint(r, indent=2, depth=1, compact=True)

{ 'chips': [...],
  'element_stats': [...],
  'element_types': [...],
  'elements': [...],
  'events': [...],
  'game_config': {...},
  'game_settings': {...},
  'phases': [...],
  'teams': [...],
  'total_players': 11145531}


In [83]:
pd.set_option('display.max_columns', None)
# create players dataframe
players = pd.json_normalize(r['elements'])

In [84]:
predicted_data = predicted_data.merge(players[['id', 'first_name', 'second_name']], left_on= 'id_player', right_on = 'id' )

In [85]:
predicted_data = predicted_data[['id_player', 'first_name', 'second_name', 'opponent_team', 'is_home', 'cumulative_points','bps', 'total_points', 'event', 'predicted_points']]