In [28]:
import nbformat
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [29]:
def kfold_cross_validation(model, splits, X, y):
    mae = []
    mse = []
    r2 = []
    rms = []

    kf = KFold(5, shuffle=True, random_state=42)
    for train_ind, test_ind in kf.split(X):
        X_train, Y_train = X.iloc[train_ind], y.iloc[train_ind]
        X_test, Y_test = X.iloc[test_ind], y.iloc[test_ind]

        model.fit(X_train, Y_train)

        y_pred = model.predict(X_test)

            # Calculate metrics
        mae_r = mean_absolute_error(Y_test, y_pred)
        mse_r = mean_squared_error(Y_test, y_pred)
        rmse_r = np.sqrt(mse)
        r2_r = r2_score(Y_test, y_pred)

        # Output metrics
        mae.append(mae_r)
        mse.append(mse_r)
        rms.append(rmse_r)
        r2.append(r2_r)

    print("MAE = ", mae)
    print("MSE = ", mse)
    print("rms = ", rms)
    print("r2 = ", r2)


In [30]:
%run data_download_from_fpl.ipynb

{ 'chips': [...],
  'element_stats': [...],
  'element_types': [...],
  'elements': [...],
  'events': [...],
  'game_config': {...},
  'game_settings': {...},
  'phases': [...],
  'teams': [...],
  'total_players': 11122254}
{'fixtures': [...], 'history': [...], 'history_past': [...]}


100%|██████████| 710/710 [00:34<00:00, 20.53it/s]


      id_player      web_name  element  fixture  opponent_team  total_points  \
17            1  Fábio Vieira        1      171             10             0   
9             1  Fábio Vieira        1       96             15             0   
10            1  Fábio Vieira        1      103              6             0   
11            1  Fábio Vieira        1      111             16             0   
3             1  Fábio Vieira        1       39             18             0   
...         ...           ...      ...      ...            ...           ...   
7488        708  Wilson-Brown      708      191              2             0   
7487        708  Wilson-Brown      708      187             13             0   
8077        709         Danns      709      190             19             0   
8078        709         Danns      709      196             14             0   
1376        710         Jimoh      710      191             11             0   

      was_home          kickoff_time  t

In [31]:
future_data_2 = future_data
past_data_2 = past_data

In [32]:
# Convert all object columns to float
for col in future_data_2.select_dtypes(include=['object']).columns:
    future_data_2[col] = pd.to_numeric(future_data_2[col], errors='coerce')

    # Convert all object columns to float
for col in past_data_2.select_dtypes(include=['object']).columns:
    past_data_2[col] = pd.to_numeric(past_data_2[col], errors='coerce')  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2[col] = pd.to_numeric(future_data_2[col], errors='coerce')


In [33]:
future_data_2['event'] = future_data_2['event'].fillna(0)
future_data_2['event'] = future_data_2['event'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['event'] = future_data_2['event'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['event'] = future_data_2['event'].astype('int64')


In [34]:
X = past_data_2.drop(columns = ['total_points', 'event'])
y = past_data_2[['total_points']]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [36]:
model = XGBRegressor()

In [37]:
# Convert all object columns to float
for col in X.select_dtypes(include=['object']).columns:
    X[col] = pd.to_numeric(X[col], errors='coerce') 

In [38]:
kfold_cross_validation(model, 5, X, y)

MAE =  [0.1975773572921753, 0.2117777317762375, 0.21872439980506897, 0.2160012573003769, 0.22141484916210175]
MSE =  [0.23084807395935059, 0.2597533166408539, 0.269618958234787, 0.22480550408363342, 0.273504376411438]
rms =  [array([], dtype=float64), array([0.48046652]), array([0.48046652, 0.50966   ]), array([0.48046652, 0.50966   , 0.51924846]), array([0.48046652, 0.50966   , 0.51924846, 0.47413659])]
r2 =  [0.948478639125824, 0.9533418416976929, 0.9572012424468994, 0.9595949649810791, 0.9535657167434692]


In [39]:
future_data_2['is_home'] = future_data_2['is_home'].astype('bool')

future_data_2['id_player'] = future_data_2['id_player'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['is_home'] = future_data_2['is_home'].astype('bool')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  future_data_2['id_player'] = future_data_2['id_player'].astype('int64')


In [41]:
y_future = model.predict(future_data_2.drop(columns = ['total_points']))

In [42]:
future_points = pd.DataFrame(y_future, columns=(['predicted_points']))

In [43]:
future_data_2 = future_data_2.merge(future_points, left_index=True, right_index=True)
future_data_2.head()

Unnamed: 0,id_player,opponent_team,is_home,cumulative_points,bps,total_points,event,goals_scored,assists,clean_sheets,goals_conceded,influence,creativity,threat,predicted_points
0,1,18,True,0,0,0,21,0,0,0,0,0.0,0.0,0.0,-0.003437
1,1,2,True,0,0,0,22,0,0,0,0,0.0,0.0,0.0,-0.00461
2,1,20,False,0,0,0,23,0,0,0,0,0.0,0.0,0.0,-0.001684
3,1,13,True,0,0,0,24,0,0,0,0,0.0,0.0,0.0,-0.004426
4,1,11,False,0,0,0,25,0,0,0,0,0.0,0.0,0.0,-0.002674


In [44]:
# base url for all FPL API endpoints
base_url = 'https://fantasy.premierleague.com/api/'

# get data from bootstrap-static endpoint
r = requests.get(base_url+'bootstrap-static/').json()

# show the top level fields
pprint(r, indent=2, depth=1, compact=True)

{ 'chips': [...],
  'element_stats': [...],
  'element_types': [...],
  'elements': [...],
  'events': [...],
  'game_config': {...},
  'game_settings': {...},
  'phases': [...],
  'teams': [...],
  'total_players': 11122254}


In [45]:
pd.set_option('display.max_columns', None)
# create players dataframe
players = pd.json_normalize(r['elements'])

In [47]:
future_data_2 = future_data_2.merge(players[['id', 'first_name', 'second_name']], left_on= 'id_player', right_on = 'id' )

In [48]:
future_data_2[['id_player', 'first_name', 'second_name', 'opponent_team', 'is_home', 'cumulative_points','bps', 'total_points', 'event', 'predicted_points']]

Unnamed: 0,id_player,first_name,second_name,opponent_team,is_home,cumulative_points,bps,total_points,event,predicted_points
0,1,Fábio,Ferreira Vieira,18,True,0,0,0,21,-0.003437
1,1,Fábio,Ferreira Vieira,2,True,0,0,0,22,-0.004610
2,1,Fábio,Ferreira Vieira,20,False,0,0,0,23,-0.001684
3,1,Fábio,Ferreira Vieira,13,True,0,0,0,24,-0.004426
4,1,Fábio,Ferreira Vieira,11,False,0,0,0,25,-0.002674
...,...,...,...,...,...,...,...,...,...,...
12895,709,Jayden,Danns,6,False,0,0,0,35,0.000526
12896,709,Jayden,Danns,1,True,0,0,0,36,0.001384
12897,709,Jayden,Danns,5,False,0,0,0,37,0.000526
12898,709,Jayden,Danns,7,True,0,0,0,38,-0.001226


something happening with the event column