In [1]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split



In [2]:
# Endpoint of the Fantasy Premier League "bootstrap-static" API; the next cell
# downloads it and parses the response as JSON.
url = "https://fantasy.premierleague.com/api/bootstrap-static/"

In [3]:
# Download the bootstrap payload. A timeout keeps the cell from hanging forever
# if the API is unreachable (requests has no timeout by default).
request = requests.get(url, timeout=10)
# Fail here, with a clear HTTP error, instead of later on a confusing
# JSON-decoding or missing-key error.
request.raise_for_status()
# NOTE: `json` shadows the standard-library module name; it is kept because
# later cells read json['elements'].
json = request.json()
# Show the top-level sections available in the payload.
json.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [4]:
# Print the response object; its repr includes the HTTP status code.
print(request)

<Response [200]>


In [5]:
# Build the master table of all EPL players: one row per entry of the
# 'elements' section of the bootstrap payload. (Nothing is written to disk here.)
master_player_df = pd.DataFrame(json['elements'])

# Lift the row limit so the full listing prints without "..." truncation.
# This is a global pandas display option and stays in effect for later cells.
pd.set_option("display.max_rows", None)

player_column_dtypes = master_player_df.dtypes
print(player_column_dtypes)



chance_of_playing_next_round            float64
chance_of_playing_this_round            float64
code                                      int64
cost_change_event                         int64
cost_change_event_fall                    int64
cost_change_start                         int64
cost_change_start_fall                    int64
dreamteam_count                           int64
element_type                              int64
ep_next                                  object
ep_this                                  object
event_points                              int64
first_name                               object
form                                     object
id                                        int64
in_dreamteam                               bool
news                                     object
news_added                               object
now_cost                                  int64
photo                                    object
points_per_game                         

In [83]:
# element_type is the position ID:
#   1 = Goalkeeper, 2 = Defender, 3 = Midfielder, 4 = Forward

# Keep just the identifying columns (player id, names, team id, position id).
# NOTE: an optional CSV export of this table used to sit here, commented out;
# if it is needed again, write to a project-relative path.
player_name_columns = ["id", "first_name", "second_name", "team", "element_type"]
master_player_df_names = master_player_df[player_name_columns]
master_player_df_names.head()

Unnamed: 0,id,first_name,second_name,team,element_type
0,1,Fábio,Ferreira Vieira,1,3
1,2,Gabriel,Fernando de Jesus,1,4
2,3,Gabriel,dos Santos Magalhães,1,2
3,4,Kai,Havertz,1,4
4,5,Karl,Hein,1,1


In [10]:
# Keep goalkeepers only.
# element_type is the position ID:
#   1 = Goalkeeper, 2 = Defender, 3 = Midfielder, 4 = Forward
is_goalkeeper = master_player_df_names["element_type"].eq(1)
epl_goalies = master_player_df_names.loc[is_goalkeeper]

# reset_index() moves the old row labels into a new column, so the width
# reported here is one more than the number of selected columns.
epl_goalies.reset_index().shape

(67, 6)

In [138]:
# Preview the first rows of the goalkeeper subset (row labels are those of the
# master player table, since the filtered frame was not re-indexed).
epl_goalies.head()

Unnamed: 0,id,first_name,second_name,team,element_type
4,5,Karl,Hein,1,1
11,15,David,Raya Martin,1,1
21,73,Norberto,Murara Neto,1,1
37,39,Joe,Gauci,2,1
44,46,Filip,Marschall,2,1


In [39]:
#Create a function to ID historical stats for individual players

def get_past_data(element_id, session=None, timeout=10):
    """Fetch one player's past-season history from the FPL element-summary API.

    Parameters
    ----------
    element_id : int or str
        Player ID, interpolated into the element-summary URL.
    session : object, optional
        Anything exposing ``get(url, timeout=...)`` that returns a response
        with ``raise_for_status()`` and ``json()`` (e.g. a
        ``requests.Session``). Defaults to the module-level ``requests`` API.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the payload's ``'history_past'`` list (empty
        when that list is empty).

    Raises
    ------
    KeyError
        If the payload has no ``'history_past'`` key.
    Exception
        Whatever ``raise_for_status()`` raises for an unsuccessful HTTP
        response (``requests.HTTPError`` with the default client).
    """
    url_past = f'https://fantasy.premierleague.com/api/element-summary/{element_id}/'

    # Only touch the global `requests` module when no client was injected.
    http_client = requests if session is None else session
    response = http_client.get(url_past, timeout=timeout)
    # Surface HTTP failures (unknown player ID, rate limiting, ...) directly
    # rather than as a misleading KeyError/JSON error further down.
    response.raise_for_status()

    past_data = response.json()
    history_past_df = pd.DataFrame(past_data['history_past'])

    return history_past_df

# See all columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Fetch a single player's past-season history, looked up by player ID.
element_id = 2
history_past_df = get_past_data(element_id)

history_past_df


Unnamed: 0,season_name,element_code,start_cost,end_cost,total_points,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,red_cards,saves,bonus,bps,influence,creativity,threat,ict_index,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded
0,2016/17,205651,90,91,67,651,7,4,3,5,0,0,0,2,0,0,12,258,335.8,165.9,436.0,92.9,0,0.0,0.0,0.0,0.0
1,2017/18,205651,105,104,126,1660,13,7,9,12,0,0,2,6,0,0,18,473,603.6,360.3,1089.0,204.9,0,0.0,0.0,0.0,0.0
2,2018/19,205651,105,99,79,1017,7,3,2,11,0,0,0,1,0,0,7,259,395.2,300.5,864.0,156.3,0,0.0,0.0,0.0,0.0
3,2019/20,205651,95,99,146,2018,14,8,12,14,0,0,1,3,0,0,17,535,757.6,447.7,1620.0,282.1,0,0.0,0.0,0.0,0.0
4,2020/21,205651,95,91,115,2056,9,5,11,17,0,0,0,2,0,0,16,453,501.0,467.5,931.0,190.0,0,0.0,0.0,0.0,0.0
5,2021/22,205651,85,86,120,1871,8,8,11,15,0,0,0,1,0,0,16,488,567.4,530.9,1126.0,222.3,0,0.0,0.0,0.0,0.0
6,2022/23,205651,80,81,125,2064,11,7,9,23,0,0,0,6,0,0,17,401,660.6,427.8,1331.0,241.9,24,14.24,2.91,17.15,23.15
7,2023/24,205651,80,77,85,1470,4,7,11,11,0,0,0,6,0,0,10,271,364.8,403.0,756.0,152.7,17,6.27,3.26,9.53,11.67


In [42]:
# Cast the metric columns below to float (the original note describes them as
# arriving with object dtype). Columns are converted one at a time, in place
# on history_past_df.
float_columns = [
    "influence",
    "creativity",
    "threat",
    "expected_goals",
    "expected_assists",
    "expected_goal_involvements",
    "expected_goals_conceded",
]

for column_name in float_columns:
    history_past_df[column_name] = history_past_df[column_name].astype(float)


In [12]:
# List every column available in the past-season history frame.
history_past_df.columns

Index(['season_name', 'element_code', 'start_cost', 'end_cost', 'total_points',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'starts', 'expected_goals', 'expected_assists',
       'expected_goal_involvements', 'expected_goals_conceded'],
      dtype='object')

In [13]:
# Narrow the season history down to the goalie-relevant features, plus the
# season label and the end-of-season cost.
goalie_feature_columns = [
    'season_name',
    'total_points',
    'minutes',
    'clean_sheets',
    'goals_conceded',
    'expected_goals_conceded',
    'own_goals',
    'penalties_saved',
    'saves',
    'starts',
    'end_cost',
]
past_history_df = history_past_df[goalie_feature_columns]

past_history_df

Unnamed: 0,season_name,total_points,minutes,clean_sheets,goals_conceded,expected_goals_conceded,own_goals,penalties_saved,saves,starts,end_cost
0,2016/17,67,651,3,5,0.0,0,0,0,0,91
1,2017/18,126,1660,9,12,0.0,0,0,0,0,104
2,2018/19,79,1017,2,11,0.0,0,0,0,0,99
3,2019/20,146,2018,12,14,0.0,0,0,0,0,99
4,2020/21,115,2056,11,17,0.0,0,0,0,0,91
5,2021/22,120,1871,11,15,0.0,0,0,0,0,86
6,2022/23,125,2064,9,23,23.15,0,0,0,24,81
7,2023/24,85,1470,11,11,11.67,0,0,0,17,77


In [139]:
import os

# Import David's master list of goalie historical data and keep only the
# goalie-relevant stats noted above.
#
# The CSV location defaults to the original author's local path. Set the
# GOALKEEPERS_CSV environment variable to point at the file when running this
# notebook on another machine, instead of editing the cell.
goalkeepers_csv_path = os.environ.get(
    "GOALKEEPERS_CSV",
    "C:/Users/Daniel Quinn/Desktop/Bootcamp/Project_2/data/processed/from_david_goalkeepers_data.csv",
)

master_goalies_list = pd.read_csv(goalkeepers_csv_path)

# Selecting by name raises KeyError if any expected column is absent.
master_goalies_list = master_goalies_list[['season_name', 'total_points',
       'minutes', 'clean_sheets', 'goals_conceded', 'expected_goals_conceded',
       'own_goals', 'penalties_saved', 'saves', 'starts', 'end_cost']]



In [140]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Goal: regress a player's "end_cost" on the remaining numeric features.

# Features are every column except the season label and the target itself.
non_feature_columns = ['season_name', 'end_cost']
X = master_goalies_list.drop(columns=non_feature_columns)
y = master_goalies_list['end_cost']

# Hold out a test split (default split size); the seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Ordinary least-squares model, fitted on the training split only.
model = LinearRegression()

model.fit(X_train, y_train)


In [141]:
# Predict end_cost for the held-out rows with the fitted linear model.
prediction1 = model.predict(X_test)

# Score the predictions: mean squared error and coefficient of determination.
mse = mean_squared_error(y_true=y_test, y_pred=prediction1)
r2 = r2_score(y_true=y_test, y_pred=prediction1)

# Emit the three report lines in a single write; stdout is unchanged.
report_lines = [
    f"All Features (no end_cost):",
    f"mean squared error (MSE): {mse}",
    f"R-squared (R2): {r2}",
]
print("\n".join(report_lines))


All Features (no end_cost):
mean squared error (MSE): 10.752472714818744
R-squared (R2): 0.6343594684964453


In [142]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#create X & y variables
X = master_goalies_list.drop(columns=['season_name', 'end_cost'])
y = master_goalies_list['end_cost']

#test-training split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#create & train model
random_forest = RandomForestRegressor(n_estimators=500, random_state=42).fit(X_train, y_train)

# Evaluate the model
print(f"Training Score: {random_forest.score(X_train, y_train)}")
print(f"Testing Score: {random_forest.score(X_test, y_test)}")



Training Score: 0.8903890257772723
Testing Score: 0.5960769753492661


In [144]:
# Feature importance: one score per feature, taken from the fitted forest.
feature_importances = random_forest.feature_importances_

# Label each score with its feature name; the single value column is named 0.
feature_importances_df = pd.DataFrame(data=feature_importances, index=X.columns)

# Print the features from most to least important.
ranked_importances = feature_importances_df.sort_values(by=0, ascending=False)
print(ranked_importances)


                                0
total_points             0.624224
clean_sheets             0.107853
goals_conceded           0.081235
minutes                  0.076072
saves                    0.075089
expected_goals_conceded  0.013365
starts                   0.011853
penalties_saved          0.007270
own_goals                0.003038
