# Here, I am creating a new column in the goalie_hist_model_ready DF that creates a Goalie Score metric for analysis. 

In [11]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler


In [28]:
#Import goalie-specific, model ready csv

goalie_hist_stats = pd.read_csv("C:/Users/Daniel Quinn/Desktop/Bootcamp/Project_2/data/processed/end_cost_predictions_df_master.csv")
#Ensure we can read the whole dataframe, without "..."
pd.set_option("display.max_rows", None)
goalie_hist_stats = goalie_hist_stats.drop(columns = ['Saves_Percentage', 'Unnamed: 0'])
goalie_hist_stats.columns


Index(['id', 'first_name', 'second_name', 'team', 'element_type', 'code',
       'element_code', 'season', 'total_points', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'saves', 'bonus', 'bps', 'influence', 'creativity', 'threat',
       'ict_index', 'starts', 'expected_goals', 'expected_assists',
       'expected_goal_involvements', 'expected_goals_conceded', 'start_cost',
       'end_cost', 'Predicted_End_Cost'],
      dtype='object')

In [29]:
#Create Saves Percentage stat:

goalie_hist_stats["Saves_Percentage"] = (
    (goalie_hist_stats['saves'] + goalie_hist_stats['penalties_saved']) /
    (goalie_hist_stats['saves'] + goalie_hist_stats['penalties_saved'] + goalie_hist_stats['goals_conceded'])
) * 100
#pd.set_option("display.max_columns", None)
goalie_hist_stats.shape

(181, 35)

In [24]:
#Clean the dataframe of any irrelevant goalies - ie, those with not enough data to generate a saves percentage

goalie_hist_stats_cleaned = goalie_hist_stats.dropna()
goalie_hist_stats_cleaned.shape

(181, 35)

In [26]:
# Prep dataframe for modeling
#Get only the goalies with meaningful stats


# goalie_hist_model_ready_cleaned = goalie_hist_model_ready.drop(columns = ['season','first_name', 'second_name', 'team',  ])
# # goalie_hist_model_ready.columns
# goalie_hist_model_1_ready = goalie_hist_model_ready_cleaned[['id', 'element_type', 'code',
#        'element_code', 'start_cost', 'end_cost', 'total_points', 'minutes',
#        'assists', 'clean_sheets', 'goals_conceded', 'own_goals',
#        'penalties_saved', 'saves', 'bonus', 'bps', 'ict_index', 'starts',
#        'expected_goals', 'expected_assists', 'expected_goal_involvements',
#        'expected_goals_conceded', 'Saves_Percentage']]


In [27]:
#Create a better dataframe name for going forward

goalie_hist_model_2_ready = goalie_hist_stats_cleaned
goalie_hist_model_2_ready.columns

Index(['id', 'first_name', 'second_name', 'team', 'element_type', 'code',
       'element_code', 'season', 'start_cost', 'end_cost', 'total_points',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'starts', 'expected_goals', 'expected_assists',
       'expected_goal_involvements', 'expected_goals_conceded',
       'Saves_Percentage'],
      dtype='object')

In [36]:
# Check p-value

#Check the p-value to determine the statistical significance of each feature

import statsmodels.api as sm

#create X & y variables
X = goalie_hist_model_2_ready.drop(columns = ['id', 'first_name', 'second_name', 'team', 'element_type', 'code',
       'element_code', 'season', 'Saves_Percentage'])

y = goalie_hist_model_2_ready['Saves_Percentage']

#test-training split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Use the statsmodels package to create and fit a linear regression
lr = sm.OLS(y_train, X_train).fit()
lr.pvalues.sort_values(ascending=False)

own_goals                     0.917378
threat                        0.915752
starts                        0.912047
assists                       0.811326
expected_goals_conceded       0.786864
creativity                    0.777239
ict_index                     0.755579
influence                     0.754513
end_cost                      0.526741
expected_goal_involvements    0.496294
clean_sheets                  0.417995
expected_goals                0.361442
bps                           0.358049
red_cards                     0.347271
penalties_missed              0.336248
expected_assists              0.318242
goals_scored                  0.195739
minutes                       0.144732
bonus                         0.141742
yellow_cards                  0.118835
penalties_saved               0.117664
total_points                  0.068951
start_cost                    0.027218
goals_conceded                0.001890
saves                         0.000048
dtype: float64

In [38]:
#create X & y variables
X = goalie_hist_model_2_ready.drop(columns = ['id', 'first_name', 'second_name', 'team', 'element_type', 'code',
       'element_code', 'season', 'start_cost', 'end_cost', 'total_points',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'yellow_cards',
       'red_cards', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'expected_goals', 'expected_assists',
       'expected_goal_involvements', 'expected_goals_conceded',
       'Saves_Percentage'])

y = goalie_hist_model_2_ready['Saves_Percentage']

#test-training split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#create model
model = LinearRegression()

model.fit(X_train, y_train)
print("Train model score: ", model.score(X_train, y_train))
print("Test model score: ", model.score(X_test, y_test))

Train model score:  0.26246888529115764
Test model score:  0.22829695191496635


In [18]:
#Make predictions

prediction1 = model.predict(X_test)

#Evaluate models with mse and r2

mse = mean_squared_error(y_test, prediction1) # how close are the predicted values to actual values via the squared differences between expected and real
r2 = r2_score(y_test, prediction1) # r2 - how well do the indep variables explain the variation in the dep var? ) 0 is a perfect model, the larger the nuber, the worse the model is performing

print(f"All Features (Saves_Percentage = y):")
print(f"mean squared error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

All Features (Saves_Percentage = y):
mean squared error (MSE): 11.437117549567208
R-squared (R2): 0.6063351432285085


In [19]:
#Run the model to get the predicted Goalie Score

predicted_Saves_Percentage = goalie_hist_model_1_ready.drop(columns = ['season_start_date', 'Goalie_Score', 'first_name', 'second_name'])
Goalie_Saves_Percentage = model.predict(predicted_Goalie_Score)
print(Goalie_Saves_Percentage)

KeyError: "['season_start_date', 'Goalie_Score', 'first_name', 'second_name'] not found in axis"

In [42]:
#create X & y variables
X = goalie_hist_model_1_ready.drop(columns=['season_start_date', 'first_name', 'second_name', 'Goalie_Score'])
y = goalie_hist_model_1_ready['Goalie_Score']

#test-training split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#create & train model
random_forest = RandomForestRegressor(n_estimators=500, random_state=42).fit(X_train, y_train)

# Evaluate the model
print(f"Training Score: {random_forest.score(X_train, y_train)}")
print(f"Testing Score: {random_forest.score(X_test, y_test)}")

Training Score: 0.9996168755316256
Testing Score: 0.9974507584902772


In [43]:
# Feature Importance
feature_importances = random_forest.feature_importances_

feature_importances_df = pd.DataFrame(feature_importances, X.columns)


print(feature_importances_df.sort_values(by=0, ascending=False))

                                   0
bps                         0.981318
ict_index                   0.009356
total_points                0.006314
saves                       0.000846
minutes                     0.000679
id                          0.000238
bonus                       0.000220
code                        0.000218
element_code                0.000153
team                        0.000140
goals_conceded              0.000136
clean_sheets                0.000081
end_cost                    0.000079
starts                      0.000048
start_cost                  0.000044
penalties_saved             0.000040
expected_goals_conceded     0.000039
expected_assists            0.000016
expected_goal_involvements  0.000014
expected_goals              0.000009
own_goals                   0.000007
assists                     0.000005
element_type                0.000000
