# Predicting Player Peaks

In this exercise, we'll predict player peak attributes with multi-output regression using a random forest. Then, to create prediction intervals for high and low level outcomes, we'll use GradientBoostingRegressor with quantile loss. This will allow us to estimate ranges of outcomes for players skills in scouting.

In [17]:
import pandas as pd
import numpy as np

import os
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

current = os.path.dirname(os.path.realpath("predicting_peaks"))
parent = os.path.dirname(current)

ncaa_to_peak = pd.read_csv(parent + "/player_data/ncaa_to_peak.csv")
pd.set_option("display.max_columns", 250)

In [18]:
ncaa_to_peak = ncaa_to_peak[ncaa_to_peak["peak_FG%"].notna()].reset_index(drop=True)

In [19]:
y = ncaa_to_peak.loc[:,["peak_AST_2_TOV", "peak_AST_2_FG", "peak_AST%", "peak_TOV%", "peak_USG%",
                 "peak_PPP", "peak_2P_prop", "peak_0-3_prop", "peak_3-10_prop", "peak_10-16_prop", 
                 "peak_16-3P_prop", "peak_3P_prop", "peak_corner_%3PA", "peak_2P_%astd",
                 "peak_3P_%astd", "peak_0-3_fg%", "peak_3-10_fg%", "peak_10-16_fg%", 
                 "peak_16-3P_fg%", "peak_3P_fg%", "peak_corner_3P%", "peak_STL%", 
                 "peak_BLK%", "peak_ORB%", "peak_DRB%"
                ]]

y["peak_2P_%astd"] = y["peak_2P_prop"] * y["peak_2P_%astd"]
y["peak_3P_%astd"] = y["peak_3P_prop"] * y["peak_3P_%astd"]

y["peak_corner_%3PA"] = y["peak_3P_prop"] * y["peak_corner_%3PA"]
y["peak_3P_prop_not_corner"] = y["peak_3P_prop"] - y["peak_corner_%3PA"]

y = y.drop(["peak_2P_prop", "peak_3P_prop"], axis=1)

In [20]:
ncaa_to_peak.head()

Unnamed: 0,prospect_age,ncaa_player_name,ncaa_team,ncaa_conf,ncaa_gp,ncaa_minutes_%,ncaa_ortg,ncaa_usg,ncaa_efg%,ncaa_ts%,ncaa_oreb%,ncaa_dreb%,ncaa_ast%,ncaa_to%,ncaa_ftm,ncaa_fta,ncaa_ft%,ncaa_2pm,ncaa_2pa,ncaa_2p%,ncaa_3pm,ncaa_3pa,ncaa_3p%,ncaa_blk%,ncaa_stl%,ncaa_ftr,ncaa_yr,ncaa_ht,ncaa_porpag,ncaa_adjoe,ncaa_foul_rate,ncaa_year,ncaa_recruit_rank,ncaa_ast_tov_ratio,ncaa_rim_makes,ncaa_rim_att,ncaa_mid_range_makes,ncaa_mid_range_att,ncaa_rim_fg%,ncaa_mid_fg%,ncaa_dunks_made,ncaa_dunks_att,ncaa_dunk_fg%,ncaa_draft_pick,ncaa_drtg,ncaa_adrtg,ncaa_dporpag,ncaa_stops,ncaa_bpm,ncaa_obpm,ncaa_dbpm,ncaa_gbpm,ncaa_min_per_game,ncaa_ogbpm,ncaa_dgbpm,ncaa_oreb,ncaa_dreb,ncaa_treb,ncaa_ast,ncaa_stl,ncaa_blk,ncaa_pts,ncaa_fga,ncaa_tov,ncaa_ppp,ncaa_rim_prop,ncaa_mid_prop,ncaa_3pt_prop,ncaa_ast_fg_ratio,GMM_Primary_Cluster,GMM_Secondary_Cluster,Full Role,HEIGHT,WEIGHT,STANDING REACH,WINGSPAN,STANDING VERTICAL,VERTICAL LEAP,LANE AGILITY,THREE QUARTER SPRINT,consensus_tier,peak_Player-additional,peak_Age,peak_G,peak_GS,peak_MP,peak_FG,peak_FGA,peak_FG%,peak_3P,peak_3PA,peak_3P%,peak_2P,peak_2PA,peak_2P%,peak_FT,peak_FTA,peak_FT%,peak_ORB,peak_DRB,peak_TRB,peak_AST,peak_STL,peak_BLK,peak_TOV,peak_PF,peak_PTS,peak_ORtg,peak_DRtg,peak_AST_2_TOV,peak_AST_2_FG,peak_PPP,peak_Season,peak_PER,peak_TS%,peak_3PAr,peak_FTr,peak_ORB%,peak_DRB%,peak_TRB%,peak_AST%,peak_STL%,peak_BLK%,peak_TOV%,peak_USG%,peak_OWS,peak_DWS,peak_WS,peak_WS/48,peak_OBPM,peak_DBPM,peak_BPM,peak_VORP,peak_Dist.,peak_2P_prop,peak_0-3_prop,peak_3-10_prop,peak_10-16_prop,peak_16-3P_prop,peak_3P_prop,peak_2P_fg%,peak_0-3_fg%,peak_3-10_fg%,peak_10-16_fg%,peak_16-3P_fg%,peak_3P_fg%,peak_2P_%astd,peak_3P_%astd,peak_dunk_%FGA,peak_dunk_#,peak_corner_%3PA,peak_corner_3P%,peak_heave_Att.,peak_heave_#
0,22,Richaun Holmes,Bowling Green,MAC,30,0.671,115.8,0.251,0.593,0.623,0.115,0.21,0.072,0.165,3.366667,4.733333,0.711,4.633333,7.9,0.586,0.6,1.433333,0.419,0.113,0.015,0.507,Sr,80,3.46191,122.24,3.4,2015,0.0,0.433325,3.566667,5.166667,1.066667,2.733333,0.6903,0.3902,1.5,1.533333,0.9783,37.0,91.2274,91.5379,3.56104,192.802,7.98568,2.67221,5.31347,8.6819,28.7742,5.65496,3.02693,2.871,5.1613,8.0323,0.8387,0.7097,2.6452,14.8065,9.333333,1.9355,1.108977,0.553571,0.292857,0.153571,0.089861,Skilled Forward,,Skilled Forward,80.25,242.8,108.0,85.5,32.5,36.0,11.24,3.32,7.0,holmeri01,24.33,49.67,19.0,21.57,8.93,15.23,0.59,0.47,1.7,0.24,8.47,13.53,0.63,2.97,4.13,0.72,4.17,9.27,13.43,2.63,1.53,2.1,1.93,6.17,21.33,123.0,106.0,1.44,0.17,1.12,2017.33,18.57,0.63,0.11,0.27,9.5,21.0,15.27,8.57,1.53,3.73,10.3,16.93,2.4,1.47,3.83,0.17,0.17,0.87,1.03,0.8,6.77,0.89,0.52,0.22,0.1,0.05,0.11,0.63,0.76,0.44,0.47,0.31,0.16,0.77,0.67,0.26,78.0,0.1,0.16,0.67,0.0
1,23,Larry Nance Jr.,Wyoming,MWC,29,0.773,106.3,0.279,0.527,0.577,0.062,0.205,0.197,0.159,3.896552,4.965517,0.785,5.586207,10.551724,0.529,0.482759,1.413793,0.341,0.043,0.023,0.415,Sr,80,3.6941,117.739,2.6,2015,0.0,1.085692,3.206897,4.068966,2.37931,6.482759,0.7881,0.367,1.586207,1.724138,0.92,27.0,91.6832,91.8035,4.33077,203.468,5.38939,1.53945,3.84994,6.40162,34.9032,3.6465,2.75512,1.4194,5.8065,7.2258,2.4516,1.1935,1.1935,16.0645,11.965517,2.2581,0.979039,0.340058,0.541787,0.118156,0.204889,Skilled Forward,,Skilled Forward,80.0,227.6,106.7,84.9,29.3,34.4,11.5,3.3,7.0,nancela02,27.0,66.0,19.33,23.19,7.17,12.67,0.57,0.5,1.53,0.28,6.7,11.1,0.6,2.57,3.73,0.69,4.6,9.73,14.33,4.3,2.6,1.23,1.97,5.27,17.4,126.33,108.67,2.21,0.35,1.07,2019.0,18.27,0.61,0.12,0.29,10.37,22.4,16.3,12.7,2.6,2.23,12.0,14.43,3.2,2.0,5.2,0.17,0.63,1.63,2.27,1.67,7.5,0.88,0.51,0.2,0.08,0.09,0.12,0.6,0.74,0.42,0.36,0.35,0.28,0.72,0.99,0.23,80.67,0.33,0.22,1.33,0.0
2,22,Jae Crowder,Marquette,BE,35,0.824,122.3,0.22,0.568,0.598,0.07,0.205,0.135,0.098,3.4,4.628571,0.735,4.457143,7.4,0.602,1.742857,5.057143,0.345,0.031,0.043,0.372,Sr,78,5.29599,132.03,3.2,2012,0.0,1.659136,3.657143,4.885714,0.8,2.514286,0.7485,0.3182,0.228571,0.228571,1.0,34.0,84.8446,81.1209,5.17007,311.979,13.2884,7.37385,5.91459,13.0841,32.9429,7.96643,5.1177,2.0,6.4,8.4,2.0857,2.5143,1.0,17.5429,12.457143,1.2571,1.113777,0.392202,0.201835,0.405963,0.16743,Pass-First Playmaker,,Pass-First Playmaker,76.75,241.2,99.5,81.25,31.0,34.5,11.45,3.37,7.0,crowdja01,28.67,66.67,60.67,29.08,6.37,15.27,0.42,3.4,9.47,0.36,2.93,5.83,0.5,2.53,3.17,0.79,1.1,7.4,8.43,3.3,2.17,0.77,1.6,4.1,18.7,111.67,106.0,2.09,0.22,1.02,2018.67,12.97,0.56,0.63,0.2,2.4,16.37,9.47,9.07,2.17,1.37,8.8,16.27,2.03,2.93,4.97,0.12,-0.07,1.53,1.47,1.77,19.27,0.37,0.14,0.07,0.06,0.1,0.63,0.5,0.67,0.36,0.37,0.42,0.36,0.79,0.94,0.01,7.67,0.23,0.38,1.0,0.0
3,23,Julyan Stone,UTEP,CUSA,34,0.91,109.5,0.157,0.535,0.549,0.041,0.192,0.267,0.191,2.235294,3.970588,0.563,2.558824,4.352941,0.588,0.382353,1.5,0.255,0.017,0.023,0.678,Sr,79,2.24394,104.031,2.6,2011,0.0,2.796802,1.882353,2.647059,0.676471,1.705882,0.7111,0.3966,0.382353,0.441176,0.8667,0.0,90.8303,89.429,4.73535,268.672,6.99895,2.41287,4.58608,5.86673,36.7941,2.01948,3.84725,1.2059,6.3824,7.5882,5.2647,1.4412,0.5588,8.5,5.852941,1.8824,0.896398,0.452261,0.291457,0.256281,0.899496,Pass-First Playmaker,,Pass-First Playmaker,79.0,222.1,105.4,83.9,29.5,34.9,11.4,3.3,7.0,stoneju01,25.67,22.0,0.67,7.15,2.8,6.6,0.43,1.07,3.4,0.3,1.77,3.2,0.55,1.17,1.67,0.63,0.87,7.1,7.97,7.63,1.57,0.77,2.43,5.57,7.77,111.0,107.0,3.12,1.3,0.81,2013.67,9.07,0.56,0.61,0.23,1.97,15.93,8.93,20.67,1.57,1.2,26.13,8.57,0.13,0.17,0.3,0.08,-2.37,1.87,-0.5,0.07,17.37,0.39,0.16,0.17,0.03,0.02,0.61,0.37,0.38,0.39,0.11,0.17,0.3,0.19,1.0,0.03,0.67,0.35,0.25,0.33,0.0
4,23,Delon Wright,Utah,P12,34,0.837,125.2,0.242,0.545,0.617,0.041,0.136,0.328,0.154,4.470588,5.352941,0.835,3.970588,7.235294,0.549,0.764706,2.147059,0.356,0.032,0.039,0.571,Sr,77,5.7605,136.98,1.7,2015,0.0,2.617582,2.970588,4.382353,1.0,2.882353,0.6779,0.3469,0.5,0.529412,0.9444,20.0,87.5652,83.3382,5.1177,237.663,14.4311,7.9039,6.52721,14.7263,33.2857,8.27648,6.44984,0.9714,3.9714,4.9429,5.0857,2.0857,0.9714,14.5429,9.382353,1.9429,1.063035,0.467085,0.30721,0.22884,0.54205,Pass-First Playmaker,,Pass-First Playmaker,76.25,181.4,101.5,79.5,29.5,31.0,11.16,3.29,6.0,wrighde01,27.67,60.67,19.0,24.31,6.37,13.63,0.47,1.77,4.93,0.36,4.6,8.7,0.53,2.73,3.3,0.83,1.87,5.3,7.17,7.43,3.0,0.87,2.27,2.43,17.23,120.33,110.0,3.39,0.55,0.99,2019.67,16.37,0.57,0.36,0.24,4.4,11.83,8.17,21.17,3.0,1.5,12.9,15.47,2.37,1.83,4.17,0.14,0.8,2.13,2.97,1.8,12.33,0.64,0.31,0.26,0.05,0.02,0.36,0.53,0.71,0.37,0.39,0.49,0.36,0.27,0.81,0.02,7.33,0.22,0.37,3.0,0.33


In [21]:
X_end_loc = ncaa_to_peak.columns.get_loc("consensus_tier")+1
# we want to drop any team stats in the long term
X_col_drop = ["ncaa_team", "ncaa_gp", "ncaa_minutes_%", "ncaa_ortg", "ncaa_efg%", "ncaa_ts%",
              "ncaa_ftm", "ncaa_fta", "ncaa_2pm", "ncaa_2pa", "ncaa_3pm", "ncaa_3pa", 
              "ncaa_yr", "ncaa_ht", "ncaa_porpag", "ncaa_adjoe", "ncaa_year", 
              "ncaa_rim_makes", "ncaa_rim_att", "ncaa_mid_range_makes", "ncaa_mid_range_att",
              "ncaa_dunks_made", "ncaa_dunks_att", "ncaa_draft_pick", "ncaa_drtg", 
              "ncaa_adrtg", "ncaa_dporpag", "ncaa_stops", "ncaa_bpm", "ncaa_obpm", 
              "ncaa_dbpm", "ncaa_gbpm", "ncaa_min_per_game", "ncaa_oreb", "ncaa_dreb", 
              "ncaa_treb", "ncaa_ast", "ncaa_stl", "ncaa_blk", "ncaa_pts", "ncaa_fga",
              "ncaa_tov", "GMM_Primary_Cluster", "GMM_Secondary_Cluster", "Full Role"
             ]
X = ncaa_to_peak.iloc[:,0:X_end_loc].drop(X_col_drop, axis=1)

In [22]:
X.loc[X["ncaa_conf"].isin(["SEC", "B10", "BE", "ACC", "P12", "P10", "B12"]), "ncaa_conf"] = 3
X.loc[X["ncaa_conf"].isin(["WCC", "Amer", "A10", "MWC"]), "ncaa_conf"] = 2
X.loc[X["ncaa_conf"].isin(["CAA", "SB", "MVC", "CUSA", "MAC", "SC", "WAC"]), "ncaa_conf"] = 1
X.loc[X["ncaa_conf"].isin(["Sum", "MEAC", "OVC", "BSky", "Horz", "ASun", "MAAC", "BW", "NEC", "Pat", "Ivy", "BSth", "AE"]), "ncaa_conf"] = 0

wing_diff = X["WINGSPAN"] - X["HEIGHT"]
X.insert(X.shape[1]-1, "WINGSPAN DIFFERENTIAL", wing_diff)

In [23]:
X

Unnamed: 0,prospect_age,ncaa_player_name,ncaa_conf,ncaa_usg,ncaa_oreb%,ncaa_dreb%,ncaa_ast%,ncaa_to%,ncaa_ft%,ncaa_2p%,ncaa_3p%,ncaa_blk%,ncaa_stl%,ncaa_ftr,ncaa_foul_rate,ncaa_recruit_rank,ncaa_ast_tov_ratio,ncaa_rim_fg%,ncaa_mid_fg%,ncaa_dunk_fg%,ncaa_ogbpm,ncaa_dgbpm,ncaa_ppp,ncaa_rim_prop,ncaa_mid_prop,ncaa_3pt_prop,ncaa_ast_fg_ratio,HEIGHT,WEIGHT,STANDING REACH,WINGSPAN,STANDING VERTICAL,VERTICAL LEAP,LANE AGILITY,THREE QUARTER SPRINT,WINGSPAN DIFFERENTIAL,consensus_tier
0,22,Richaun Holmes,1,0.251,0.115,0.210,0.072,0.165,0.711,0.586,0.419,0.113,0.015,0.507,3.4,0.0,0.433325,0.6903,0.3902,0.9783,5.65496,3.02693,1.108977,0.553571,0.292857,0.153571,0.089861,80.25,242.8,108.0,85.50,32.5,36.0,11.24,3.32,5.25,7.0
1,23,Larry Nance Jr.,2,0.279,0.062,0.205,0.197,0.159,0.785,0.529,0.341,0.043,0.023,0.415,2.6,0.0,1.085692,0.7881,0.3670,0.9200,3.64650,2.75512,0.979039,0.340058,0.541787,0.118156,0.204889,80.00,227.6,106.7,84.90,29.3,34.4,11.50,3.30,4.90,7.0
2,22,Jae Crowder,3,0.220,0.070,0.205,0.135,0.098,0.735,0.602,0.345,0.031,0.043,0.372,3.2,0.0,1.659136,0.7485,0.3182,1.0000,7.96643,5.11770,1.113777,0.392202,0.201835,0.405963,0.167430,76.75,241.2,99.5,81.25,31.0,34.5,11.45,3.37,4.50,7.0
3,23,Julyan Stone,1,0.157,0.041,0.192,0.267,0.191,0.563,0.588,0.255,0.017,0.023,0.678,2.6,0.0,2.796802,0.7111,0.3966,0.8667,2.01948,3.84725,0.896398,0.452261,0.291457,0.256281,0.899496,79.00,222.1,105.4,83.90,29.5,34.9,11.40,3.30,4.90,7.0
4,23,Delon Wright,3,0.242,0.041,0.136,0.328,0.154,0.835,0.549,0.356,0.032,0.039,0.571,1.7,0.0,2.617582,0.6779,0.3469,0.9444,8.27648,6.44984,1.063035,0.467085,0.307210,0.228840,0.542050,76.25,181.4,101.5,79.50,29.5,31.0,11.16,3.29,3.25,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,19,Anthony Davis,3,0.191,0.116,0.239,0.075,0.100,0.709,0.653,0.150,0.137,0.025,0.602,2.4,100.0,1.219512,0.8736,0.3846,0.9796,9.09614,7.56117,1.213301,0.516320,0.424332,0.059347,0.148368,81.25,221.8,108.0,89.50,28.6,33.4,11.60,3.30,8.25,1.0
362,19,Marvin Bagley III,3,0.258,0.138,0.214,0.088,0.147,0.627,0.647,0.397,0.026,0.014,0.475,2.2,100.0,0.666696,0.7311,0.4576,0.9592,8.30044,1.34425,1.143405,0.600000,0.268182,0.131818,0.113640,83.00,244.2,110.9,87.90,28.6,33.0,11.70,3.40,4.90,1.0
363,20,Harrison Barnes,3,0.260,0.069,0.110,0.072,0.140,0.723,0.469,0.358,0.011,0.020,0.374,2.4,100.0,0.581082,0.7333,0.3857,1.0000,4.32657,1.67460,0.951818,0.172745,0.562380,0.262956,0.082535,79.00,227.8,101.5,83.25,38.0,39.5,10.93,3.16,4.25,2.0
364,19,Derrick Favors,3,0.218,0.125,0.202,0.073,0.229,0.628,0.617,0.000,0.078,0.018,0.476,3.8,100.0,0.383745,0.7829,0.3571,0.9464,3.82367,3.68395,1.013172,0.607639,0.388889,0.003472,0.114589,80.75,245.2,110.0,88.00,31.5,35.5,11.74,3.25,7.25,2.0


Once we've engineered our features and formatted our data, we need to split the data into training and testing sets to work with.

In [24]:
y

Unnamed: 0,peak_AST_2_TOV,peak_AST_2_FG,peak_AST%,peak_TOV%,peak_USG%,peak_PPP,peak_0-3_prop,peak_3-10_prop,peak_10-16_prop,peak_16-3P_prop,peak_corner_%3PA,peak_2P_%astd,peak_3P_%astd,peak_0-3_fg%,peak_3-10_fg%,peak_10-16_fg%,peak_16-3P_fg%,peak_3P_fg%,peak_corner_3P%,peak_STL%,peak_BLK%,peak_ORB%,peak_DRB%,peak_3P_prop_not_corner
0,1.44,0.17,8.57,10.30,16.93,1.12,0.52,0.22,0.10,0.05,0.0110,0.6853,0.0737,0.76,0.44,0.47,0.31,0.16,0.16,1.53,3.73,9.50,21.00,0.0990
1,2.21,0.35,12.70,12.00,14.43,1.07,0.51,0.20,0.08,0.09,0.0396,0.6336,0.1188,0.74,0.42,0.36,0.35,0.28,0.22,2.60,2.23,10.37,22.40,0.0804
2,2.09,0.22,9.07,8.80,16.27,1.02,0.14,0.07,0.06,0.10,0.1449,0.2923,0.5922,0.67,0.36,0.37,0.42,0.36,0.38,2.17,1.37,2.40,16.37,0.4851
3,3.12,1.30,20.67,26.13,8.57,0.81,0.16,0.17,0.03,0.02,0.2135,0.0741,0.6100,0.38,0.39,0.11,0.17,0.30,0.25,1.57,1.20,1.97,15.93,0.3965
4,3.39,0.55,21.17,12.90,15.47,0.99,0.31,0.26,0.05,0.02,0.0792,0.1728,0.2916,0.71,0.37,0.39,0.49,0.36,0.37,3.00,1.50,4.40,11.83,0.2808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,1.63,0.17,15.23,8.37,28.87,1.09,0.35,0.19,0.16,0.18,0.0228,0.5632,0.1200,0.75,0.41,0.42,0.36,0.25,0.33,2.10,6.07,8.43,24.57,0.0972
362,0.78,0.09,5.67,9.27,21.47,1.03,0.33,0.35,0.11,0.03,0.0594,0.4920,0.1656,0.75,0.46,0.38,0.33,0.28,0.24,0.93,2.43,10.10,21.20,0.1206
363,1.79,0.24,10.13,10.17,16.97,1.09,0.28,0.22,0.06,0.06,0.1365,0.3599,0.3666,0.70,0.38,0.43,0.30,0.40,0.49,1.10,0.50,4.10,15.73,0.2535
364,1.06,0.15,7.63,10.70,18.77,1.09,0.50,0.21,0.15,0.09,0.0295,0.6935,0.0165,0.73,0.51,0.43,0.34,0.07,0.07,1.67,4.67,12.53,21.40,0.0205


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=9)

In [26]:
# we only kept this feature in for eventual player identification
X_train = X_train.drop("ncaa_player_name", axis=1)

We need to tune the parameters for our Random Forest. To do this, we could work sequentially but I am going to borrow the Random Search CV method laid out in the following article.

https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

Below, we'll begin

In [27]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start = 50, stop = 1000, num = 50)]
max_features = [3, 4, 5, 6, 7, 8, 9]
max_depth = [int(x) for x in np.linspace(1, 10, num = 1)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [28]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=9, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'max_depth': [1, None],
                                        'max_features': [3, 4, 5, 6, 7, 8, 9],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 69, 88, 108, 127,
                                                         146, 166, 185, 205,
                                                         224, 243, 263, 282,
                                                         302, 321, 340, 360,
                                                         379, 398, 418, 437,
                                                         457, 476, 495, 515,
                                                         534, 554, 573, 592,
                                                         612, ...]},
  

In [30]:
rf_random.best_params_

{'n_estimators': 631,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 6,
 'max_depth': None}

In [31]:
rf_1 = RandomForestRegressor(random_state=9)
rf_2 = RandomForestRegressor(n_estimators=631, max_features=6, max_depth=None, random_state=9)
rf_1.fit(X_train, y_train)
rf_2.fit(X_train, y_train)

RandomForestRegressor(max_features=6, n_estimators=631, random_state=9)

In [32]:
from sklearn.model_selection import cross_val_score

rf1_scores = cross_val_score(rf_1, X_train, y_train, scoring="r2", n_jobs=-1)
rf2_scores = cross_val_score(rf_2, X_train, y_train, scoring="r2", n_jobs=-1)

print(f"R2 for default model: {np.mean(rf1_scores)}")
print(f"R2 for tuned model: {np.mean(rf2_scores)}")

R2 for default model: 0.29909252968770583
R2 for tuned model: 0.3075697273477204


When using the tuned parameters, we make the model run better, and improve R2 by a minor 1.1%, but this is still better than nothing. Also, we account for the fact it's extremely hard to predict these values accurately, but something (at this extremely low R2) is still better than nothing.

In [33]:
X_train_preds = rf_2.predict(X_train)
pd.DataFrame(X_train_preds, columns=y_train.columns).head()

Unnamed: 0,peak_AST_2_TOV,peak_AST_2_FG,peak_AST%,peak_TOV%,peak_USG%,peak_PPP,peak_0-3_prop,peak_3-10_prop,peak_10-16_prop,peak_16-3P_prop,peak_corner_%3PA,peak_2P_%astd,peak_3P_%astd,peak_0-3_fg%,peak_3-10_fg%,peak_10-16_fg%,peak_16-3P_fg%,peak_3P_fg%,peak_corner_3P%,peak_STL%,peak_BLK%,peak_ORB%,peak_DRB%,peak_3P_prop_not_corner
0,2.866561,0.391743,23.914105,11.117496,21.978241,1.026545,0.180919,0.241141,0.146561,0.095404,0.076032,0.169227,0.248687,0.665816,0.449588,0.469477,0.457956,0.383217,0.442155,1.378922,0.45832,1.967765,11.054231,0.253207
1,1.66393,0.220571,16.385563,10.430761,25.565864,1.049366,0.241775,0.172821,0.154612,0.14252,0.058255,0.324754,0.23717,0.684358,0.445357,0.429905,0.393994,0.355658,0.394168,2.262282,1.956513,4.832488,18.201759,0.229542
2,1.0342,0.140539,7.698352,11.711474,19.330269,0.888067,0.331157,0.125943,0.066165,0.203154,0.078085,0.438679,0.264424,0.661521,0.290079,0.236529,0.253265,0.278479,0.31233,1.401363,1.492076,7.615166,19.620824,0.201788
3,1.954295,0.331426,12.991109,12.714311,15.930444,1.019271,0.338082,0.133423,0.065151,0.049334,0.120152,0.311224,0.372643,0.683027,0.368399,0.339762,0.32458,0.364057,0.456783,1.659208,0.940507,4.024453,17.031284,0.294222
4,1.657908,0.360777,15.621965,15.79935,17.478368,0.978051,0.455261,0.166228,0.10309,0.140903,0.024351,0.578502,0.125079,0.681014,0.514945,0.417591,0.427956,0.202995,0.28599,1.341379,5.297211,11.075024,25.244485,0.109596


In [34]:
y_train.head()

Unnamed: 0,peak_AST_2_TOV,peak_AST_2_FG,peak_AST%,peak_TOV%,peak_USG%,peak_PPP,peak_0-3_prop,peak_3-10_prop,peak_10-16_prop,peak_16-3P_prop,peak_corner_%3PA,peak_2P_%astd,peak_3P_%astd,peak_0-3_fg%,peak_3-10_fg%,peak_10-16_fg%,peak_16-3P_fg%,peak_3P_fg%,peak_corner_3P%,peak_STL%,peak_BLK%,peak_ORB%,peak_DRB%,peak_3P_prop_not_corner
259,2.97,0.37,24.77,9.93,23.1,1.08,0.17,0.28,0.17,0.09,0.0728,0.1584,0.21,0.7,0.48,0.5,0.5,0.4,0.48,1.2,0.27,1.83,11.37,0.2072
189,1.77,0.21,19.43,9.03,29.97,1.1,0.18,0.16,0.2,0.18,0.0392,0.2664,0.2128,0.7,0.48,0.46,0.44,0.4,0.46,2.7,1.9,3.83,17.73,0.2408
327,0.9,0.12,6.6,11.83,19.17,0.83,0.33,0.09,0.05,0.26,0.0784,0.432,0.2716,0.65,0.22,0.16,0.2,0.27,0.31,1.33,1.1,8.27,20.33,0.2016
171,2.09,0.39,14.2,13.73,14.9,1.05,0.39,0.12,0.06,0.03,0.104,0.312,0.356,0.71,0.37,0.32,0.32,0.37,0.5,1.67,0.8,4.43,19.23,0.296
15,1.88,0.43,18.1,16.87,17.07,0.95,0.44,0.13,0.12,0.18,0.0221,0.5829,0.1235,0.67,0.56,0.43,0.48,0.2,0.33,1.3,6.5,11.8,27.2,0.1079


# Prediction Intervals

Now, as we move past our initial model, we develop our prediction intervals to decide the ranges of outcomes for prospects. Therefore, we'll use quantile loss to predict the 90th and 10th percentilles of player outcomes across all attributes.

In [48]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

gbr_floor = GradientBoostingRegressor(loss="quantile", alpha=.1)
gbr_ceiling = GradientBoostingRegressor(loss="quantile", alpha=.9)

mor_floor = MultiOutputRegressor(gbr_floor).fit(X_train_preds, y_train)
mor_ceiling = MultiOutputRegressor(gbr_ceiling).fit(X_train_preds, y_train)

Last, we just need to save the models to apply them later.

In [50]:
import joblib
joblib.dump(rf_2, "prospect_models/peak_models/rf_peak_pred.sav")
joblib.dump(mor_floor, "prospect_models/peak_models/gbr_floor.sav")
joblib.dump(mor_ceiling, "prospect_models/peak_models/gbr_ceiling.sav")

['prospect_models/peak_models/gbr_ceiling.sav']