In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, f1_score
from rfpimp import permutation_importances

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

import logistic_regression_functions as lrf
import GradientDescent as GD
GradientDescent = GD.GradientDescent

from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant

In [141]:
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
from sklearn.cluster import KMeans

In [15]:
from sklearn import preprocessing

In [3]:
pd.set_option('display.max_columns', 2000)
pd.set_option('precision', 2) #setting the number of decimel points

In [175]:
#Import advanced team data
tadv = pd.read_csv('data/TEAMstats - AdvancedTeam.csv')

In [176]:
#get defensive votes
%run defvotes


In [177]:
#make d_votes table
#fv = sum_past_yrs(fv_dict2)
sdd = replace_nans(fv)
d_votes = make_df(sdd)

In [178]:
#Get main player data
from process_players import *

In [179]:
#combine main player data with d_votes table
pdata = get_clean_pdata()
pdata = add_defvotes(pdata, d_votes)

In [180]:
#add age multiplier
pdata['AgeMulti'] = 1
pdata.loc[pdata['Age'] > 31,'AgeMulti'] = .8
pdata.loc[pdata['Age'] > 34,'AgeMulti'] = .67
pdata.loc[pdata['Age'] < 24, 'AgeMulti'] = 1.2
pdata.loc[pdata['Age'] < 21, 'AgeMulti'] = 1.33

In [181]:
#drop nans and players who played less than 31 games (half of strike shortened season)
p_wage = pdata.dropna()
p_wage = p_wage[p_wage['G'] > 33]

In [182]:
#Create a list of columns to normalize
cols = ['MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%',
       'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS',
       'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP','MPG', 'advotes']

In [183]:
#separate p_wage into df's by position
dflist = []
for i in ['PF', 'PG', 'SF', 'SG', 'C']:
    df = p_wage[p_wage['Pos'] == i].reset_index()
    dflist.append(df)

In [188]:
#Make everything a normalized version of itself
dfscaledlist = []
count = 0
for i in ['PF', 'PG', 'SF', 'SG', 'C']:
    x = p_wage[p_wage['Pos'] == i][['MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%',
           'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS',
           'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP','MPG', 'advotes']] #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled, columns=['sMP', 'sPER', 'sTS%', 's3PAr', 'sFTr', 'sORB%',
           'sDRB%', 'sTRB%', 'sAST%', 'sSTL%', 'sBLK%', 'sTOV%', 'sUSG%', 'sOWS', 'sDWS',
           'sWS', 'sWS/48', 'sOBPM', 'sDBPM', 'sBPM', 'sVORP','sMPG', 'sadvotes'])
    dfscaled = pd.concat([dflist[count], df], axis=1, sort=False)
    dfscaledlist.append(dfscaled)
    count += 1
        
        

In [189]:
#Rejoin positioned dataframes
dfs = dfscaledlist[0].append(dfscaledlist[1])
dfs = dfs.append(dfscaledlist[2])
dfs = dfs.append(dfscaledlist[3])
dfs = dfs.append(dfscaledlist[4])

In [190]:
dfs.head()

Unnamed: 0,index,Player,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,TM,YR,YRprior,2YRprior,MPG,advotes,AgeMulti,sMP,sPER,sTS%,s3PAr,sFTr,sORB%,sDRB%,sTRB%,sAST%,sSTL%,sBLK%,sTOV%,sUSG%,sOWS,sDWS,sWS,sWS/48,sOBPM,sDBPM,sBPM,sVORP,sMPG,sadvotes
0,0,Shareef Abdur-Rahim,PF,29.0,72.0,1961.0,17.2,0.59,0.035,0.44,6.4,14.8,10.6,13.2,1.3,1.6,12.5,20.1,4.0,2.2,6.2,0.15,0.4,0.4,0.9,1.4,Sacramento Kings 05-06,6,5.0,4.0,27.24,0.0,1.0,0.58,0.51,0.63,0.0418,0.34,0.31,0.23,0.24,0.3,0.34,0.16,0.35,0.46,0.37,0.31,0.37,0.56,0.47,0.49,0.54,0.29,0.64,0.0
1,12,Brian Skinner,PF,29.0,38.0,429.0,10.8,0.54,0.0,0.39,8.3,19.2,13.8,6.0,1.3,3.4,18.2,10.6,0.2,0.6,0.8,0.09,-3.7,2.1,-1.6,0.0,Sacramento Kings 05-06,6,5.0,4.0,11.29,0.0,1.0,0.09,0.29,0.52,0.0,0.3,0.42,0.37,0.39,0.14,0.34,0.35,0.56,0.12,0.15,0.07,0.11,0.41,0.23,0.65,0.43,0.17,0.2,0.0
2,16,Kenny Thomas,PF,28.0,82.0,2293.0,15.0,0.54,0.002,0.34,10.3,21.2,15.8,12.0,1.6,1.2,16.3,16.6,2.2,3.3,5.5,0.12,-1.1,1.8,0.7,1.6,Sacramento Kings 05-06,6,5.0,4.0,27.96,0.0,1.0,0.69,0.43,0.51,0.00239,0.25,0.54,0.44,0.49,0.27,0.45,0.12,0.49,0.33,0.27,0.47,0.34,0.46,0.38,0.62,0.53,0.31,0.66,0.0
3,33,Kenny Thomas,PF,29.0,62.0,1412.0,10.2,0.49,0.003,0.25,10.9,20.6,15.7,8.4,1.5,0.9,21.1,13.6,-0.1,1.4,1.2,0.04,-3.1,1.2,-1.8,0.1,Sacramento Kings 06-07,7,6.0,5.0,22.77,0.0,1.0,0.41,0.27,0.39,0.00358,0.18,0.58,0.42,0.48,0.19,0.41,0.09,0.67,0.23,0.13,0.19,0.13,0.27,0.26,0.56,0.42,0.18,0.52,0.0
4,35,Corliss Williamson,PF,33.0,68.0,1337.0,12.7,0.56,0.004,0.45,5.4,14.2,9.7,5.2,1.1,0.9,14.9,21.9,0.6,0.9,1.5,0.05,-3.2,-1.9,-5.1,-1.1,Sacramento Kings 06-07,7,6.0,5.0,19.66,0.0,0.8,0.38,0.35,0.56,0.00478,0.34,0.26,0.21,0.2,0.12,0.28,0.09,0.44,0.52,0.17,0.12,0.14,0.3,0.26,0.27,0.28,0.08,0.43,0.0


*Clustering Starts here:*


In [191]:
#Make offensive and defensive clusters
Xo = dfs[['sPER', 's3PAr','sORB%', 'sAST%', 'sUSG%', 'sOWS','sOBPM', 'sVORP', 'sMPG']]
Xd = dfs[['sDRB%', 'sSTL%', 'sBLK%', 'sDWS', 'sDBPM', 'sVORP','sMPG', 'sadvotes']]


In [196]:
#Create Kmeans models for offense and defense
kmeansO = KMeans(n_clusters=20, random_state=7).fit(Xo)
kmeansD = KMeans(n_clusters=10, random_state=9).fit(Xd)

In [197]:
#Create labels to tie back to individual players
labsO = kmeansO.labels_
labsD = kmeansD.labels_

In [198]:
#Add clusters back to dfs
dfs['O_cluster'] = labsO
dfs['D_cluster'] = labsD

In [199]:
dfs.groupby('D_cluster').mean().sort_values('DWS', ascending=False).index

Int64Index([7, 8, 0, 9, 4, 1, 3, 2, 5, 6], dtype='int64', name='D_cluster')

In [200]:
dfs.groupby('O_cluster').mean().sort_values('OWS', ascending=False).index

Int64Index([8, 19, 11, 13, 17, 1, 6, 9, 15, 2, 12, 10, 5, 7, 14, 4, 18, 0, 16,
            3],
           dtype='int64', name='O_cluster')

In [201]:
#Indices to reassign clusters to numerical value
oind = [8, 19, 11, 13, 17, 1, 6, 9, 15, 2, 12, 10, 5, 7, 14, 4, 18, 0, 16, 3]
dind = [7, 8, 0, 9, 4, 1, 3, 2, 5, 6]

In [202]:
#Reassign values to numerical values from greatest to least based on highest mean win shares
count = 9
for i in dind:
    dfs.loc[dfs['D_cluster'] == i,'D_clust'] = count
    count -= 1

In [203]:
#Reassign values to numerical values from greatest to least based on highest mean win shares
count = 19
for i in oind:
    dfs.loc[dfs['O_cluster'] == i,'O_clust'] = count
    count -= 1

In [204]:
#Assign position scaled dataframe
p_sca = dfs

In [92]:
ttarg = advteam_target()

In [138]:
20**1.5/6
3**1.5/6

0.8660254037844387

In [205]:
p_sca['O_clust'] = (p_sca['O_clust']**1.5)/6
p_sca['D_clust'] = (p_sca['D_clust']**1.5)/3


In [206]:
p_sca.head()

Unnamed: 0,index,Player,Pos,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,TM,YR,YRprior,2YRprior,MPG,advotes,AgeMulti,sMP,sPER,sTS%,s3PAr,sFTr,sORB%,sDRB%,sTRB%,sAST%,sSTL%,sBLK%,sTOV%,sUSG%,sOWS,sDWS,sWS,sWS/48,sOBPM,sDBPM,sBPM,sVORP,sMPG,sadvotes,O_cluster,D_cluster,D_clust,O_clust
0,0,Shareef Abdur-Rahim,PF,29.0,72.0,1961.0,17.2,0.59,0.035,0.44,6.4,14.8,10.6,13.2,1.3,1.6,12.5,20.1,4.0,2.2,6.2,0.15,0.4,0.4,0.9,1.4,Sacramento Kings 05-06,6,5.0,4.0,27.24,0.0,1.0,0.58,0.51,0.63,0.0418,0.34,0.31,0.23,0.24,0.3,0.34,0.16,0.35,0.46,0.37,0.31,0.37,0.56,0.47,0.49,0.54,0.29,0.64,0.0,15,1,2.67,6.08
1,12,Brian Skinner,PF,29.0,38.0,429.0,10.8,0.54,0.0,0.39,8.3,19.2,13.8,6.0,1.3,3.4,18.2,10.6,0.2,0.6,0.8,0.09,-3.7,2.1,-1.6,0.0,Sacramento Kings 05-06,6,5.0,4.0,11.29,0.0,1.0,0.09,0.29,0.52,0.0,0.3,0.42,0.37,0.39,0.14,0.34,0.35,0.56,0.12,0.15,0.07,0.11,0.41,0.23,0.65,0.43,0.17,0.2,0.0,16,3,1.73,0.17
2,16,Kenny Thomas,PF,28.0,82.0,2293.0,15.0,0.54,0.002,0.34,10.3,21.2,15.8,12.0,1.6,1.2,16.3,16.6,2.2,3.3,5.5,0.12,-1.1,1.8,0.7,1.6,Sacramento Kings 05-06,6,5.0,4.0,27.96,0.0,1.0,0.69,0.43,0.51,0.00239,0.25,0.54,0.44,0.49,0.27,0.45,0.12,0.49,0.33,0.27,0.47,0.34,0.46,0.38,0.62,0.53,0.31,0.66,0.0,9,9,4.9,6.93
3,33,Kenny Thomas,PF,29.0,62.0,1412.0,10.2,0.49,0.003,0.25,10.9,20.6,15.7,8.4,1.5,0.9,21.1,13.6,-0.1,1.4,1.2,0.04,-3.1,1.2,-1.8,0.1,Sacramento Kings 06-07,7,6.0,5.0,22.77,0.0,1.0,0.41,0.27,0.39,0.00358,0.18,0.58,0.42,0.48,0.19,0.41,0.09,0.67,0.23,0.13,0.19,0.13,0.27,0.26,0.56,0.42,0.18,0.52,0.0,0,4,3.73,0.47
4,35,Corliss Williamson,PF,33.0,68.0,1337.0,12.7,0.56,0.004,0.45,5.4,14.2,9.7,5.2,1.1,0.9,14.9,21.9,0.6,0.9,1.5,0.05,-3.2,-1.9,-5.1,-1.1,Sacramento Kings 06-07,7,6.0,5.0,19.66,0.0,0.8,0.38,0.35,0.56,0.00478,0.34,0.26,0.21,0.2,0.12,0.28,0.09,0.44,0.52,0.17,0.12,0.14,0.3,0.26,0.27,0.28,0.08,0.43,0.0,0,5,0.33,0.47


In [207]:
p2yr = p_sca[['Player','Age','Pos','G','MP','TM','YR','YRprior','2YRprior','AgeMulti', 'sPER', 'sTS%', 's3PAr', 'sFTr', 'sORB%',
           'sDRB%', 'sTRB%', 'sAST%', 'sSTL%', 'sBLK%', 'sTOV%', 'sUSG%', 'sOWS', 'sDWS',
           'sWS', 'sWS/48', 'sOBPM', 'sDBPM', 'sBPM', 'sVORP','sMPG', 'sadvotes', 'O_cluster', 'D_cluster','D_clust','O_clust']]



In [208]:
p2 = add_2yrs_prior(p2yr)

In [209]:
p2.shape

(7654, 106)

In [210]:
p2no6 = p2[p2['YR_x'] != 6]

In [211]:
p2no6.columns

Index(['Player', 'Age_x', 'Pos_x', 'G_x', 'MP_x', 'TM_x', 'YR_x', 'YRprior_x',
       '2YRprior_x', 'AgeMulti_x',
       ...
       'sOBPM', 'sDBPM', 'sBPM', 'sVORP', 'sMPG', 'sadvotes', 'O_cluster',
       'D_cluster', 'D_clust', 'O_clust'],
      dtype='object', length=106)

In [169]:
def get_2yr_mean(df):
    ap_2yr_mean = df[['Player', 'Pos_x', 'Age_x','TM_x','YR_x','MP_x', 'AgeMulti_x']]

    col_means = ['PERs', '3PArs', 'ORB%s', 'AST%s', 'USG%s', 'OWSs', 'OBPMs',
       'VORPs', 'MPGs', 'DRB%s', 'STL%s', 'BLK%s', 'DWSs', 'DBPMs', 'VORPs',
       'MPGs', 'advotess', 'O_cluster', 'D_cluster', 'D_clust', 'O_clust']
    cols = ['PERs_y', '3PArs_y', 'ORB%s_y', 'AST%s_y',
       'USG%s_y', 'OWSs_y', 'OBPMs_y', 'VORPs_y', 'MPGs_y', 'DRB%s_y',
       'STL%s_y', 'BLK%s_y', 'DWSs_y', 'DBPMs_y', 'VORPs_y', 'MPGs_y',
       'advotess_y','O_cluster_y', 'D_cluster_y', 'D_clust_y', 'O_clust_y']

    for i in range(len(col_means)):
        ap_2yr_mean[str(col_means[i])+"mean"] = (df[col_means[i]] + df[cols[i]])/2
    
    return ap_2yr_mean

In [212]:
p2mean = get_2yr_mean(p2no6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  cols = ['PERs_y', '3PArs_y', 'ORB%s_y', 'AST%s_y',


KeyError: 'PER'

In [220]:
%run process_players

In [214]:
p2no6 = p2no6.dropna()

In [223]:
p2no6.head()

Unnamed: 0,Player,Age_x,Pos_x,G_x,MP_x,TM_x,YR_x,YRprior_x,2YRprior_x,AgeMulti_x,sPER_x,sTS%_x,s3PAr_x,sFTr_x,sORB%_x,sDRB%_x,sTRB%_x,sAST%_x,sSTL%_x,sBLK%_x,sTOV%_x,sUSG%_x,sOWS_x,sDWS_x,sWS_x,sWS/48_x,sOBPM_x,sDBPM_x,sBPM_x,sVORP_x,sMPG_x,sadvotes_x,O_cluster_x,D_cluster_x,D_clust_x,O_clust_x,Age_y,Pos_y,G_y,MP_y,TM_y,YR_y,YRprior_y,2YRprior_y,AgeMulti_y,sPER_y,sTS%_y,s3PAr_y,sFTr_y,sORB%_y,sDRB%_y,sTRB%_y,sAST%_y,sSTL%_y,sBLK%_y,sTOV%_y,sUSG%_y,sOWS_y,sDWS_y,sWS_y,sWS/48_y,sOBPM_y,sDBPM_y,sBPM_y,sVORP_y,sMPG_y,sadvotes_y,O_cluster_y,D_cluster_y,D_clust_y,O_clust_y,Age,Pos,G,MP,TM,YR,YRprior,2YRprior,AgeMulti,sPER,sTS%,s3PAr,sFTr,sORB%,sDRB%,sTRB%,sAST%,sSTL%,sBLK%,sTOV%,sUSG%,sOWS,sDWS,sWS,sWS/48,sOBPM,sDBPM,sBPM,sVORP,sMPG,sadvotes,O_cluster,D_cluster,D_clust,O_clust
5,Mikki Moore,32.0,PF,82.0,2385.0,Sacramento Kings 07-08,8.0,7.0,6.0,0.8,0.31,0.7,0.00478,0.28,0.41,0.27,0.32,0.12,0.14,0.15,0.48,0.2,0.33,0.19,0.29,0.4,0.41,0.44,0.46,0.23,0.69,0.0,9.0,1.0,2.67,6.93,31.0,C,79.0,2082.0,New Jersey Nets 06-07,7.0,6.0,5.0,1.0,0.49,0.77,0.00299,0.36,0.4,0.19,0.26,0.2,0.34,0.2,0.34,0.37,0.43,0.3,0.42,0.5,0.52,0.43,0.51,0.34,0.63,0.0,9.0,1.0,2.67,6.93,30.0,C,47.0,583.0,Seattle SuperSonics 05-06,6.0,5.0,4.0,1.0,0.28,0.44,0.024,0.39,0.44,0.31,0.37,0.22,0.14,0.17,0.51,0.33,0.17,0.0,0.06,0.18,0.33,0.24,0.22,0.15,0.22,0.0,16.0,6.0,0.0,0.17
7,Mikki Moore,33.0,PF,46.0,746.0,Sacramento Kings 08-09,9.0,8.0,7.0,0.8,0.21,0.6,0.0,0.26,0.35,0.3,0.31,0.13,0.21,0.14,0.48,0.1,0.17,0.01,0.11,0.3,0.27,0.39,0.34,0.14,0.33,0.0,16.0,6.0,0.0,0.17,32.0,PF,82.0,2385.0,Sacramento Kings 07-08,8.0,7.0,6.0,0.8,0.31,0.7,0.00478,0.28,0.41,0.27,0.32,0.12,0.14,0.15,0.48,0.2,0.33,0.19,0.29,0.4,0.41,0.44,0.46,0.23,0.69,0.0,9.0,1.0,2.67,6.93,31.0,C,79.0,2082.0,New Jersey Nets 06-07,7.0,6.0,5.0,1.0,0.49,0.77,0.00299,0.36,0.4,0.19,0.26,0.2,0.34,0.2,0.34,0.37,0.43,0.3,0.42,0.5,0.52,0.43,0.51,0.34,0.63,0.0,9.0,1.0,2.67,6.93
8,Andres Nocioni,29.0,PF,53.0,1278.0,Chicago Bulls 08-09,9.0,8.0,7.0,1.0,0.31,0.53,0.556,0.2,0.12,0.29,0.21,0.17,0.24,0.09,0.35,0.46,0.18,0.15,0.16,0.35,0.42,0.29,0.4,0.16,0.55,0.00546,7.0,5.0,0.33,2.45,28.0,SF,82.0,2021.0,Chicago Bulls 07-08,8.0,7.0,6.0,1.0,0.36,0.62,0.443,0.35,0.22,0.58,0.49,0.15,0.06,0.24,0.42,0.62,0.23,0.29,0.22,0.39,0.4,0.24,0.29,0.12,0.53,0.00667,12.0,4.0,3.73,4.5,27.0,PF,53.0,1406.0,Chicago Bulls 06-07,7.0,6.0,5.0,1.0,0.45,0.61,0.428,0.18,0.14,0.43,0.32,0.16,0.24,0.13,0.39,0.59,0.22,0.37,0.27,0.52,0.43,0.43,0.47,0.21,0.62,0.00205,12.0,4.0,3.73,4.5
11,Carl Landry,26.0,PF,52.0,1415.0,Houston Rockets 09-10,10.0,9.0,8.0,1.0,0.64,0.72,0.0,0.39,0.49,0.19,0.29,0.11,0.24,0.24,0.3,0.59,0.38,0.18,0.33,0.64,0.51,0.35,0.5,0.23,0.64,0.0,15.0,1.0,2.67,6.08,25.0,PF,69.0,1467.0,Houston Rockets 08-09,9.0,8.0,7.0,1.0,0.52,0.75,0.00836,0.35,0.54,0.29,0.38,0.1,0.21,0.15,0.32,0.39,0.36,0.31,0.36,0.66,0.46,0.47,0.52,0.25,0.47,0.0,9.0,4.0,3.73,6.93,24.0,PF,42.0,711.0,Houston Rockets 07-08,8.0,7.0,6.0,1.0,0.65,0.77,0.00597,0.38,0.84,0.31,0.52,0.12,0.34,0.07,0.21,0.41,0.28,0.16,0.25,0.82,0.57,0.42,0.58,0.23,0.35,0.0,10.0,6.0,0.0,3.77
13,Carl Landry,27.0,PF,53.0,1406.0,Sacramento Kings 10-11,11.0,10.0,9.0,1.0,0.42,0.52,0.00717,0.34,0.52,0.08,0.23,0.13,0.28,0.1,0.34,0.46,0.24,0.09,0.19,0.39,0.41,0.32,0.41,0.17,0.62,0.0,9.0,5.0,0.33,6.93,26.0,PF,52.0,1415.0,Houston Rockets 09-10,10.0,9.0,8.0,1.0,0.64,0.72,0.0,0.39,0.49,0.19,0.29,0.11,0.24,0.24,0.3,0.59,0.38,0.18,0.33,0.64,0.51,0.35,0.5,0.23,0.64,0.0,15.0,1.0,2.67,6.08,25.0,PF,69.0,1467.0,Houston Rockets 08-09,9.0,8.0,7.0,1.0,0.52,0.75,0.00836,0.35,0.54,0.29,0.38,0.1,0.21,0.15,0.32,0.39,0.36,0.31,0.36,0.66,0.46,0.47,0.52,0.25,0.47,0.0,9.0,4.0,3.73,6.93


In [221]:
p2mean = get_2yr_mean(p2no6)

26
26


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ap_2yr_mean[str(col_means[i])+"mean"] = (df[col_means[i]] + df[cols[i]])/2


KeyError: 'sTS%_y, s3PAr_y'