In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor as RFR

In [2]:
dfTeams = pd.read_csv('teams.csv')
dfRosters = pd.read_csv('rosters.csv')

In [3]:
dfAll = dfTeams.merge(dfRosters,
                      how = 'left',
                      on = ['tm_link', 'year']
                      )

In [4]:
dfAll['conf_clean'] = dfAll.conference.apply(lambda c: c.split(' (')[0])

In [5]:
conferences = {c : ii for ii, c in enumerate(sorted(dfAll.conf_clean.unique()))}
dfAll['conference_id'] = dfAll.conf_clean.apply(lambda c: conferences[c])

In [6]:
dfAll['pos_id'], pos_key = pd.factorize(dfAll.pos_primary)

AttributeError: 'DataFrame' object has no attribute 'pos_primary'

In [7]:
dfAll.head()

Unnamed: 0,conference,team,tm_link,year,class,ht,player,pos,player_link,conf_clean,conference_id
0,ACC,Duke,duke,2001,SR,80,Shane Battier,F,shane-battier-1,ACC,3
1,ACC,Duke,duke,2001,SO,81,Carlos Boozer,C,carlos-boozer-1,ACC,3
2,ACC,Duke,duke,2001,SO,76,Andy Borman,G,andy-borman-1,ACC,3
3,ACC,Duke,duke,2001,SO,70,Andre Buckner,G,andre-buckner-1,ACC,3
4,ACC,Duke,duke,2001,SR,75,Ryan Caldbeck,G,ryan-caldbeck-1,ACC,3


### Clean Up Classes

In [8]:
# Manually change C -> SO
dfAll.loc[dfAll['class'] == 'C', 'class'] = 'SO'
dfAll['class'] = dfAll['class'].apply(lambda c: c.upper() if not pd.isnull(c) else np.nan)

In [9]:
fixed_classes = {54441 : 'JR',
                 60813 : 'FR',
                 60814 : 'FR',
                 62037 : 'FR'
                }
for fc in fixed_classes:
    dfAll.loc[fc, 'class'] = fixed_classes[fc]


In [10]:
# Set all grad school students to SR (only ~60 players)
dfAll.loc[dfAll['class'].isin(['GS', 'GR']), 'class'] = 'SR'

In [11]:
# Remove "Others"
dfAll = dfAll.loc[dfAll.player != 'Others']

In [12]:
class_mapping = {
    'SR' : 4,
    'JR' : 3,
    'SO' : 2,
    'FR' : 1
}
dfAll['class_id'] = dfAll['class'].apply(lambda c: class_mapping[c] if pd.notnull(c) else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Clean Up Positions

In [13]:
dfAll.pos = dfAll.pos.apply(lambda p: p.upper() if not pd.isnull(p) else np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [14]:
# Change pos == 'W' (n = 2) to other years' position (Bo Barnes (G), Jalen Moore (F))
# dfAll[dfAll.pos == 'W']
dfAll.loc[dfAll.player == 'Bo Barnes', 'pos'] = 'G'
dfAll.loc[dfAll.player == 'Jalen Moore', 'pos'] = 'F'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [15]:
# Creare Column for "Multi-Position"
dfAll['multipos'] = dfAll.pos.apply(lambda p: 0 if pd.isnull(p) else ('-' in p))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [16]:
# Create Column for Primary Position
dfAll['pos_primary'] = dfAll.pos.apply(lambda p: np.nan if pd.isnull(p) else p[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [17]:
dfAll['pos_id'], pos_key = pd.factorize(dfAll.pos_primary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [63]:
# dfAll.to_csv('rosters_cleaned.csv', index = False)

### Clean Up Heights

In [18]:
dfAll.ht.unique()

array([ 80.,  81.,  76.,  70.,  75.,  82.,  73.,  78.,  83.,  74.,  71.,
        79.,  77.,  85.,  84.,  72.,  nan,  69.,  87.,  65.,  86.,  66.,
        68.,  67.,  64.,  90.,  60.,  63.,  89.,  91.,  88.,  62.])

In [19]:
dfAll.insert(6, 'ht_imp', dfAll.ht.isnull())

In [20]:
# Fill NA heights via...

Xcol = ['year', 'class_id', 'multipos', 'conference_id', 'pos_id']
ycol = 'ht'

rfr_best = RFR(n_estimators = 100,
               # max_features = 10,
               max_depth = 10,
               min_samples_split = 2,
               min_samples_leaf = 3
               )

Xtrain = dfAll.loc[dfAll.ht.notnull(), Xcol]
ytrain = dfAll.loc[dfAll.ht.notnull(), ycol]
Xtest = dfAll.loc[dfAll.ht.isnull(), Xcol]

rfr_best.fit(Xtrain, ytrain)
ypred = rfr_best.predict(Xtest)

dfAll.loc[dfAll.ht.isnull(), ycol] = ypred

In [21]:
dfAll.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67011 entries, 0 to 67023
Data columns (total 16 columns):
conference       67011 non-null object
team             67011 non-null object
tm_link          67011 non-null object
year             67011 non-null float64
class            66898 non-null object
ht               67011 non-null float64
ht_imp           67011 non-null bool
player           67007 non-null object
pos              66116 non-null object
player_link      67007 non-null object
conf_clean       67011 non-null object
conference_id    67011 non-null int64
class_id         67011 non-null int64
multipos         67011 non-null object
pos_primary      66116 non-null object
pos_id           67011 non-null int64
dtypes: bool(1), float64(2), int64(3), object(10)
memory usage: 8.2+ MB


In [24]:
dfAll[dfAll.pos_primary.isnull()]

Unnamed: 0,conference,team,tm_link,year,class,ht,ht_imp,player,pos,player_link,conf_clean,conference_id,class_id,multipos,pos_primary,pos_id
23,Pac-10,Arizona,arizona,2001,FR,77.000000,False,Mike Schwertley,,mike-schwertley-1,Pac-10,26,1,0,,-1
127,Big 12,Kansas,kansas,2001,JR,81.000000,False,Todd Kappelmann,,todd-kappelmann-1,Big 12,5,3,0,,-1
148,ACC,Virginia,virginia,2001,SO,77.000000,False,Roger Mason,,roger-mason-1,ACC,3,2,0,,-1
301,SEC (East),Tennessee,tennessee,2001,FR,80.000000,False,Andy Ikeakor,,andy-ikeakor-1,SEC,29,1,0,,-1
323,Big Ten,Wisconsin,wisconsin,2001,FR,77.584017,True,Ricky Bower,,ricky-bower-2,Big Ten,9,1,0,,-1
325,Big Ten,Wisconsin,wisconsin,2001,FR,77.584017,True,Kyle Grusczynski,,kyle-grusczynski-1,Big Ten,9,1,0,,-1
332,Big Ten,Wisconsin,wisconsin,2001,FR,77.584017,True,Pete Schmit,,pete-schmit-2,Big Ten,9,1,0,,-1
405,CUSA (American),Charlotte,charlotte,2001,FR,80.000000,False,Butter Johnson,,butter-johnson-1,CUSA,12,1,0,,-1
443,SEC (West),Alabama,alabama,2001,FR,79.000000,False,Gerald Wallace,,gerald-wallace-1,SEC,29,1,0,,-1
545,Big Ten,Penn State,penn-state,2001,SO,78.104591,True,Marcus Banta,,marcus-banta-1,Big Ten,9,2,0,,-1


In [105]:
dfAll.to_csv('rosters_imputed.csv', index = False)