In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

ModuleNotFoundError: No module named 'cleaning.py'; 'cleaning' is not a package

In [None]:
# I am using min-max feature scaling normalization because
# I want to compare player's stats relative to their apex.
def normalize(ls):
    '''Performs min-max feature scaling normalization on a sequence'''
    return [(num - min(ls))/(max(ls)-min(ls)) for num in ls]

df = pd.read_csv('all-stats-clean.csv', header=0, index_col=0)

In [47]:
# There are many instances of players who took seasons off due to injury/
# health concerns, military service, or to play elsewhere. These instances 
# screwed up the data scraping process so they require additional cleaning.
# This is done by systematically going through and checking Basketball
# Reference and adjusting their stats accordingly by hand. I wanted to do it
# programmatically, but the juice wasn't worth the squeeze.

df = df.replace('None', np.nan)
df.columns = df.columns.str.lower()

In [48]:
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='ignore')

# Add All Star category
allstar= {'allstar':[]}
for i, row in df.iterrows():
    try:
        len(row['season'])
        allstar['allstar'].append(0)
    except:
        allstar['allstar'].append(1)
df['allstar'] = allstar['allstar']


# I will also need to deal with players who were traded mid season. I am going
# to solely look at their totals over the course of the entire season
df = df.loc[df.age.shift(1) != df.age]
df = df.reset_index(drop=True)

In [49]:
df.describe()

Unnamed: 0,age,g,gs,mp,fg,fga,fg%,3p,3pa,3p%,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,allstar
count,3956.0,3867.0,2678.0,3818.0,3834.0,3834.0,3834.0,2793.0,2811.0,2625.0,...,3087.0,3071.0,3823.0,3852.0,3099.0,3100.0,2936.0,3876.0,3878.0,3956.0
mean,27.928969,68.590897,53.157954,31.195364,5.916745,12.715962,1.232978,0.733906,2.450729,0.279507,...,1.56252,4.137126,6.229741,3.590421,1.108772,0.784032,2.141519,2.519221,15.674523,0.368807
std,4.54251,16.673757,29.175766,7.883603,2.539932,5.198693,18.004452,0.812113,5.455761,0.147848,...,1.175941,2.337452,3.863537,2.333859,0.568381,1.309786,0.968293,0.748,6.753456,0.482542
min,18.0,1.0,0.0,1.8,0.0,0.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24.0,64.0,27.0,26.9,4.0,8.8,0.427,0.0,0.1,0.213,...,0.6,2.3,3.3,1.9,0.7,0.2,1.4,2.0,10.6,0.0
50%,28.0,75.0,66.0,33.3,5.9,12.8,0.46,0.4,1.4,0.324,...,1.2,3.5,5.2,3.0,1.0,0.4,2.1,2.5,15.6,0.0
75%,31.0,80.0,78.0,36.875,7.7,16.4,0.498,1.3,3.6,0.372,...,2.3,5.6,8.5,4.8,1.4,1.0,2.8,3.0,20.3,1.0
max,43.0,88.0,82.0,48.5,20.0,39.5,448.0,5.1,79.0,1.0,...,7.2,13.7,27.2,14.5,4.2,26.4,18.2,4.6,50.4,1.0


In [40]:
df.columns

Index(['player', 'href', 'height', 'season', 'age', 'tm', 'lg', 'pos', 'g',
       'gs', 'mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', '2p', '2pa', '2p%',
       'efg%', 'ft', 'fta', 'ft%', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk',
       'tov', 'pf', 'pts', 'allstar'],
      dtype='object')

In [42]:
df.rename(columns={'fg%':'fgp','3p':'threep', '3pa':'threepa',
                  '3p%':'threepp', '2p':'twop', '2pa':'twopa',
                  '2p%':'twopp', 'efg%':'efgp', 'ft%':'ftp'}, inplace=True)

In [44]:
df.head()

Unnamed: 0,player,href,height,season,age,tm,lg,pos,g,gs,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,allstar
0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,22,MIL,NBA,C,82.0,,...,,,14.5,4.1,,,,3.5,28.8,1
1,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,23,MIL,NBA,C,82.0,,...,,,16.0,3.3,,,,3.2,31.7,1
2,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,24,MIL,NBA,C,81.0,,...,,,16.6,4.6,,,,2.9,34.8,1
3,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,25,MIL,NBA,C,76.0,,...,,,16.1,5.0,,,,2.7,30.2,1
4,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,26,MIL,NBA,C,81.0,,...,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0,1


In [23]:
df.head()

Unnamed: 0.1,Unnamed: 0,Player,href,Height,Season,Age,Tm,Lg,Pos,G,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,22,MIL,NBA,C,82.0,...,0.653,,,14.5,4.1,,,,3.5,28.8
1,1.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,23,MIL,NBA,C,82.0,...,0.69,,,16.0,3.3,,,,3.2,31.7
2,2.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,24,MIL,NBA,C,81.0,...,0.689,,,16.6,4.6,,,,2.9,34.8
3,3.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,25,MIL,NBA,C,76.0,...,0.713,,,16.1,5.0,,,,2.7,30.2
4,4.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,26,MIL,NBA,C,81.0,...,0.702,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0


In [24]:
# create new dataframe with each players data normalized
# against every season in their career
unique = df['player'].unique()
all_norm_df = pd.DataFrame()
for player in unique:
    pnormdf = pd.DataFrame()
    playerdf = df[df['player'] == player]
    years_in_league = np.array(range(1, len(playerdf)+1))
    for col in playerdf:    
        if playerdf[col].dtypes == np.float64:
            statdf = playerdf[col]
            try:
                normdf = pd.DataFrame({col:normalize(statdf)})
                pnormdf = pd.concat([pnormdf, normdf], axis=1)
            except:
                pass
    pnormdf['allstar'] = playerdf['allstar']
    pnormdf['player'] = player
    pnormdf['years_in_league'] = years_in_league
    all_norm_df = pd.concat([all_norm_df, pnormdf])
all_norm_df.head()

KeyError: 'player'

In [25]:
#rearrange order of dataframe so name and years_in_league are first columns
columns = list(all_norm_df.columns)
columns = ['player', 'years_in_league'] + columns
columns.pop(24)
columns.pop()
all_norm_df = all_norm_df[columns]
all_norm_df = all_norm_df.reset_index(drop=True)
all_norm_df.head()

Unnamed: 0,player,years_in_league,player.1,years_in_league.1,2p,2p%,2pa,3p,3p%,3pa,...,ft,ft%,fta,g,gs,mp,pf,pts,stl,tov
0,Kareem Abdul-Jabbar,1,Kareem Abdul-Jabbar,1,0.712871,0.322835,0.825,,,,...,0.934783,0.0,1.0,1.0,,0.948357,0.9,0.757085,,
1,Kareem Abdul-Jabbar,2,Kareem Abdul-Jabbar,2,0.871287,0.787402,0.85,,,,...,0.891304,0.284615,0.884058,1.0,,0.807512,0.6,0.874494,,
2,Kareem Abdul-Jabbar,3,Kareem Abdul-Jabbar,3,1.0,0.76378,1.0,,,,...,1.0,0.276923,0.985507,0.95,,1.0,0.3,1.0,,
3,Kareem Abdul-Jabbar,4,Kareem Abdul-Jabbar,4,0.861386,0.606299,0.9,,,,...,0.586957,0.461538,0.565217,0.7,,0.934272,0.1,0.813765,,
4,Kareem Abdul-Jabbar,5,Kareem Abdul-Jabbar,5,0.742574,0.488189,0.8,,,,...,0.434783,0.376923,0.434783,0.95,,0.981221,0.3,0.684211,,


In [6]:
# Some stats are not positive indicators such as turnovers
# so I want used the inverse of their values from the overall values
all_norm_df.tov = 1-all_norm_df.tov

In [7]:
# create individual normalized season sum column
s = [row[2:].sum() for i, row in all_norm_df.iterrows()]
season_sum = pd.DataFrame({'season_sum':s})

In [8]:
# create individual normalized season average column
# the average performance in each statistic
m = [row[2:].mean() for i, row in all_norm_df.iterrows()]
season_mean = pd.DataFrame({'season_mean':m})

In [9]:
statistic_df = pd.concat([all_norm_df[['player', 'years_in_league']],
                          season_sum, season_mean], axis=1)

In [10]:
statistic_df.head(10)

Unnamed: 0,player,years_in_league,season_sum,season_mean
0,Kareem Abdul-Jabbar,1,12.116465,0.712733
1,Kareem Abdul-Jabbar,2,13.603501,0.800206
2,Kareem Abdul-Jabbar,3,14.605082,0.859122
3,Kareem Abdul-Jabbar,4,12.360202,0.727071
4,Kareem Abdul-Jabbar,5,11.397593,0.670447
5,Kareem Abdul-Jabbar,6,11.613782,0.683164
6,Kareem Abdul-Jabbar,7,12.662827,0.744872
7,Kareem Abdul-Jabbar,8,11.865249,0.697956
8,Kareem Abdul-Jabbar,9,9.440216,0.555307
9,Kareem Abdul-Jabbar,10,11.432148,0.672479


In [11]:
statistic_df.describe()

Unnamed: 0,years_in_league,season_sum,season_mean
count,3956.0,3956.0,3885.0
mean,7.324065,11.29204,0.55258
std,4.371802,5.307444,0.203683
min,1.0,0.0,0.0
25%,4.0,7.339929,0.421777
50%,7.0,11.599877,0.588492
75%,10.0,15.777758,0.711796
max,21.0,22.690476,0.974768


In [12]:
# Show seasons in a player's career where they performed in top 75% 
# for all categories
print(len(statistic_df[statistic_df['season_mean'] > 0.75]))
statistic_df[statistic_df['season_mean'] > 0.75].head(20)

641


Unnamed: 0,player,years_in_league,season_sum,season_mean
1,Kareem Abdul-Jabbar,2,13.603501,0.800206
2,Kareem Abdul-Jabbar,3,14.605082,0.859122
42,Bill Russell,4,9.065811,0.755484
44,Bill Russell,6,10.405016,0.867085
58,Kobe Bryant,7,20.443757,0.81775
61,Kobe Bryant,10,20.822053,0.832882
62,Kobe Bryant,11,20.064829,0.802593
63,Kobe Bryant,12,19.502857,0.780114
74,Wilt Chamberlain,3,9.048562,0.754047
107,Tim Duncan,6,19.905438,0.796218


This process shows a bias towards older players as well as players who were less consistent through their careers. For instance, this line of thinking has led to results that ignore Michael Jordan and say Wilt Chamberlain had a single peak year. Perhaps by adding a measurement of a player's individual season against the entire sample, I can identify above average players from a historical sample and then identify how those seasons fit into their individual career narrative.

In [None]:
# Create 

In [13]:
hist_norm_df = pd.DataFrame()
for col in df:    
    if df[col].dtypes == np.float64:
        statdf = df[col]
        try:
            normdf = pd.DataFrame({col:normalize(statdf)})
        except:
            pass
    hist_norm_df = pd.concat([hist_norm_df, normdf], axis=1)


In [14]:
hist_norm_df = hist_norm_df.iloc[:, 7:-2]
hist_norm_df.head(20)

Unnamed: 0,pts,g,gs,mp,fg,fga,fg%,3p,3pa,3p%,...,fta,ft%,orb,drb,trb,ast,stl,blk,tov,pf
0,0.0,0.931034,,0.884368,0.57,0.554987,0.518,,,,...,0.535294,0.116607,,,0.533088,0.282759,,,,0.76087
1,0.496732,0.931034,,0.820128,0.65,0.565217,0.577,,,,...,0.488235,0.123214,,,0.588235,0.227586,,,,0.695652
2,0.568627,0.91954,,0.907923,0.715,0.626598,0.574,,,,...,0.529412,0.123036,,,0.610294,0.317241,,,,0.630435
3,0.901961,0.862069,,0.877944,0.645,0.585678,0.554,,,,...,0.358824,0.127321,,,0.591912,0.344828,,,,0.586957
4,0.830065,0.91954,,0.899358,0.585,0.544757,0.539,,,,...,0.305882,0.125357,,,0.533088,0.331034,,,,0.630435
5,0.594771,0.735632,,0.867238,0.625,0.613811,0.513,,,,...,0.388235,0.13625,,,0.514706,0.282759,,,,0.695652
6,0.718954,0.931034,,0.843683,0.555,0.529412,0.529,,,,...,0.458824,0.125536,,,0.621324,0.344828,,,,0.782609
7,1.0,0.931034,,0.749465,0.54,0.468031,0.579,,,,...,0.382353,0.125179,,,0.488971,0.268966,,,,0.695652
8,,0.701149,,0.743041,0.535,0.485934,0.55,,,,...,0.329412,0.139821,,,0.474265,0.296552,,,,0.630435
9,,0.908046,,0.807281,0.485,0.419437,0.577,,,,...,0.347059,0.131429,,,0.470588,0.372414,,,,0.630435


In [15]:
hist_norm_df['player'] = df.loc[:, 'player']
hist_norm_df['years_in_league'] = all_norm_df.loc[:, 'years_in_league']
hist_norm_df['allstar'] = all_norm_df.loc[:, 'allstar']

In [16]:
# There is some issue with the drb column which needs investigation
columns.remove('drb')
hist_norm_df = hist_norm_df[columns]

In [17]:
hist_norm_df.describe()

Unnamed: 0,years_in_league,2p,2p%,2pa,3p,3p%,3pa,allstar,ast,blk,...,fta,g,gs,mp,orb,pf,pts,stl,tov,trb
count,3956.0,3121.0,3128.0,3128.0,0.0,0.0,0.0,20.0,3852.0,0.0,...,3826.0,3859.0,0.0,3810.0,0.0,3882.0,8.0,0.0,0.0,3823.0
mean,7.324065,0.131996,0.040258,0.364614,,,,0.95,0.248503,,...,0.25926,0.776808,,0.629156,,0.548098,0.638889,,,0.228937
std,4.371802,0.084975,0.067686,0.17992,,,,0.223607,0.161677,,...,0.144887,0.19183,,0.168857,,0.16288,0.311182,,,0.14201
min,1.0,0.0,0.0,0.0,,,,0.0,0.0,,...,0.0,0.0,,0.0,,0.0,0.0,,,0.0
25%,4.0,0.07561,0.031586,0.219512,,,,1.0,0.131034,,...,0.147059,0.724138,,0.537473,,0.434783,0.550654,,,0.121324
50%,7.0,0.12439,0.033793,0.351916,,,,1.0,0.206897,,...,0.241176,0.850575,,0.674518,,0.543478,0.656863,,,0.191176
75%,10.0,0.17561,0.035931,0.494774,,,,1.0,0.331034,,...,0.347059,0.908046,,0.749465,,0.673913,0.848039,,,0.308824
max,21.0,1.0,1.0,1.0,,,,1.0,1.0,,...,1.0,1.0,,1.0,,1.0,1.0,,,1.0


In [18]:
all_norm_df.describe()

Unnamed: 0,years_in_league,2p,2p%,2pa,3p,3p%,3pa,allstar,ast,blk,...,fta,g,gs,mp,orb,pf,pts,stl,tov,trb
count,3956.0,3095.0,3095.0,3095.0,2223.0,2262.0,2440.0,20.0,3850.0,2762.0,...,3823.0,3859.0,2424.0,3672.0,2807.0,3882.0,3884.0,2745.0,2665.0,3705.0
mean,7.324065,0.560029,0.529906,0.556614,0.416117,0.533973,0.429453,0.95,0.527239,0.48487,...,0.526151,0.725836,0.620714,0.643166,0.50964,0.56193,0.573992,0.522915,0.470232,0.54612
std,4.371802,0.309553,0.299503,0.310799,0.346681,0.330134,0.345306,0.223607,0.308864,0.322675,...,0.313162,0.31843,0.373111,0.313725,0.314124,0.306047,0.311983,0.306359,0.308837,0.310339
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,0.322005,0.302023,0.306624,0.070197,0.272898,0.090404,1.0,0.285714,0.25,...,0.268336,0.586984,0.248904,0.440311,0.25,0.333333,0.340426,0.285714,0.217391,0.305556
50%,7.0,0.605634,0.55,0.602484,0.391304,0.589417,0.397674,1.0,0.548387,0.5,...,0.555556,0.863636,0.774421,0.742331,0.5,0.6,0.62251,0.533333,0.4375,0.583333
75%,10.0,0.81687,0.765862,0.811823,0.705882,0.804575,0.717531,1.0,0.77551,0.714286,...,0.78125,0.971429,0.961538,0.899273,0.75,0.8125,0.831528,0.75,0.714286,0.8
max,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
df

Unnamed: 0,player,href,height,season,age,tm,lg,pos,g,gs,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,allstar
0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,22,MIL,NBA,C,82.0,,...,,,14.5,4.1,,,,3.5,28.8,1
1,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,23,MIL,NBA,C,82.0,,...,,,16.0,3.3,,,,3.2,31.7,1
2,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,24,MIL,NBA,C,81.0,,...,,,16.6,4.6,,,,2.9,34.8,1
3,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,25,MIL,NBA,C,76.0,,...,,,16.1,5.0,,,,2.7,30.2,1
4,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,26,MIL,NBA,C,81.0,,...,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0,1
5,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,27,MIL,NBA,C,65.0,,...,3.0,11.0,14.0,4.1,1.0,3.3,,3.2,30.0,1
6,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,28,LAL,NBA,C,82.0,,...,3.3,13.5,16.9,5.0,1.5,4.1,,3.6,27.7,1
7,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,29,LAL,NBA,C,82.0,,...,3.2,10.0,13.3,3.9,1.2,3.2,,3.2,26.2,1
8,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,1977-78,30,LAL,NBA,C,62.0,,...,3.0,9.9,12.9,4.3,1.7,3.0,3.4,2.9,25.8,0
9,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,31,LAL,NBA,C,80.0,,...,2.6,10.2,12.8,5.4,1.0,4.0,3.5,2.9,23.8,1


In [28]:
df.head()


Unnamed: 0.1,Unnamed: 0,Player,href,Height,Season,Age,Tm,Lg,Pos,G,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,0.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,22,MIL,NBA,C,82.0,...,0.653,,,14.5,4.1,,,,3.5,28.8
1,1.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,23,MIL,NBA,C,82.0,...,0.69,,,16.0,3.3,,,,3.2,31.7
2,2.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,24,MIL,NBA,C,81.0,...,0.689,,,16.6,4.6,,,,2.9,34.8
3,3.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,25,MIL,NBA,C,76.0,...,0.713,,,16.1,5.0,,,,2.7,30.2
4,4.0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,26,MIL,NBA,C,81.0,...,0.702,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0


In [32]:
df.columns

Index(['Player', 'href', 'Height', 'Season', 'Age', 'Tm', 'Lg', 'Pos', 'G',
       'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%',
       'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'PTS'],
      dtype='object')