In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from cleaning import df

In [2]:
# I am using min-max feature scaling normalization because
# I want to compare player's stats relative to their apex.
def normalize(ls):
    '''Performs min-max feature scaling normalization on a sequence'''
    return [(num - min(ls))/(max(ls)-min(ls)) for num in ls]

In [3]:
df.head()

Unnamed: 0,player,href,height,season,age,tm,lg,pos,g,gs,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,allstar
0,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,22,MIL,NBA,C,82.0,,...,,,14.5,4.1,,,,3.5,28.8,1
1,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,23,MIL,NBA,C,82.0,,...,,,16.0,3.3,,,,3.2,31.7,1
2,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,24,MIL,NBA,C,81.0,,...,,,16.6,4.6,,,,2.9,34.8,1
3,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,25,MIL,NBA,C,76.0,,...,,,16.1,5.0,,,,2.7,30.2,1
4,Kareem Abdul-Jabbar,/players/a/abdulka01.html,86,,26,MIL,NBA,C,81.0,,...,3.5,11.0,14.5,4.8,1.4,3.5,,2.9,27.0,1


In [4]:
# create new dataframe with each players data normalized
# against every season in their career
unique = df['player'].unique()
all_norm_df = pd.DataFrame()
for player in unique:
    pnormdf = pd.DataFrame()
    playerdf = df[df['player'] == player]
    years_in_league = np.array(range(1, len(playerdf)+1))
    for col in playerdf:    
        if playerdf[col].dtypes == np.float64:
            statdf = playerdf[col]
            try:
                normdf = pd.DataFrame({col:normalize(statdf)})
                pnormdf = pd.concat([pnormdf, normdf], axis=1)
            except:
                pass
    pnormdf['player'] = player
    pnormdf['years_in_league'] = years_in_league
    all_norm_df = pd.concat([all_norm_df, pnormdf])
all_norm_df.head()

Unnamed: 0,2p,2p%,2pa,3p,3p%,3pa,ast,blk,drb,efg%,...,gs,mp,orb,pf,player,pts,stl,tov,trb,years_in_league
0,0.712871,0.322835,0.825,,,,0.704545,,,0.333333,...,,0.948357,,0.9,Kareem Abdul-Jabbar,0.757085,,,0.806452,1
1,0.871287,0.787402,0.85,,,,0.522727,,,0.790698,...,,0.807512,,0.6,Kareem Abdul-Jabbar,0.874494,,,0.927419,2
2,1.0,0.76378,1.0,,,,0.818182,,,0.767442,...,,1.0,,0.3,Kareem Abdul-Jabbar,1.0,,,0.975806,3
3,0.861386,0.606299,0.9,,,,0.909091,,,0.612403,...,,0.934272,,0.1,Kareem Abdul-Jabbar,0.813765,,,0.935484,4
4,0.742574,0.488189,0.8,,,,0.863636,,,0.496124,...,,0.981221,,0.3,Kareem Abdul-Jabbar,0.684211,,,0.806452,5


In [5]:
#rearrange order of dataframe so name and years_in_league are first columns
columns = list(all_norm_df.columns)
columns = ['player', 'years_in_league'] + columns
columns.pop(23)
columns.pop()
all_norm_df = all_norm_df[columns]
all_norm_df = all_norm_df.reset_index(drop=True)
all_norm_df.head()

Unnamed: 0,player,years_in_league,2p,2p%,2pa,3p,3p%,3pa,ast,blk,...,fta,g,gs,mp,orb,pf,pts,stl,tov,trb
0,Kareem Abdul-Jabbar,1,0.712871,0.322835,0.825,,,,0.704545,,...,1.0,1.0,,0.948357,,0.9,0.757085,,,0.806452
1,Kareem Abdul-Jabbar,2,0.871287,0.787402,0.85,,,,0.522727,,...,0.884058,1.0,,0.807512,,0.6,0.874494,,,0.927419
2,Kareem Abdul-Jabbar,3,1.0,0.76378,1.0,,,,0.818182,,...,0.985507,0.95,,1.0,,0.3,1.0,,,0.975806
3,Kareem Abdul-Jabbar,4,0.861386,0.606299,0.9,,,,0.909091,,...,0.565217,0.7,,0.934272,,0.1,0.813765,,,0.935484
4,Kareem Abdul-Jabbar,5,0.742574,0.488189,0.8,,,,0.863636,,...,0.434783,0.95,,0.981221,,0.3,0.684211,,,0.806452


In [6]:
# Some stats are not positive indicators such as turnovers
# so I want used the inverse of their values from the overall values
all_norm_df.tov = 1-all_norm_df.tov

In [7]:
# create individual normalized season sum column
s = [row[2:].sum() for i, row in all_norm_df.iterrows()]
season_sum = pd.DataFrame({'season_sum':s})

In [8]:
# create individual normalized season average column
# the average performance in each statistic
m = [row[2:].mean() for i, row in all_norm_df.iterrows()]
season_mean = pd.DataFrame({'season_mean':m})

In [9]:
statistic_df = pd.concat([all_norm_df[['player', 'years_in_league']],
                          season_sum, season_mean], axis=1)

In [10]:
statistic_df.head(10)

Unnamed: 0,player,years_in_league,season_sum,season_mean
0,Kareem Abdul-Jabbar,1,11.116465,0.694779
1,Kareem Abdul-Jabbar,2,12.603501,0.787719
2,Kareem Abdul-Jabbar,3,13.605082,0.850318
3,Kareem Abdul-Jabbar,4,11.360202,0.710013
4,Kareem Abdul-Jabbar,5,10.397593,0.64985
5,Kareem Abdul-Jabbar,6,10.613782,0.663361
6,Kareem Abdul-Jabbar,7,11.662827,0.728927
7,Kareem Abdul-Jabbar,8,10.865249,0.679078
8,Kareem Abdul-Jabbar,9,9.440216,0.590014
9,Kareem Abdul-Jabbar,10,10.432148,0.652009


In [11]:
statistic_df.describe()

Unnamed: 0,years_in_league,season_sum,season_mean
count,3956.0,3956.0,3885.0
mean,7.324065,11.287237,0.552472
std,4.371802,5.308588,0.203693
min,1.0,0.0,0.0
25%,4.0,7.336977,0.421777
50%,7.0,11.57779,0.588327
75%,10.0,15.777758,0.711209
max,21.0,22.690476,0.974768


In [12]:
# Show seasons in a player's career where they performed in top 75% 
# for all categories
print(len(statistic_df[statistic_df['season_mean'] > 0.75]))
statistic_df[statistic_df['season_mean'] > 0.75].head(20)

641


Unnamed: 0,player,years_in_league,season_sum,season_mean
1,Kareem Abdul-Jabbar,2,12.603501,0.787719
2,Kareem Abdul-Jabbar,3,13.605082,0.850318
42,Bill Russell,4,9.065811,0.755484
44,Bill Russell,6,10.405016,0.867085
58,Kobe Bryant,7,20.443757,0.81775
61,Kobe Bryant,10,20.822053,0.832882
62,Kobe Bryant,11,20.064829,0.802593
63,Kobe Bryant,12,19.502857,0.780114
74,Wilt Chamberlain,3,9.048562,0.754047
107,Tim Duncan,6,19.905438,0.796218


This process shows a bias towards older players as well as players who were less consistent through their careers. For instance, this line of thinking has led to results that ignore Michael Jordan and say Wilt Chamberlain had a single peak year. Perhaps by adding a measurement of a player's individual season against the entire sample, I can identify above average players from a historical sample and then identify how those seasons fit into their individual career narrative.