In [None]:
# Import necessary packages

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from scipy.stats import linregress
from operator import itemgetter

In [None]:
# Load the star and non-star player tables from their CSV files

dfStars, dfNoStars = (
    pd.read_csv(csvPath) for csvPath in ('stars-data.csv', 'no-stars-data.csv')
)

In [None]:
# Preview the first few rows of the non-star dataset (rendered as this cell's output)

dfNoStars.head()

In [None]:
# Tick formatter: show a raw dollar amount as whole millions with a $ prefix, e.g. 25000000 -> '$25M'

def millions(x, pos):
    """Format a dollar value as a whole-million label such as '$25M'.

    x is the value in dollars. pos is accepted so the function matches the
    (value, position) signature used by FuncFormatter; it is not used here.
    """
    valueInMillions = x * 1e-6
    return '$%1.0fM' % valueInMillions

In [None]:
# Summarise a simple linear regression of y on x as a one-line label (r, r-squared, p-value) for graph titles

def linStats(x, y):
    """Return a label such as "r = -0.447, r-squared = 0.2, p = 0.553".

    Runs scipy's linregress on x and y. r and r-squared are rounded to three
    decimals; the p-value is shown to three decimals, or as "p < 0.001" when
    it is smaller than that.
    """
    _, _, r, p, _ = linregress(x, y)

    # Very small p-values are reported as a bound instead of a rounded 0.000
    pLabel = ", p < 0.001" if p < 0.001 else ", p = %.3f" % p

    return "".join([
        "r = ", str(round(r, 3)),
        ", r-squared = ", str(round(r ** 2, 3)),
        pLabel,
    ])

In [None]:
# Create function that plots the linear trendline for the graph

def linTrend(x, y, ax = None):
    """Draw the least-squares straight line through (x, y) on a matplotlib axes.

    Parameters
    ----------
    x, y : equal-length numeric sequences
        Data the degree-1 polynomial is fitted to.
    ax : axes-like object, optional
        Object whose ``plot`` method draws the line. When omitted, the current
        pyplot axes (``plt.gca()``) is used instead of failing on ``None``.

    Returns
    -------
    Whatever ``ax.plot`` returns for the trendline.
    """
    if ax is None:
        ax = plt.gca()

    # Evaluate the fitted line only at the distinct, sorted x values
    uniqueX = np.unique(x)
    trend = np.poly1d(np.polyfit(x, y, 1))
    return ax.plot(uniqueX, trend(uniqueX))

In [None]:
# Scatter a stat against 2018-19 salary for the non-star players, add the fitted trendline and the
# regression summary as a subtitle, and save the figure as "<stat>-correl.png"

y = '2018-19 Salary'

plt.style.use('fivethirtyeight')

def plot(x):
    """Plot and save the correlation between stat x and the salary column y.

    x is the stat name in any case: it is upper-cased to look up the column in
    dfNoStars and for the labels, while the saved file name uses x exactly as
    it was passed in.
    """
    fig, ax = plt.subplots()

    statColumn = x.upper()
    statValues = dfNoStars[statColumn]
    salaries = dfNoStars[y]

    ax.scatter(statValues, salaries, color = 'orange')
    fig.suptitle("Correlation between %s and salary" % statColumn, weight = 'bold', size = 18, y = 1.007)
    ax.set_xlabel("2017-18 %s" % statColumn)
    ax.set_ylabel(y)

    ax.tick_params(axis = 'both', which = 'major', labelsize = 18)
    ax.axhline(y = 0, color = 'black', linewidth = 1.3, alpha = .7)

    linTrend(statValues, salaries, ax)

    # Show the salary axis in millions of dollars
    ax.yaxis.set_major_formatter(FuncFormatter(millions))

    # Regression summary (r, r-squared, p) goes under the main title
    regressionLabel = linStats(statValues, salaries)
    ax.set_title("%s" % regressionLabel, size = 14, fontname = 'Rockwell')

    # Footer: a rule line, then the site and data-source credits
    fig.text(x = -.1, y = -.1,
        s = '__________________________________________________________________',
        fontsize = 14, color = 'grey', horizontalalignment='left')

    fig.text(x = -.1, y = -.18,
        s = 'dribbleanalytics.blogspot.com                     Source: Basketball Reference',
        fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

    fig.savefig("%s-correl.png" % x, dpi = 400, bbox_inches = 'tight')

Note: all of these correlations are computed using `dfNoStars`, meaning the top 10 NBA players are excluded.

# Correlation between PPG, WS, BPM, VORP and salary

In [None]:
# Stats to correlate against salary; plot() upper-cases each name for the column lookup
plotList = ['ppg', 'ws', 'bpm', 'vorp']

for statName in plotList:
    plot(statName)

# Use average sum of stats to predict salary

In [None]:
# Create function that divides the total salary of the non-stars by their total of a stat to create a
# salary-per-stat constant, which is then multiplied by each star's stat to give a stat-adjusted star salary.
# The two values below are shared inputs that the function reads.

totalSalary = dfNoStars['2018-19 Salary'].sum()
starNames = dfStars.iloc[:, 2]  # third column of dfStars by position; presumably the player names (TODO confirm)

def salary(x, y):
    """Print and collect stat-adjusted salaries for the star players.

    A salary-per-stat constant is computed as totalSalary divided by the
    magnitude of the non-stars' total of column x. Each star's value of x is
    multiplied by that constant; the constant and every (name, adjusted
    salary) pair are printed.

    Parameters
    ----------
    x : str
        Column name looked up in both dfNoStars and dfStars.
    y : list
        Output list; the adjusted salaries are appended to it in the order
        the values appear in dfStars[x].

    Raises
    ------
    ValueError
        If the non-stars' total of x is zero, because no salary-per-stat
        constant can be derived from it.
    """
    totalStat = dfNoStars[x].sum()

    if totalStat == 0:
        raise ValueError("Total of %r for non-stars is zero; cannot compute a salary per stat" % x)

    # Divide by the magnitude so the constant stays positive even when the stat total is negative
    salaryPerStat = totalSalary / abs(totalStat)

    print("Salary per %s: %.3f" %(x, salaryPerStat))

    starSalaryStat = [statValue * salaryPerStat for statValue in dfStars[x]]

    for name, adjustedSalary in zip(starNames, starSalaryStat):
        print(name, adjustedSalary)

    y.extend(starSalaryStat)

In [None]:
# Create function that graphs the adjusted salary data given a list of the data, the title of the graph, the name to save
# the file, and various numbers of how high to move the data labels

xBar = np.arange(len(starNames))
plt.style.use('fivethirtyeight')

def salaryPlot(x, title, filename, textThreshold, textBottom, textTop):
    """Draw and save a bar chart of adjusted salaries, sorted from highest to lowest.

    Parameters
    ----------
    x : sequence of numbers
        Adjusted salaries, paired position-by-position with starNames.
    title : str
        Figure title.
    filename : str
        Name of the saved image without extension; written as "<filename>.png".
    textThreshold : number
        Bars shorter than this get their name label above the bar; bars at or
        above it get the label drawn starting at textBottom.
    textBottom : number
        y position of the label for bars at or above textThreshold.
    textTop : number
        Gap added above the bar height for labels of shorter bars.
    """
    # Pair each name with its value and rank by value, largest first
    ranked = sorted(zip(starNames, x), key = itemgetter(1), reverse = True)
    sortedNames = [name for name, _ in ranked]
    sortedData = [value for _, value in ranked]

    z, ax = plt.subplots()

    # One bar position per plotted value, so the function does not depend on the global xBar
    positions = np.arange(len(sortedData))
    ax.bar(positions, sortedData, edgecolor = 'white', linewidth = 3, color = 'skyblue')

    for rect, label in zip(ax.patches, sortedNames):
        height = rect.get_height()
        # A plain two-way choice ensures a bar exactly at the threshold is still labelled
        if height < textThreshold:
            labelY = height + textTop
        else:
            labelY = textBottom
        ax.text(rect.get_x() + rect.get_width() / 1.8, labelY, label,
                ha='center', va='bottom', rotation = 'vertical', color = 'black', size = 15)

    z.suptitle("%s" % title, weight = 'bold', size = 18, y = .95)
    formatter = FuncFormatter(millions)
    ax.yaxis.set_major_formatter(formatter)
    ax.xaxis.set_visible(False)
    ax.set_ylabel("Adjusted 2018-19 Salary")

    ax.tick_params(axis = 'both', which = 'major', labelsize = 18)
    ax.axhline(y = 0, color = 'black', linewidth = 4, alpha = .7)

    # Footer: a rule line, then the site and data-source credits
    z.text(x = -.12, y = -.01,
    s = '____________________________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left')

    z.text(x = -.1, y = -.08,
    s = 'dribbleanalytics.blogspot.com                     Source: Basketball Reference',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

    z.savefig("%s.png" % filename, dpi = 400, bbox_inches = 'tight')

In [None]:
# OBPM-adjusted salary: collect the stars' adjusted salaries, then chart them
obpmList = []
salary('OBPM', obpmList)

salaryPlot(
    obpmList,
    title = "OBPM-Adjusted Salary",
    filename = "obpm-salary",
    textThreshold = 40000000,
    textBottom = 5000000,
    textTop = 2000000,
)

In [None]:
# VORP-adjusted salary: collect the stars' adjusted salaries, then chart them
vorpList = []
salary('VORP', vorpList)

salaryPlot(
    vorpList,
    title = "VORP-Adjusted Salary",
    filename = "vorp-salary",
    textThreshold = 40000000,
    textBottom = 5000000,
    textTop = 2000000,
)

In [None]:
# BPM-adjusted salary: collect the stars' adjusted salaries, then chart them
bpmList = []
salary('BPM', bpmList)

salaryPlot(
    bpmList,
    title = "BPM-Adjusted Salary",
    filename = "bpm-salary",
    textThreshold = 40000000,
    textBottom = 5000000,
    textTop = 2000000,
)

# Use median sum of stats to predict salary

In the case of PPG, every player has a positive PPG, so the salary per PPG is very low. The same goes for WS (not all players have a positive WS, but most players who are not on rookie contracts do). Therefore, to assign a salary based on these stats, let's try using each player's distance from the median.

In [None]:
# Median WS and median PPG among the non-star players

medianWS, medianPPG = (dfNoStars[statColumn].median() for statColumn in ('WS', 'PPG'))

In [None]:
# Absolute distance of each player's WS from the non-star median WS
dfNoStars['WS Distance from Median'] = (dfNoStars['WS'] - medianWS).abs()
dfStars['WS Distance from Median'] = (dfStars['WS'] - medianWS).abs()

wsDistMed = []
salary('WS Distance from Median', wsDistMed)

salaryPlot(
    wsDistMed,
    title = "Distance from Median WS-Adjusted Salary",
    filename = "ws-dist-med-salary",
    textThreshold = 20000000,
    textBottom = 2000000,
    textTop = 2000000,
)

The above stats are simply the absolute distance from the median. So, someone who scores 5 points below the median PPG still gets a distance of +5, thereby inflating the stat. Let's see what happens if anyone who is below the median is assigned 0 instead.

In [None]:
# Subtract median WS from individual WS with a lower clip of 0, meaning all values are >= 0.
# Series.clip(lower=0) is used because Series.clip_lower was removed in pandas 1.0.
dfNoStars['WS >0 Distance from Median'] = (dfNoStars['WS'] - medianWS).clip(lower = 0)
dfStars['WS >0 Distance from Median'] = (dfStars['WS'] - medianWS).clip(lower = 0)

ws0Med = []

salary('WS >0 Distance from Median', ws0Med)

salaryPlot(ws0Med, "Median WS-Adjusted Salary (≥ 0)", "ws-greater-0-med-salary", 20000000, 4000000, 2000000)

In [None]:
# Signed difference between each player's WS and the non-star median WS
dfNoStars['WS - Median'] = dfNoStars['WS'].sub(medianWS)
dfStars['WS - Median'] = dfStars['WS'].sub(medianWS)

wsMinusMed = []
salary('WS - Median', wsMinusMed)

salaryPlot(
    wsMinusMed,
    title = "Median WS-Adjusted Salary",
    filename = "ws-med-salary",
    textThreshold = 20000000,
    textBottom = 10000000,
    textTop = 2000000,
)

In [None]:
# Absolute distance of each player's PPG from the non-star median PPG
dfNoStars['PPG Distance from Median'] = (dfNoStars['PPG'] - medianPPG).abs()
dfStars['PPG Distance from Median'] = (dfStars['PPG'] - medianPPG).abs()

ppgDistMed = []
salary('PPG Distance from Median', ppgDistMed)

salaryPlot(
    ppgDistMed,
    title = "Distance from Median PPG-Adjusted Salary",
    filename = "ppg-dist-med-salary",
    textThreshold = 20000000,
    textBottom = 2000000,
    textTop = 2000000,
)

In [None]:
# Subtract median PPG from individual PPG with a lower clip of 0, meaning all values are >= 0.
# Series.clip(lower=0) is used because Series.clip_lower was removed in pandas 1.0.

dfNoStars['PPG >0 Distance from Median'] = (dfNoStars['PPG'] - medianPPG).clip(lower = 0)
dfStars['PPG >0 Distance from Median'] = (dfStars['PPG'] - medianPPG).clip(lower = 0)

ppg0Med = []

salary('PPG >0 Distance from Median', ppg0Med)

salaryPlot(ppg0Med, "Median PPG-Adjusted Salary (≥ 0)", "ppg-greater-0-med-salary", 20000000, 3000000, 2000000)

Now let's see what happens when the salary is instead adjusted using the salary per (stat - median stat), keeping the sign of the difference.

In [None]:
# Signed difference between each player's PPG and the non-star median PPG
dfNoStars['PPG - Median'] = dfNoStars['PPG'].sub(medianPPG)
dfStars['PPG - Median'] = dfStars['PPG'].sub(medianPPG)

ppgMinusMed = []
salary('PPG - Median', ppgMinusMed)

salaryPlot(
    ppgMinusMed,
    title = "Median PPG-Adjusted Salary",
    filename = "ppg-med-salary",
    textThreshold = 20000000,
    textBottom = 10000000,
    textTop = 2000000,
)