# Non-Parametric Gaussian Kernel Density Estimation Fantasy Football Draft Assist Tool

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import mixture_rvs

# Part 1 - Data Import and Merging

First, I got the raw position data from the following sources:

https://fantasy.espn.com/football/players/projections?leagueFormatId=1

https://www.fantasysharks.com/apps/Projections/SeasonProjections.php?pos=ALL

https://www.fantasypros.com/nfl/rankings/consensus-cheatsheets.php?loggedin=&my-experts=ALL

https://fftoolbox.fulltimefantasy.com/football/rankings/index.php?noppr=true

https://fantasy.nfl.com/research/projections?position=O&sort=projectedPts&statCategory=projectedStats&statSeason=2020&statType=seasonProjectedStats#researchProjections=researchProjections%2C%2Fresearch%2Fprojections%253Fposition%253DO%2526statCategory%253DprojectedStats%2526statSeason%253D2020%2526statType%253DseasonProjectedStats%2526statWeek%253D1%2Creplace

I also got Yahoo!'s draft position list from:
https://football.fantasysports.yahoo.com/f1/826195/1/editprerank

The scores and ratings from each source are all saved within this repository as individual .xlsx files in the `/raw_data` directory.

Next, I compiled and merged the data using the on player names from the Yahoo! draft list as the primary keys (since I'm going to index DAsHA on the player names)

In [None]:
# load Yahoo! draft positions
df = pd.read_excel('./raw_data/Yahoo_2020_Draft_Positions.xlsx')
df['Name'] = df['Name'].str.strip() # strip the names so they can be directly compared to other lists

# load Fantasy Pros Projections
df_fantasy_pros = pd.read_excel('./raw_data/Fantasy_Pros_2020_proj.xlsx')

# construct temporary data frame to merge relevant info
df_temp = pd.DataFrame()
df_temp['Name'] = df_fantasy_pros['PLAYER'].str.strip()
df_temp['fp_pts'] = df_fantasy_pros['FAN PTS']

# merge df with temporary df
df = pd.merge(df, df_temp, on='Name', how ='outer')

# import NFL projections
df_nfl = pd.read_excel('./raw_data/NFL_2020_proj.xlsx')
df_temp = pd.DataFrame()
df_temp['Name'] = df_nfl['PLAYER'].str.strip()
df_temp['nfl_pts'] = df_nfl['POINTS']

df = pd.merge(df, df_temp, on='Name', how ='outer')

# import ESPN projections
df_espn = pd.read_excel('./raw_data/ESPN_2020_proj.xlsx')

df_temp = pd.DataFrame()
df_temp['Name'] = df_espn['PLAYER'].str.strip()
df_temp['espn_pts'] = df_espn['TOT']

df = pd.merge(df, df_temp, on='Name', how ='outer')

# import Fantasy Shark projections
df_fantasy_shark = pd.read_excel('./raw_data/Fantasy_Shark_2020_proj.xlsx')
df_temp = pd.DataFrame()
df_temp['Name'] = df_fantasy_shark['Name'].str.strip()
df_temp['fs_pts'] = df_fantasy_shark['Fantasy Points']

df = pd.merge(df, df_temp, on='Name', how ='outer')

# import Sports Illustrated projections
df_si = pd.read_excel('./raw_data/Sports_Illustrated_2020_proj.xlsx')

df_temp = pd.DataFrame()
df_temp['Name'] = df_si['PLAYER'].str.strip()
df_temp['si_pts'] = df_si['POINTS']

df = pd.merge(df, df_temp, on='Name', how ='outer')

# Finally, drop the players not available in Yahoo!
df = df.dropna(subset=['Draft Position'])

# Part 2 - Gaussian Kernel Density Estimation

Now for the non-parametric fun!

I created a Gaussian kernel density estimation for each player using the projected points from each source (NFL, ESPN, Fantasy Sharks, etc.) for the kernels.

In [None]:
# create an empty column to store point projections for each player
df['pts'] = np.nan

# create an empty column to store the inverse cumulative density function data for each player's kde estimation
df['kde_icdf'] = [[] for _ in range(len(df))]

The following block of code generates the non-parametric estimations for each player and stores: (1) the median value from the KDE as their projected points and (2) the inverse CDF as an array in kde_icdf so we can generate the non-parametric confidence intervals for the draft tool.

This might take a minute or two to run.  I can't be bothered to speed it up.

In [31]:
# start from index 67 because the first 66 entries in my df are contextual cruft and not training data
for j in range(66,len(df)):
    # this block constructs an array of training data for each player of the different fantasy point estimates. This array is used to generate the KDE.
    training_data = []
    training_data.append(df['fp_pts'][j])
    training_data.append(df['nfl_pts'][j])
    training_data.append(df['espn_pts'][j])
    training_data.append(df['fs_pts'][j])
    training_data.append(df['si_pts'][j])
    training_data = [x for x in training_data if str(x) != 'nan'] # clean the training_data for NaN values - which interfere with the analysis below
    if len(training_data) == 0:
        training_data = [0]
    
    # this sets up and runs the non-parametric estimation
    kde = sm.nonparametric.KDEUnivariate(training_data)
    # Estimate the densities
    kde.fit(kernel='gau', bw='silverman', fft=False)

    # This for loop processes to find the median projection
    ci_50 = 0 # initialize median value to zero
    for i in range(0, len(kde.cdf)):
        if kde.cdf[i] < 0.50:
            i+=1
        if kde.cdf[i] >= 0.50:
            ci_50 = kde.support[i]
            break

    # Add data to main dataframe
    df['pts'][j]=ci_50 # add median projection
    df['kde_icdf'][j]=kde.icdf # add icdf for whisker plot construction
    
# export the dataset to a .csv file so we don't have to run the code above again (it's time consuming!)
df.to_csv(r'./prepared_data/2020_ffl_df.csv')

RuntimeError: Selected KDE bandwidth is 0. Cannot estimate density. Either provide the bandwidth during initialization or use an alternative method.

Below, I included an example of how the KDE estimation works.  `j` represents an arbitrary draft from which we can generate a Gaussian KDE probability density function for the player corresponding to that draft rank.

The various point estimates for that player (from ESPN, SI, ets.) are indicated by the red '+'s at the bottom of the graph. What KDE does is to center a normal gaussian distribution (with area = $\frac{1}{n}$ for n point estimates) over each of thes point estimates. Then, to generate the probabilty density function, we sum all of these "kernels" together - this summation is the orange line in the graph below.

Then, the following block of code generates a histogram from the KDE PDF showing the 1- and 2- standard deviation confidence intervals.

In [None]:
j = 125

training_data = []
training_data.append(df['fp_pts'][j])
training_data.append(df['nfl_pts'][j])
training_data.append(df['espn_pts'][j])
training_data.append(df['fs_pts'][j])
training_data.append(df['si_pts'][j])
training_data = [x for x in training_data if str(x) != 'nan']
if len(training_data) == 0:
    training_data = [0]
    
# this sets up and runs the non-parametric estimation
kde = sm.nonparametric.KDEUnivariate(training_data)
kde.fit(kernel='gau', bw='silverman', fft=False) # Estimate the densities


# Note: some plots for the kde visualizations that I'm turning off for the working loops
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(111)

# Plot the histrogram
ax.hist(training_data, bins=5, density=True, label='Histogram from forecasts',
        zorder=5, edgecolor='k', alpha=0.5)
# Plot the KDE as fitted using the default arguments
ax.plot(kde.support, kde.density, lw=3, label='KDE from projections', zorder=10)
    
# Plot the samples
ax.scatter(training_data, np.abs(np.random.randn(len(training_data)))/100000,marker='+', color='red', zorder=20, label='Point Forecasts', alpha=0.5)
ax.legend(loc='best')
ax.grid(True, zorder=-5)

Example for confidence interval plots generated from the KDE PDF above.

In [None]:
box_plot_data=kde.icdf
plt.boxplot(box_plot_data, vert=False, labels=[df['Name'][j]])
plt.show()