In [10]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [11]:
csv_path = '2019.csv'

In [12]:
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Player,Tm,Pos,Age,G,GS,Cmp,Att,Yds,...,FumblesLost,PassingYds,PassingTD,PassingAtt,RushingYds,RushingTD,RushingAtt,ReceivingYds,ReceivingTD,FantasyPoints
0,0,Christian McCaffrey,CAR,RB,23.0,16.0,16.0,0.0,2.0,0.0,...,0.0,0.0,0.0,2.0,1387.0,15.0,287.0,1005.0,4.0,469.2
1,1,Lamar Jackson,BAL,QB,22.0,15.0,15.0,265.0,401.0,3127.0,...,2.0,3127.0,36.0,401.0,1206.0,7.0,176.0,0.0,0.0,415.68
2,2,Derrick Henry,TEN,RB,25.0,15.0,15.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,1540.0,16.0,303.0,206.0,2.0,294.6
3,3,Aaron Jones,GNB,RB,25.0,16.0,16.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1084.0,16.0,236.0,474.0,3.0,314.8
4,4,Ezekiel Elliott,DAL,RB,24.0,16.0,16.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1357.0,12.0,301.0,420.0,2.0,311.7


In [13]:
df = df.loc[df['Pos'] == 'RB', ['Player', 'Tgt', 'RushingAtt', 'FantasyPoints']]
df['Usage'] = df['Tgt'] + df['RushingAtt']

In [14]:
df.head(5)

Unnamed: 0,Player,Tgt,RushingAtt,FantasyPoints,Usage
0,Christian McCaffrey,142.0,287.0,469.2,429.0
2,Derrick Henry,24.0,303.0,294.6,327.0
3,Aaron Jones,68.0,236.0,314.8,304.0
4,Ezekiel Elliott,71.0,301.0,311.7,372.0
5,Dalvin Cook,63.0,250.0,292.4,313.0


In [15]:
df['UsageRank'] = df['Usage'].rank(ascending = False)
df['FantasyPointsRank'] = df['FantasyPoints'].rank(ascending = False)
df.sort_values(by = 'UsageRank').head(15)

Unnamed: 0,Player,Tgt,RushingAtt,FantasyPoints,Usage,UsageRank,FantasyPointsRank
0,Christian McCaffrey,142.0,287.0,469.2,429.0,1.0,1.0
4,Ezekiel Elliott,71.0,301.0,311.7,372.0,2.0,3.0
28,Leonard Fournette,100.0,265.0,259.4,365.0,3.0,7.0
8,Nick Chubb,49.0,298.0,255.2,347.0,4.0,8.0
2,Derrick Henry,24.0,303.0,294.6,327.0,5.0,5.0
17,Chris Carson,47.0,278.0,232.6,325.0,6.0,12.0
62,Le'Veon Bell,78.0,245.0,213.0,323.0,7.5,16.0
22,Joe Mixon,45.0,278.0,225.4,323.0,7.5,13.0
5,Dalvin Cook,63.0,250.0,292.4,313.0,9.0,6.0
3,Aaron Jones,68.0,236.0,314.8,304.0,10.0,2.0


In Machine Learning (talking about supervised machine learning here), there are two types of models:

those that deal with continuous outputs (For example, fantasy points, weight, stock price) which are classified as Regression models 

those that deal with classification (For example, is an email spam or not spam is a classic classification problem).

A simple linear regression model works by predicting a continuous output using one input or "feature".

Correlation is a value normalized from covariance and is always somewhere between -1 and 1. Two variables that are perfectly positively related to each other have a correlation coefficient of 1. This means that when one goes up, the other goes up too, always.

Two variables that are perfectly negatively correlated have a correlation coefficient of -1. One goes up, the other goes down, always.

Two variables that have a correlation coefficient of 0 have no correlation whatsoever.

In [24]:
def covariance(x, y):
    n = len(x)
    return sum((x - np.mean(x)) * (y - np.mean(y))) * 1/(n-1)

Ok so we use the len of x( for this to work numpy needs to be in arrays the same way that pandas needs to be in data frames)

So this formula is saying loop through each pair of points for x, and y and for each pair of points, subtract the mean of the x array from x, and subtract the mean of the y array from y. Then take those two values, and multiply them together and set them to the side. At the end sum all of these values up and divide the value we get back by the length of the sample minus 1.

he function we wrote above is the Python/Numpy representation of this function. We used a numpy array instead of a Python list as numpy allows us to do stuff like this really easily.

In [25]:
x - np.mean(x)

array([ 3.32686275e+02,  2.30686275e+02,  2.07686275e+02,  2.75686275e+02,
        2.16686275e+02,  2.50686275e+02,  1.43686275e+02,  1.34686275e+02,
        2.28686275e+02,  1.93686275e+02,  2.26686275e+02,  1.75686275e+02,
        2.68686275e+02,  1.72686275e+02,  1.45686275e+02,  1.71686275e+02,
        1.67686275e+02,  1.41686275e+02,  1.75686275e+02,  6.26862745e+01,
        2.26686275e+02,  1.80686275e+02,  1.64686275e+02,  1.70686275e+02,
        1.57686275e+02,  1.20686275e+02,  1.15686275e+02,  1.37686275e+02,
        6.56862745e+01,  9.26862745e+01,  9.56862745e+01,  7.06862745e+01,
        5.76862745e+01,  5.16862745e+01,  4.86862745e+01,  5.56862745e+01,
        4.46862745e+01,  8.16862745e+01,  3.66862745e+01,  8.56862745e+01,
        3.86862745e+01,  4.36862745e+01,  5.26862745e+01,  7.16862745e+01,
        4.86862745e+01,  8.56862745e+01,  3.16862745e+01,  6.68627451e+00,
        1.36862745e+01,  9.68627451e+00, -9.31372549e+00, -1.53137255e+01,
       -2.03137255e+01, -

In [26]:
x = df['Usage'].values
y = df['FantasyPoints'].values

In [28]:
x

array([429., 327., 304., 372., 313., 347., 240., 231., 325., 290., 323.,
       272., 365., 269., 242., 268., 264., 238., 272., 159., 323., 277.,
       261., 267., 254., 217., 212., 234., 162., 189., 192., 167., 154.,
       148., 145., 152., 141., 178., 133., 182., 135., 140., 149., 168.,
       145., 182., 128., 103., 110., 106.,  87.,  81.,  76.,  87., 112.,
        75.,  92., 123., 112.,  60.,  95.,  51.,  80.,  53.,  82.,  62.,
        92.,  32.,  26.,  73.,  93.,  86.,  57.,  98.,  96.,  59.,  44.,
        94.,  54.,  27.,  24.,  74.,  28.,  71.,  36.,  44.,  48.,  25.,
        47.,  40.,  33.,  49.,  35.,  45.,  31.,  32.,  13.,  35.,  29.,
        12.,  25.,  16.,  12.,  27.,   6.,  13.,  12.,  12.,  11.,  12.,
         8.,   2.,  11.,  22.,   3.,  13.,  24.,   9.,  17.,   5.,  21.,
        10.,  34.,  10.,  14.,  19.,  16.,  13.,  11.,  12.,   7.,   6.,
         5.,   6.,   5.,   8.,   6.,   8.,   8.,   3.,   4.,   5.,   2.,
         3.,   2.,   1.,   4.,   4.,   5.,   2.,   

In [29]:
y

array([4.6920e+02, 2.9460e+02, 3.1480e+02, 3.1170e+02, 2.9240e+02,
       2.5520e+02, 3.0900e+02, 2.3850e+02, 2.3260e+02, 2.4410e+02,
       2.2540e+02, 2.1740e+02, 2.5940e+02, 1.9160e+02, 2.1670e+02,
       2.4852e+02, 1.7930e+02, 2.1220e+02, 1.9770e+02, 1.6520e+02,
       2.1300e+02, 1.7040e+02, 1.5320e+02, 1.5260e+02, 1.9760e+02,
       1.8080e+02, 1.6630e+02, 1.4700e+02, 1.9820e+02, 1.5720e+02,
       1.4790e+02, 1.3540e+02, 1.4550e+02, 1.4110e+02, 1.5400e+02,
       1.4630e+02, 1.4150e+02, 1.1450e+02, 1.1140e+02, 1.4220e+02,
       1.1860e+02, 9.2600e+01, 1.2190e+02, 1.6390e+02, 1.0330e+02,
       9.4900e+01, 8.5000e+01, 1.0110e+02, 1.0590e+02, 8.7200e+01,
       9.6900e+01, 8.2800e+01, 7.5300e+01, 1.0140e+02, 6.8400e+01,
       5.9100e+01, 6.7200e+01, 1.0440e+02, 5.7900e+01, 6.7800e+01,
       9.3600e+01, 5.7400e+01, 8.3800e+01, 5.0000e+01, 8.2800e+01,
       5.4000e+01, 6.6200e+01, 4.6900e+01, 5.0900e+01, 4.9500e+01,
       4.3200e+01, 6.6300e+01, 7.5300e+01, 5.1800e+01, 6.8400e

In [27]:
def corr(x, y):
    return covariance(x, y)/(np.std(x) * np.std(y))
corr(x, y)

0.967330979222704