In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score
from sklearn import tree

In [None]:
#Question: Which defensive players in the NFL are worth their contract?

In [161]:
#Project Problems 1 - 4
#Use webscrapping to grab player stats and their salaries
players = pd.read_html('https://www.pro-football-reference.com/years/2020/defense.htm')
salary = pd.read_html('https://www.pro-football-reference.com/players/salary.htm')
#Cleaning the data to look nice
players = pd.DataFrame(players[0])
salary = pd.DataFrame(salary[0])
players.columns = players.columns.droplevel()
del players['Rk']
salary = salary.drop(['Rk','Tm', 'Pos'], axis = 1) 
#Combine the data sets
df = players.join(salary.set_index('Player'), on='Player')
df = df.fillna(0)
pd.set_option('display.max_columns', None)
df['Pos'] = df['Pos'].str.upper()
df = df[df.G != 'G']
df = df[df.Pos != 'QB']
df['Salary'] = df['Salary'].str.replace('$','')
df['Salary'] = df['Salary'].str.replace(',','')
d = {'TD': ['int_td', 'fum_td']}
df = df.rename(columns=lambda c: d[c].pop(0) if c in d.keys() else c)
#Create new features and a response variable called individual points a popular stat used to measure a players worth
df.iloc[:, 4:] = df.iloc[:, 4:].astype(float)
your_array = pd.to_numeric(df['Salary'])
df['Salary'] = np.rint(your_array)
df['tot_TD'] = df['int_td'] + df['fum_td']
df['ip'] = df['G'] + (5*df['GS']) + df['Sk'] + (4*df['FR']) + (4*df['Int']) + df['Comb'] + (5*df['tot_TD'])
salary = df['Salary']
del df['Salary']
df['Salary'] = salary
df.head()

Unnamed: 0,Player,Tm,Age,Pos,G,GS,Int,Yds,int_td,Lng,PD,FF,Fmb,FR,Yds.1,fum_td,Sk,Comb,Solo,Ast,TFL,QBHits,Sfty,tot_TD,ip,Salary
0,Roquan Smith,CHI,23,ILB,12,12,0,0,0,0,5,1,0,0,0,0,2,110,77,33,15,3,0,0,184,750000.0
1,Zach Cunningham,HOU,26,ILB,12,12,0,0,0,0,1,0,0,0,0,0,3,119,76,43,4,4,0,0,194,1074783.0
2,Jordan Poyer,BUF,29,SS,12,12,2,14,0,14,4,1,0,0,0,0,2,100,76,24,4,4,0,0,182,1525000.0
3,Budda Baker,ARI,24,SS,11,11,2,90,0,90,4,1,0,0,0,0,2,94,73,21,6,3,0,0,170,1000000.0
4,Devin White,TAM,22,ILB,12,12,0,0,0,0,2,1,0,1,2,0,5,109,70,39,9,10,0,0,190,675000.0


In [184]:
#Problems 5 - 7
#EDA and machine learning
df.sort_values(by=['Salary'], ascending = False).head(10)
df.sort_values(by=['ip'], ascending = False).head(10)
#We will be trying to predict salary, we will be using ip (Individual Points)as a
#measurement to see which players are worth the big bucks
df = df.dropna()
df.iloc[:, 4:] = df.iloc[:, 4:].astype(float)
X = df.iloc[:,:25]
y = df['Salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.1)

class ColumnExtractor(TransformerMixin):
    
    def __init__(self, cols):
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        Xcols = X[self.cols]
        return Xcols
    
#  Numerical features
numeric_features = X.select_dtypes(exclude='object').columns

#  Categorical features
categorical_features = X.select_dtypes('object').columns

pipeline = Pipeline([
    ('features', FeatureUnion([
        ('numeric', Pipeline([
            ('extract', ColumnExtractor(numeric_features)),
            ('impute', SimpleImputer()),
            ('scale', StandardScaler())
        ])),
        ('categorical', Pipeline([
            ('extract', ColumnExtractor(categorical_features)),
            ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encode', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])),
    ]))
])

#Linear Regression
full_lm_pipeline = Pipeline([('pipeline', pipeline),
                         ('lm', LinearRegression())])
full_lm_pipeline.fit(X_train, y_train)
yhat_lm = full_lm_pipeline.predict(X_test)
#Decision Trees
full_tree_pipeline = Pipeline([('pipeline', pipeline),
                         ('tree', tree.DecisionTreeRegressor(min_samples_leaf=16))])
full_tree_pipeline.fit(X_train, y_train)
yhat_tree = full_tree_pipeline.predict(X_test)
#Model Evaluation Linear
mse_lm = mean_squared_error(y_test, yhat_lm)
np.sqrt(mse_lm)
mean_absolute_error(y_test, yhat_lm)
#Model Evaluation Trees
mse_t = mean_squared_error(y_test, yhat_tree)
np.sqrt(mse_t)
mean_absolute_error(y_test, yhat_tree)
#Since the mse and mae for Decision Trees were much lower than the Linear model I decided to 
#do the predictions only for the decision trees

#Predictions Trees
final_tree_predictions = full_tree_pipeline.predict(df)
yhat_tree = np.rint(yhat_tree)
trees = pd.DataFrame({'Salary':y_test, 'TreesPredicted':yhat_tree})


Unnamed: 0,Salary,TreesPredicted
273,681000.0,2413428.0
631,610000.0,657806.0
14,5171000.0,2608301.0
32,3000000.0,2608301.0
592,825000.0,1206125.0


In [223]:
#8 
#Because the question is pretty vague, I want to look at 3 plyaers with recent new contracts and see
#what the model says
final = df.join(trees.set_index('Salary'), on='Salary', how='inner')
final = final.drop_duplicates('Player')
final.loc[final['Player'] == "Stephon Gilmore"]
final.loc[final['Player'] == "Chris Jones"]
final.loc[final['Player'] == "Jadeveon Clowney"]

#9
#In looking through the data I noticed that a lot of nfl players have the same contracts. Especially when it comes 
#to rookie contracts, most deals are the same, so sometimes I think the model would predict based on similar contracts
#of players had instead of looking at the stats

Unnamed: 0,Player,Tm,Age,Pos,G,GS,Int,Yds,int_td,Lng,PD,FF,Fmb,FR,Yds.1,fum_td,Sk,Comb,Solo,Ast,TFL,QBHits,Sfty,tot_TD,ip,Salary,TreesPredicted
426,Jadeveon Clowney,TEN,27,DE,8.0,8.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,19.0,14.0,5.0,4.0,6.0,0.0,0.0,67.0,5250000.0,2413428.0
