## Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import random

import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold



  from pandas.core import datetools


## Set Options

In [2]:
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline
sns.set_style('darkgrid')

random.seed(129)

# Combined Data

## Load Data

In [3]:
# Load the cleaned data. The context manager closes the file handle as soon as
# the frame is read instead of leaving it to the garbage collector.
# NOTE: only unpickle files from a trusted source; pickle can run arbitrary code.
with open('./clean_data/good_gk.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

FileNotFoundError: [Errno 2] No such file or directory: './clean_data/good_gk.pkl'

## Inspect Data

In [None]:
df.info()

In [None]:
pd.concat([df.head(5), df.tail(5)], axis=0)

In [None]:
df.sample(10, random_state=129)

## Fix Integer Column Dtypes

### Rename, Reorder, and Drop Columns

In [None]:
# Give the columns whose names contain spaces short, attribute-friendly names,
# then add the natural log of the salary as an extra column.
short_names = {
    'Base Salary': 'Salary',
    'Guaranteed Compensation': 'Total',
    'First Name': 'First',
    'Last Name': 'Last',
}
df.rename(columns=short_names, inplace=True)
df['Log_Salary'] = df['Salary'].map(np.log)

In [None]:
# idx = (df['GP'] == 0)
# df['Wpct'][~idx] = (df['W'][~idx] + df['T'][~idx] * 0.5) / df[~idx][['W', 'L', 'T']].sum(axis=1) * 100

In [None]:
# Weighted sum of the W, L and T columns with weights 3, 0 and 1
# (presumably wins / losses / ties, i.e. league points — confirm).
# The L term contributes 0 but still propagates a missing L value into Pts.
df['Pts'] = df['W']*3 + df['L']*0 + df['T']*1

In [None]:
# df = df.drop(columns=['W', 'L', 'T', 'Wpct'])

In [None]:
df[df.isnull().any(axis=1)]

In [None]:
df.head()

## Targeting Salary

In [None]:
# Columns for the salary models. Order matters: the cells below slice by
# position (3+ for plots, 4 as the target, 5+ as predictors).
salary_cols = [
    'Player', 'Club', 'POS', 'Salary', 'Log_Salary',
    'Year', 'GP', 'GS', 'MINS', 'SHTS', 'SV', 'GA', 'GAA',
    'ShO', 'SvPct', 'W', 'L', 'T',
]
df2 = df[salary_cols]

In [None]:
# df2 = df2[df2.Salary < 1e6]

In [None]:
df2.head()

In [None]:
# Distribution of the raw salary, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.distplot(df2['Salary'], ax=ax)
ax.xaxis.set_label_text('Salary ($)')

In [None]:
# Distribution of the log-transformed salary, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.distplot(df2['Log_Salary'], ax=ax)
ax.xaxis.set_label_text('Log[Salary ($)]')

In [None]:
# Pairwise correlations of the numeric columns. The string identifier columns
# are excluded explicitly: DataFrame.corr() raises on non-numeric data in
# pandas 2.x instead of silently dropping it.
corr2 = df2.select_dtypes(include=[np.number]).corr()

In [None]:
# Annotated heatmap of corr2 on a fixed [-1, 1] colour scale centred at 0.
plt.figure(num=None, figsize=(12, 8))
sns.heatmap(corr2, center=0, cmap=sns.diverging_palette(10, 133, sep=80, n=20), vmin=-1, vmax=1, annot=True)
# plt.title('Correlation Heatmap')

In [None]:
# Scatter matrix (regression fits, KDE diagonals) of the columns from
# position 3 on. pairplot builds its own figure, so no plt.figure() call is
# needed, and the title is placed on that figure rather than on a single subplot.
grid = sns.pairplot(df2.iloc[:, 3:], diag_kind="kde", kind='reg')
grid.fig.suptitle('Pairplot Matrix', y=1.02)

### Statsmodels

In [None]:
# Setup the data: predictors are the columns from position 5 on, the target
# is the column at position 4 (Log_Salary).
# .copy() makes X an independent frame, so adding the intercept column does
# not trigger SettingWithCopyWarning on a slice of df2.
X = df2.iloc[:, 5:].copy()
X['Intercept'] = 1.0
y = df2.iloc[:, 4]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=129)

# Create the model; hasconst=True matches the explicit 'Intercept' column above
model = sm.OLS(y_train, X_train, hasconst=True)
# Fit the model to the training set
fit = model.fit()
# Print summary statistics of the model's performance
fit.summary()

In [None]:
fit.resid.plot(style='o', figsize=(15,10))

### Sklearn

In [None]:
def rsquared_adj(model, X, y, n_features=None):
    """Return the adjusted R^2 of ``model`` evaluated on ``(X, y)``.

    Computed as ``1 - (1 - R^2) * (n - 1) / (n - p - 1)`` where ``R^2`` is
    ``model.score(X, y)``, ``n`` is ``len(y)`` and ``p`` is the number of
    predictors.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``score(X, y)``.
    X : array-like with a 2-D ``shape``
        Evaluation inputs; ``X.shape[1]`` is the default predictor count.
    y : sized
        Evaluation targets.
    n_features : int, optional
        Predictor count to use instead of ``X.shape[1]``, e.g. when ``model``
        is a pipeline that expands or drops columns before the regressor.

    Returns
    -------
    float
        The adjusted R^2, or NaN when ``n - p - 1 <= 0``, where the statistic
        is undefined (the plain formula would divide by zero or flip sign).
    """
    n_obs = len(y)
    n_pred = X.shape[1] if n_features is None else n_features
    resid_dof = n_obs - n_pred - 1
    if resid_dof <= 0:
        return float('nan')
    return 1 - (1 - model.score(X, y)) * (n_obs - 1) / resid_dof

In [None]:
# Ridge regression with built-in alpha selection on the degree-1 feature expansion.
r_est = make_pipeline(PolynomialFeatures(1, interaction_only=True), RidgeCV(fit_intercept=True, normalize=True))
r_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', r_est.score(X_test, y_test),
      ', rsq_adj:', rsquared_adj(r_est, X_test, y_test))
print('alpha:', r_est.named_steps['ridgecv'].alpha_)

In [None]:
# Lasso regression with built-in alpha selection on the degree-1 feature expansion.
l_est = make_pipeline(PolynomialFeatures(1, interaction_only=True), LassoCV(fit_intercept=True, normalize=True))
l_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', l_est.score(X_test, y_test),
      ', rsq_adj:', rsquared_adj(l_est, X_test, y_test))
print('alpha:', l_est.named_steps['lassocv'].alpha_)

In [None]:
# Elastic net (l1_ratio=0.975) with built-in alpha selection on the degree-1 feature expansion.
e_est = make_pipeline(PolynomialFeatures(1, interaction_only=True), ElasticNetCV(l1_ratio=0.975, fit_intercept=True, normalize=True))
e_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', e_est.score(X_test, y_test),
      ', rsq_adj:', rsquared_adj(e_est, X_test, y_test))
print('alpha:', e_est.named_steps['elasticnetcv'].alpha_)

In [None]:
# Calculate Residuals (predicted minus actual).
# The predictions are given y's index so the subtraction pairs each prediction
# with its own row; with a default RangeIndex pandas would align by label and
# could mismatch rows or produce NaN whenever y's index is not 0..n-1.
yhat = pd.Series(e_est.predict(X), index=y.index, name='Predicted')
residuals = yhat - y

In [None]:
# Plot Fit: predicted vs. actual log-salary with a fitted regression line.
sns.set_style('darkgrid')
plt.figure(num=None, figsize=(15, 10))
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
ax = sns.regplot(x=yhat, y=y,
                 line_kws={'color':'red'},
                 scatter_kws={'alpha':1, 's':10})
ax.set_xlabel('Predicted Salary [Log($)]')
ax.set_ylabel('Actual Salary [Log($)]')
ax.set_title('Predicted vs. Actual Salary')

In [None]:
# Joint distribution of actual vs. predicted log-salary with a regression fit.
# jointplot builds its own figure; calling plt.figure() first would only leave
# an empty figure behind. x/y are passed by keyword, which current seaborn requires.
grid = sns.jointplot(x=y, y=yhat, kind="reg",
                     line_kws={'color':'red'},
                     scatter_kws={'alpha':1, 's':10});


In [None]:
# Bivariate KDE of actual vs. predicted log-salary.
# jointplot builds its own figure; calling plt.figure() first would only leave
# an empty figure behind. x/y are passed by keyword, which current seaborn requires.
sns.jointplot(x=y, y=yhat, kind='kde')


In [None]:
# Regression plot of predicted against actual log-salary.
plt.figure(num=None, figsize=(15, 10))
# regplot has no `kind` argument (that belongs to jointplot) and raises
# TypeError when given one, so it is called without it.
sns.regplot(x=y, y=yhat)


In [None]:
# Residual view of predicted vs. actual log-salary.
# jointplot returns a JointGrid and builds its own figure; calling plt.figure()
# first would only leave an empty figure behind.
grid = sns.jointplot(x=y, y=yhat, kind='resid',
                     line_kws={'color':'red'},
                     scatter_kws={'alpha':1, 's':10})
grid.set_axis_labels(ylabel='Residuals', xlabel='Actual Salary [Log($)]')

In [None]:
np.exp(1)

In [None]:
# Plot Residuals against their row position.
plt.figure(num=None, figsize=(15, 10))

# Positions 0..n-1 along the x axis.
idx = np.arange(len(residuals))
plt.scatter(idx, residuals)
plt.title('Prediction Residuals')

## Log-Log Modeling

In [None]:
# Identifier and salary columns; the log-transformed predictors are added next.
# .copy() gives df3 its own data so those column assignments are not made on
# a slice of df (SettingWithCopyWarning).
df3 = df[['Player', 'Club', 'POS', 'Salary', 'Log_Salary']].copy()

In [None]:
# Work on an independent copy so the assignments below are not made on a
# slice of df (SettingWithCopyWarning).
df3 = df3.copy()
# log(1 + x) keeps zero values finite; np.log1p is the vectorised form of
# np.log(x + 1) and avoids a Python-level call per element.
for col in df2.columns[5:]:
    df3['Log_' + col] = np.log1p(df[col])

In [None]:
df3.head()

In [None]:
df3[df3.isnull().any(axis=1)]

In [None]:
# Pairwise correlations of the numeric columns. The string identifier columns
# are excluded explicitly: DataFrame.corr() raises on non-numeric data in
# pandas 2.x instead of silently dropping it.
corr3 = df3.select_dtypes(include=[np.number]).corr()

In [None]:
# Heatmap of corr3 on a fixed [-1, 1] colour scale centred at 0.
plt.figure(num=None, figsize=(15, 10))
sns.heatmap(corr3, center=0, cmap=sns.diverging_palette(10, 220, sep=80, n=20), vmin=-1, vmax=1)
plt.title('Correlation Heatmap')

In [None]:
# Scatter matrix (regression fits, KDE diagonals) of the columns from
# position 3 on. pairplot builds its own figure, so no plt.figure() call is
# needed, and the title is placed on that figure rather than on a single subplot.
grid = sns.pairplot(df3.iloc[:, 3:], diag_kind="kde", kind='reg')
grid.fig.suptitle('Pairplot Matrix', y=1.02)

### Statsmodels

In [None]:
# Setup the data: log-transformed predictors (position 5 on) against the
# column at position 4 (Log_Salary).
X = df3.iloc[:, 5:]
y = df3.iloc[:, 4]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=129)

# hasconst=True makes statsmodels compute every statistic as if the design
# matrix contained a constant. X has none, so one is added for the OLS fit
# only; X_train itself stays as-is for the scikit-learn models, which fit
# their own intercept.
model = sm.OLS(y_train, sm.add_constant(X_train), hasconst=True)
# Fit the model to the training set
fit = model.fit()
# Print summary statistics of the model's performance
fit.summary()

### Sklearn

In [None]:
# Ridge with built-in alpha selection on pairwise-interaction features.
r_est = make_pipeline(PolynomialFeatures(2, interaction_only=True), RidgeCV(fit_intercept=True, normalize=True))
r_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', r_est.score(X_test, y_test), ', rsq_adj:', rsquared_adj(r_est, X_test, y_test))

In [None]:
# Lasso with built-in alpha selection on pairwise-interaction features.
l_est = make_pipeline(PolynomialFeatures(2, interaction_only=True), LassoCV(fit_intercept=True, normalize=True))
l_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', l_est.score(X_test, y_test), ', rsq_adj:', rsquared_adj(l_est, X_test, y_test))

In [None]:
# Elastic net (l1_ratio=0.975) with built-in alpha selection on pairwise-interaction features.
e_est = make_pipeline(PolynomialFeatures(2, interaction_only=True), ElasticNetCV(l1_ratio=0.975, fit_intercept=True, normalize=True))
e_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', e_est.score(X_test, y_test), ', rsq_adj:', rsquared_adj(e_est, X_test, y_test))

## Combo Modeling

In [None]:
# Combine the raw-stat frame and the log-stat frame column-wise. Both were cut
# from df and share its index, so an index-aligned concat keeps exactly one
# row per original row. Merging on the five shared columns instead would
# duplicate rows whenever that key combination repeats (the keys carry no Year).
shared_cols = ['Player', 'Club', 'POS', 'Salary', 'Log_Salary']
df4 = pd.concat([df2, df3.drop(columns=shared_cols)], axis=1)

### Statsmodels

In [None]:
# Setup the data: raw and log-transformed predictors (position 5 on) against
# the column at position 4 (Log_Salary).
X = df4.iloc[:, 5:]
y = df4.iloc[:, 4]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=129)

# hasconst=True makes statsmodels compute every statistic as if the design
# matrix contained a constant. X has none, so one is added for the OLS fit
# only; X_train itself stays as-is for the scikit-learn models, which fit
# their own intercept.
model = sm.OLS(y_train, sm.add_constant(X_train), hasconst=True)
# Fit the model to the training set
fit = model.fit()
# Print summary statistics of the model's performance
fit.summary()

### Sklearn

In [None]:
# Ridge with built-in alpha selection on pairwise-interaction features.
r_est = make_pipeline(PolynomialFeatures(2, interaction_only=True), RidgeCV(fit_intercept=True, normalize=True))
r_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', r_est.score(X_test, y_test), ', rsq_adj:', rsquared_adj(r_est, X_test, y_test))

In [None]:
# Lasso with built-in alpha selection on pairwise-interaction features.
l_est = make_pipeline(PolynomialFeatures(2, interaction_only=True), LassoCV(fit_intercept=True, normalize=True))
l_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', l_est.score(X_test, y_test), ', rsq_adj:', rsquared_adj(l_est, X_test, y_test))

In [None]:
# Elastic net (l1_ratio=0.975) with built-in alpha selection on pairwise-interaction features.
e_est = make_pipeline(PolynomialFeatures(2, interaction_only=True), ElasticNetCV(l1_ratio=0.975, fit_intercept=True, normalize=True))
e_est.fit(X_train, y_train)
# Held-out scores; a bare score() call mid-cell would be computed and thrown away.
print('rsq:', e_est.score(X_test, y_test), ', rsq_adj:', rsquared_adj(e_est, X_test, y_test))

In [None]:
df.head()

In [None]:
# Setup the data
# NOTE(review): the target is df.iloc[:, 0] (whatever df's first column is) and
# the predictors are every df column from position 5 on. df's column order is
# not visible in this notebook — confirm the first column is the intended
# numeric target and that the predictor slice contains only numeric columns.
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 5:], df.iloc[:, 0], test_size=0.3, random_state=129)

# Create your model
model = sm.OLS(y_train, X_train)
# Fit your model to your training set
fit = model.fit()
# Print summary statistics of the model's performance
fit.summary()

In [None]:
# Setup the data
# NOTE(review): predictors and target are taken from df by position (5+ and 4).
# 'Log_Salary' and 'Pts' were assigned to df earlier in the notebook, so they
# presumably sit at the end of df and fall inside the 5+ predictor slice —
# confirm the target column and check for target leakage.
X = df.iloc[:, 5:]
# X['Intercept'] = np.ones((len(X), 1))
y = df.iloc[:, 4]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=129)

# Create your model (no intercept column is added, matching hasconst=False)
model = sm.OLS(y_train, X_train, hasconst=False)
# Fit your model to your training set
fit = model.fit()
# Print summary statistics of the model's performance
fit.summary()

In [None]:
fit.resid.plot(style='o', figsize=(12,8));

In [None]:
# Log-log OLS without an intercept: target is the column at position 4,
# predictors are the columns from position 5 on.
y = df3.iloc[:, 4]
X = df3.iloc[:, 5:]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=129
)

# Build and fit on the training split, then show the summary table.
model = sm.OLS(y_train, X_train, hasconst=False)
fit = model.fit()
fit.summary()

In [None]:
fit.resid.plot(style='o', figsize=(12,8));

In [None]:
est = make_pipeline(PolynomialFeatures(2, interaction_only=True), LinearRegression())
est.fit(X_train, y_train)
est.score(X_test, y_test)

In [None]:
# Least squares without an intercept. `normalize` is left at its default
# (False); passing it explicitly fails on scikit-learn releases that removed
# the argument.
lr = LinearRegression(fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Ridge with built-in alpha selection, no intercept. `normalize` is left at
# its default (False); passing it explicitly fails on scikit-learn releases
# that removed the argument.
lr = RidgeCV(fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Lasso with built-in alpha selection, no intercept. `normalize` is left at
# its default (False); passing it explicitly fails on scikit-learn releases
# that removed the argument.
lr = LassoCV(fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Elastic net (l1_ratio=0.025) with built-in alpha selection, no intercept.
# `normalize` is left at its default (False); passing it explicitly fails on
# scikit-learn releases that removed the argument.
lr = ElasticNetCV(l1_ratio=0.025, fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

### Sandbox

In [None]:
df3.head()

In [None]:
# # Setup the data
# # X = df3.iloc[:, 5:]
# X = sm.add_constant(df[['GS', 'MINS', 'ShO']])
# # y = df3.iloc[:, 0]
# # y = df3['Salary']
# y = df3['Log_Salary']

# X = df3.iloc[:, 6]
# X = sm.add_constant(df3.iloc[:, 6])

# NOTE(review): df3 as built in the Log-Log section holds Player, Club, POS,
# Salary, Log_Salary plus 13 'Log_*' columns (18 in total). It therefore has
# no 'GP' or 'ShO' column and no position 18, so this cell fails on a fresh
# kernel. It presumably targeted a different frame (possibly df) — confirm
# the intended source frame and column positions.
# NOTE(review): this rebinds df4, which earlier held the combined frame.
df4 = df3[df3.GP > 0]
X = sm.add_constant(df4.iloc[:, list(np.arange(8,16)) + [17,18]])
y = df4['ShO'] #.map(lambda x: np.log(x+1))

In [None]:
# Heatmap of corr3 on a fixed [-1, 1] colour scale centred at 0.
plt.figure(num=None, figsize=(15, 10))
sns.heatmap(corr3, center=0, cmap=sns.diverging_palette(10, 220, sep=80, n=20), vmin=-1, vmax=1)

In [None]:
# 70/30 split of the sandbox design matrix and target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=129
)

# OLS on the training split; X already carries the constant added by
# sm.add_constant, hence hasconst=True.
model = sm.OLS(y_train, X_train, hasconst=True)
fit = model.fit()
fit.summary()

In [None]:
fit.resid.plot(style='o', figsize=(12,8));

In [None]:
lr = LinearRegression(fit_intercept=True, normalize=True)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
lr.score(X_train, y_train)

In [None]:
lr.coef_

In [None]:
lr = RidgeCV(fit_intercept=True, normalize=True)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
lr.coef_

In [None]:
# Lasso with built-in alpha selection, no intercept. `normalize` is left at
# its default (False); passing it explicitly fails on scikit-learn releases
# that removed the argument.
lr = LassoCV(fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
X.columns

In [None]:
lr.coef_

In [None]:
# Elastic net (l1_ratio=0.95) with built-in alpha selection, no intercept.
# `normalize` is left at its default (False); passing it explicitly fails on
# scikit-learn releases that removed the argument.
lr = ElasticNetCV(l1_ratio=0.95, fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
yhat = pd.Series(fit.predict(X_test), name='Pred_ShO')

plt.figure(num=None, figsize=(15, 10))
plt.scatter(yhat, y_test)
plt.xlabel('yhat')
plt.ylabel('ytest')

In [None]:
sns.pairplot(X.join(y), diag_kind="kde", kind='reg')

In [None]:
g = sns.PairGrid(X.join(y)) 
g.map_upper(sns.regplot) 
g.map_lower(sns.residplot) 
g.map_diag(sns.kdeplot, lw=3, legend=False) 

In [None]:
# Predicted vs. held-out target with a fitted regression line.
plt.figure(num=None, figsize=(15, 10))
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
sns.regplot(x=yhat, y=y_test);

In [None]:
corr3 = X.join(y).corr()
corr3

In [None]:
sns.pairplot(X.join(y))

In [None]:
# R^2 of ridge, lasso and elastic-net fits over a log-spaced grid of penalty
# strengths. Each alpha is fitted directly with the plain estimator: wrapping
# a single alpha in RidgeCV/LassoCV/ElasticNetCV runs a 10-fold search with
# nothing to choose between and then refits this same full-data model.
# NOTE: scores are computed on the data the models were fitted on, so they
# measure goodness of fit, not generalisation.
alphas = np.logspace(-9, 9, num=19)
rscores = np.zeros(len(alphas))
lscores = np.zeros(len(alphas))
escores = np.zeros(len(alphas))
for ii, a in enumerate(alphas):
    # Ridge
    rscores[ii] = Ridge(alpha=a, fit_intercept=True).fit(X, y).score(X, y)
    # Lasso
    lscores[ii] = Lasso(alpha=a, fit_intercept=True).fit(X, y).score(X, y)
    # ElasticNet
    escores[ii] = ElasticNet(alpha=a, l1_ratio=0.5, fit_intercept=True).fit(X, y).score(X, y)

In [None]:
np.array([alphas, rscores, lscores, escores]).T

In [None]:
# Plot R^2 against alpha (log-scaled x axis) for the three penalised models.
plt.figure(figsize=(8,6))
plt.semilogx(alphas, rscores, color='g', label='Ridge')
plt.semilogx(alphas, lscores, color='b', label='Lasso')
plt.semilogx(alphas, escores, color='r', label='ElasticNet')
# Clip the y axis to [0, 1]; scores below 0 are not shown.
plt.ylim((0.0, 1e0))
plt.ylabel('R^2')
plt.xlabel('alpha')
plt.title('Regularized Regression Comparison')
plt.legend(loc='lower left')

In [None]:
lr = Ridge(alpha=100, fit_intercept=True, normalize=True)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Map each design-matrix column name to its fitted coefficient.
coef = {name: lr.coef_[pos] for pos, name in enumerate(X.columns)}

In [None]:
coef

In [None]:
# Lasso at a fixed alpha, no intercept. `normalize` is left at its default
# (False); passing it explicitly fails on scikit-learn releases that removed
# the argument.
lr = Lasso(alpha=0.01, fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

In [None]:
# Elastic net at a fixed alpha and l1_ratio, no intercept. `normalize` is left
# at its default (False); passing it explicitly fails on scikit-learn releases
# that removed the argument.
lr = ElasticNet(alpha=0.01, l1_ratio=0.5, fit_intercept=False)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)