# Pre-Processing

In [1]:
import pandas as pd
import numpy as np

In [2]:
basketball_data_preprocessing = pd.read_csv('basketball_data_preprocessing.csv')
basketball_data_preprocessing.drop(["Unnamed: 0"], axis="columns", inplace=True)

In [3]:
basketball_data_preprocessing

Unnamed: 0,Season,Age,Tm,Pos,G,GS,MP,FG,FGA,FG%,...,BLK,TOV,PF,PTS,Experience,Draft Selection Pick,Country or State,US or International,FT%,PTS per 21m
0,1996-97,20,VAN,PF,80,71,35.0,6.9,15.2,0.453,...,1.0,2.8,2.5,18.7,1,3,georgia,US,0.738462,11.220000
1,1997-98,21,VAN,SF,82,82,36.0,8.0,16.4,0.485,...,0.9,3.1,2.5,22.3,2,3,georgia,US,0.782051,13.008333
2,1998-99,22,VAN,SF,50,50,40.4,7.7,17.9,0.432,...,1.1,3.7,2.7,23.0,3,3,georgia,US,0.840909,11.955446
3,1999-00,23,VAN,SF,82,82,39.3,7.2,15.6,0.465,...,1.1,3.0,3.0,20.3,4,3,georgia,US,0.805970,10.847328
4,2000-01,24,VAN,SF,81,81,40.0,7.5,15.8,0.472,...,1.0,2.9,2.9,20.5,5,3,georgia,US,0.833333,10.762500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,2017-18,20,LAL,C,43,0,9.5,1.4,2.8,0.500,...,0.3,0.6,1.1,3.7,2,32,bosnia and herzegovina,International,0.750000,8.178947
11051,2018-19,21,TOT,C,59,37,17.6,3.6,6.4,0.559,...,0.9,1.2,2.3,8.9,3,32,bosnia and herzegovina,International,0.809524,10.619318
11052,2018-19,21,LAL,C,33,12,15.6,3.4,5.8,0.580,...,0.8,1.0,2.2,8.5,4,32,bosnia and herzegovina,International,0.850000,11.442308
11053,2018-19,21,LAC,C,26,25,20.2,3.8,7.2,0.538,...,0.9,1.4,2.5,9.4,5,32,bosnia and herzegovina,International,0.739130,9.772277


In [4]:
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

In [5]:
basketball_data_preprocessing.shape

(11055, 33)

## Train/Test Split

In [6]:
#Decided to go with a 70/30 split
len(basketball_data_preprocessing) * .7, len(basketball_data_preprocessing) * .3

(7738.499999999999, 3316.5)

In [10]:
X = basketball_data_preprocessing[['Experience', 'FG%', 'FT%', 'GS', 'MP', 'Age']] #These are my independent variables from the EDA
y = basketball_data_preprocessing['PTS'] #Points is my Dependent Variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 47)

In [11]:
X_train.shape, X_test.shape

((7738, 6), (3317, 6))

In [13]:
y_train.shape, y_test.shape

((7738,), (3317,))

In [14]:
X_train.dtypes

Experience      int64
FG%           float64
FT%           float64
GS              int64
MP            float64
Age             int64
dtype: object

In [15]:
X_test.dtypes

Experience      int64
FG%           float64
FT%           float64
GS              int64
MP            float64
Age             int64
dtype: object

## Initial Not-Even-A-Model

In [16]:
#Calculate the mean of `y_train`
train_mean = y_train.mean()
train_mean

8.567549754458517

In [17]:
#Fit the dummy regressor on the training data
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)
dumb_reg.constant_

array([[8.56754975]])

## Metrics

### R-squared

In [18]:
def r_squared(y, ypred):
    """R-squared score.
    
    Calculate the R-squared, or coefficient of determination, of the input.
    
    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    ybar = np.sum(y) / len(y) #yes, we could use np.mean(y)
    sum_sq_tot = np.sum((y - ybar)**2) #total sum of squares error
    sum_sq_res = np.sum((y - ypred)**2) #residual sum of squares error
    R2 = 1.0 - sum_sq_tot / sum_sq_res
    return R2

In [19]:
y_tr_pred_ = train_mean * np.ones(len(y_train))
y_tr_pred_[:5]

array([8.56754975, 8.56754975, 8.56754975, 8.56754975, 8.56754975])

In [20]:
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred[:5]

array([8.56754975, 8.56754975, 8.56754975, 8.56754975, 8.56754975])

The DummyRegressor produces exactly the same results and saves you having to mess about broadcasting the mean.

In [21]:
r_squared(y_train, y_tr_pred)

0.0

In [22]:
y_te_pred = train_mean * np.ones(len(y_test))
r_squared(y_test, y_te_pred)

0.00035784804212313226

### Mean Absolute Error

In [23]:
def mae(y, ypred):
    """Mean absolute error.
    
    Calculate the mean absolute error of the arguments

    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    abs_error = np.abs(y - ypred)
    mae = np.mean(abs_error)
    return mae

In [24]:
mae(y_train, y_tr_pred)

4.827450593256362

In [25]:
mae(y_test, y_te_pred)

4.756843844998154

### Mean Squared Error

In [26]:
def mse(y, ypred):
    """Mean square error.
    
    Calculate the mean square error of the arguments

    Arguments:
    y -- the observed values
    ypred -- the predicted values
    """
    sq_error = (y - ypred)**2
    mse = np.mean(sq_error)
    return mse

In [27]:
mse(y_train, y_tr_pred)

35.95576140389565

In [28]:
mse(y_test, y_te_pred)

34.89064868595298

In [29]:
np.sqrt([mse(y_train, y_tr_pred), mse(y_test, y_te_pred)])

array([5.99631232, 5.90683068])

## Sklearn Metrics

### R-squared

In [30]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.0, -0.0003579761431851125)

### Mean absolute error

In [31]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(4.827450593256362, 4.756843844998154)

### Mean squared error

In [32]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(35.95576140389565, 34.89064868595298)

In [33]:
# train set - sklearn
# correct order, incorrect order
r2_score(y_train, y_tr_pred), r2_score(y_tr_pred, y_train)

(0.0, -1.1394835631327831e+31)

In [34]:
# test set - sklearn
# correct order, incorrect order
r2_score(y_test, y_te_pred), r2_score(y_te_pred, y_test)

(-0.0003579761431851125, -1.1057287937275145e+31)

In [35]:
# train set - using our homebrew function
# correct order, incorrect order
r_squared(y_train, y_tr_pred), r_squared(y_tr_pred, y_train)

(0.0, 1.0)

In [36]:
# test set - using our homebrew function
# correct order, incorrect order
r_squared(y_test, y_te_pred), r_squared(y_te_pred, y_test)

(0.00035784804212313226, 1.0)

## Initial Models

In [37]:
X_defaults_median = X_train.median()
X_defaults_median

Experience     5.000
FG%            0.439
FT%            0.750
GS            11.000
MP            20.500
Age           26.000
dtype: float64

In [38]:
X_tr = X_train.fillna(X_defaults_median)
X_te = X_test.fillna(X_defaults_median)

### Scale the data

In [39]:
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


### Train the model on the train split

In [41]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [42]:
#Call the `predict()` method of the model (`lm`) on both the (scaled) train and test data
#Assign the predictions to `y_tr_pred` and `y_te_pred`, respectively
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

In [43]:
# r^2 - train, test
median_r2 = r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)
median_r2

(0.8169982674098941, 0.817525213270828)

In [44]:
# MAE - train, test
median_mae = mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)
median_mae

(1.85799260410833, 1.8332919385822923)

In [45]:
#And also do the same using `sklearn`'s `mean_squared_error`
# MSE - train, test
median_mse = mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)
median_mse

(6.57996663350936, 6.364385379679774)

### Do the same thing with the mean

In [46]:
X_defaults_mean = X_train.mean()
X_defaults_mean

Experience     6.582967
FG%            0.437925
FT%            0.726573
GS            25.066167
MP            20.893073
Age           26.183122
dtype: float64

In [47]:
_tr = X_train.fillna(X_defaults_mean)
X_te = X_test.fillna(X_defaults_mean)

In [48]:
scaler = StandardScaler()
scaler.fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [49]:
lm = LinearRegression().fit(X_tr_scaled, y_train)

In [50]:
y_tr_pred = lm.predict(X_tr_scaled)
y_te_pred = lm.predict(X_te_scaled)

In [51]:
r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred)

(0.8169982674098941, 0.8174726379595427)

In [52]:
mean_absolute_error(y_train, y_tr_pred), mean_absolute_error(y_test, y_te_pred)

(1.85799260410833, 1.834234740106355)

In [53]:
mean_squared_error(y_train, y_tr_pred), mean_squared_error(y_test, y_te_pred)

(6.57996663350936, 6.36621910996369)

### I have now created dummy and indicator features for categorical variables, standardized the magnitude of numeric features, and split my data into testing and training datasets. The next step I will make predictive machine learning models to come to our conclusion. 