In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy.stats import f
from itertools import combinations

In [2]:
# Read the prostate cancer table and split it by the `train` flag column:
# rows marked 'T' go to df_train, rows marked 'F' go to df_valid.
df = pd.read_csv("../data/Prostate Cancer.txt")
names = [
    'lcavol', 'lweight', 'age', 'lbph',
    'svi', 'lcp', 'gleason', 'pgg45',
]
train_mask = df['train'] == 'T'
valid_mask = df['train'] == 'F'
df_train = df.loc[train_mask]
df_valid = df.loc[valid_mask]

# Predictors become 2-D arrays with one column per entry of `names`;
# the response `lpsa` is selected with a list so it stays a 2-D single column.
X = df_train[names].values
y = df_train[['lpsa']].values
X_valid = df_valid[names].values
y_valid = df_valid[['lpsa']].values

# TABLE 3.1. Correlation of predictors in the prostate cancer data
#            (computed over all rows, not only the training ones).
df[names].corr()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45
lcavol,1.0,0.280521,0.225,0.02735,0.538845,0.67531,0.432417,0.433652
lweight,0.280521,1.0,0.347969,0.442264,0.155385,0.164537,0.056882,0.107354
age,0.225,0.347969,1.0,0.350186,0.117658,0.127668,0.268892,0.276112
lbph,0.02735,0.442264,0.350186,1.0,-0.085843,-0.006999,0.07782,0.07846
svi,0.538845,0.155385,0.117658,-0.085843,1.0,0.673111,0.320412,0.457648
lcp,0.67531,0.164537,0.127668,-0.006999,0.673111,1.0,0.51483,0.631528
gleason,0.432417,0.056882,0.268892,0.07782,0.320412,0.51483,1.0,0.751905
pgg45,0.433652,0.107354,0.276112,0.07846,0.457648,0.631528,0.751905,1.0


In [3]:
# PAGE 51. Predicting every validation response with the mean training value
#          of lpsa gives a test error of 1.057, the "base error rate".
train_mean = np.mean(y)
# Constant prediction with the same shape as the validation response.
y_valid_hat = np.full(y_valid.shape, train_mean)
baseline_mse = mean_squared_error(y_valid, y_valid_hat)
print('Baseline MSE: ', baseline_mse)

Baseline MSE:  1.056733228060382


In [4]:
# Caveat: the scaler is fit on the training and validation rows stacked together,
# which is questionable practice because validation statistics leak into the
# preprocessing. It is kept because, without it, linear regression yields
# coefficients that differ from the ones reported in the book.
scaler = StandardScaler().fit(np.concatenate((X, X_valid), axis=0))
# Both predictor matrices are replaced by their standardized versions.
X, X_valid = scaler.transform(X), scaler.transform(X_valid)

In [5]:
# TABLE 3.2. Linear model fit to the prostate cancer data. The Z score (t value) is the
#            coefficient divided by its standard error (3.12). Roughly, a Z score larger
#            than two in absolute value is significantly nonzero at the p = 0.05 level.
# The intercept is not implicit in sm.OLS, so a constant column is prepended first.
X_design = sm.add_constant(X)
ls = sm.OLS(y, X_design).fit()
print(ls.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.694
Model:                            OLS   Adj. R-squared:                  0.652
Method:                 Least Squares   F-statistic:                     16.47
Date:                Sun, 27 Jan 2019   Prob (F-statistic):           2.04e-12
Time:                        21:08:54   Log-Likelihood:                -67.505
No. Observations:                  67   AIC:                             153.0
Df Residuals:                      58   BIC:                             172.9
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.4649      0.089     27.598      0.0

In [6]:
# PAGE 51. The mean prediction error on the test data is 0.521.
# The validation predictors need the same constant column the model was fit with.
X_valid_design = sm.add_constant(X_valid)
y_valid_hat = ls.predict(X_valid_design)
# y_valid is a 2-D single column; squeeze it so it lines up with the 1-D predictions.
linear_mse = mean_squared_error(y_valid.squeeze(), y_valid_hat)
print('Linear model MSE: ', linear_mse)

Linear model MSE:  0.5212740055075995


In [7]:
# PAGE 50. We can also test for the exclusion of a number of terms at once,
#          using the F-statistic (3.13). For example, we consider dropping all the
#          non-significant terms in Table 3.2, namely age, lcp, gleason and pgg45. We get
kept = [0, 1, 3, 4]  # columns retained in the reduced model: lcavol, lweight, lbph, svi
X_reduced = sm.add_constant(X[:, kept])

# Full model: residual sum of squares (kept as a 1x1 array) and the number of
# fitted parameters, i.e. all predictors plus the intercept.
y_hat1 = np.expand_dims(ls.predict(sm.add_constant(X)), 1)
RSS1, p1 = (y - y_hat1).T @ (y - y_hat1), X.shape[1] + 1

# Reduced (nested) model: the same quantities using only the retained columns.
ls0 = sm.OLS(y, X_reduced).fit()
y_hat0 = np.expand_dims(ls0.predict(X_reduced), 1)
RSS0, p0 = (y - y_hat0).T @ (y - y_hat0), len(kept) + 1

# NOTE: p0 and p1 already include the intercept, so the denominator degrees of
#       freedom N - p1 equal the book's N - p1 - 1 (where p1 counts predictors only);
#       no extra -1 is needed. The value matches "Df Residuals" in the OLS summary.
# The degrees of freedom are derived once and reused for both the statistic and
# the p-value so the two cannot drift apart if the retained columns change.
dfn, dfd = p1 - p0, X.shape[0] - p1
F = (((RSS0 - RSS1)/dfn)/(RSS1/dfd))[0][0]
# PAGE 51. which has a p-value of 0.17, and hence is not significant
# The upper-tail probability is taken from the survival function rather than 1 - cdf.
print('F-statistics:', F, 'Pr>F', f.sf(F, dfn, dfd))

F-statistics: 1.6697548846375219 Pr>F 0.16933707265225173


In [14]:
# 3.3.1 Best-subset Selection
# List every way of choosing 7 column indices out of 8, one index array per line.
n_predictors, subset_size = 8, 7
for i in combinations(range(n_predictors), subset_size):
    print(np.array(i))

[0 1 2 3 4 5 6]
[0 1 2 3 4 5 7]
[0 1 2 3 4 6 7]
[0 1 2 3 5 6 7]
[0 1 2 4 5 6 7]
[0 1 3 4 5 6 7]
[0 2 3 4 5 6 7]
[1 2 3 4 5 6 7]
