In [1]:
# ridge model
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
Prostate=pd.read_csv('Prostate.txt', sep=' ')
Prostate.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783
2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519
3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519
4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519
5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564


In [2]:
Prostate.shape

(97, 9)

In [3]:
Prostate.dtypes

lcavol     float64
lweight    float64
age          int64
lbph       float64
svi          int64
lcp        float64
gleason      int64
pgg45        int64
lpsa       float64
dtype: object

In [4]:
Prostate.isnull().any(axis=0)

lcavol     False
lweight    False
age        False
lbph       False
svi        False
lcp        False
gleason    False
pgg45      False
lpsa       False
dtype: bool

In [5]:
Prostate.describe()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
count,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0
mean,1.35001,3.628943,63.865979,0.100356,0.216495,-0.179366,6.752577,24.381443,2.478387
std,1.178625,0.428411,7.445117,1.450807,0.413995,1.39825,0.722134,28.204035,1.154329
min,-1.347074,2.374906,41.0,-1.386294,0.0,-1.386294,6.0,0.0,-0.430783
25%,0.512824,3.37588,60.0,-1.386294,0.0,-1.386294,6.0,0.0,1.731656
50%,1.446919,3.623007,65.0,0.300105,0.0,-0.798508,7.0,15.0,2.591516
75%,2.127041,3.876396,68.0,1.558145,0.0,1.178655,7.0,40.0,3.056357
max,3.821004,4.780383,79.0,2.326302,1.0,2.904165,9.0,100.0,5.582932


In [6]:
# define x and y
X=Prostate[['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']]
Y=Prostate['lpsa']

In [7]:
# split to training/test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [8]:
#Grid Search Parameter Tuning
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001])
# create and fit a ridge regression model, testing each alpha
model1 = Ridge()
grid1 = GridSearchCV(estimator=model1, param_grid=dict(alpha=alphas),cv=5,scoring='neg_mean_squared_error')
grid1.fit(X_train, y_train)
print(grid1)
# summarize the results of the grid search
print(grid1.best_score_)
print(grid1.best_estimator_.alpha)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04])},
             scoring='neg_mean_squared_error')
-0.6098528327865174
0.1


In [9]:
# fit the model
ridge1 = Ridge(alpha=0.1)
ridge1.fit(X_train, y_train)


print ("RSS: %.2f for training" % np.sum((ridge1.predict(X_train) - y_train) ** 2))

print ('-'*50)

print ("RSS: %.2f for test" % np.sum((ridge1.predict(X_test) - y_test) ** 2)) #np.mean((ridge1.predict(X_test) - y_test) ** 2)

RSS: 36.61 for training
--------------------------------------------------
RSS: 6.94 for test


In [10]:
import math
rse=math.sqrt(np.sum((ridge1.predict(X_train)-y_train)**2)/(X_train.shape[0]-X_train.shape[1]-1))
print('RSE of training=',rse)
rse=math.sqrt(np.sum((ridge1.predict(X_test)-y_test)**2)/(X_test.shape[0]-X_test.shape[1]-1))
print('RSE of test=',rse)

RSE of training= 0.7337228471631819
RSE of test= 0.7943481318410404


In [11]:
colnames = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
result1 = pd.DataFrame(ridge1.coef_).transpose()
result1.columns = colnames
result1['intercept'] = ridge1.intercept_ 
result1 = result1.transpose()
result1.columns = ['coefficient']
result1

Unnamed: 0,coefficient
lcavol,0.569284
lweight,0.552225
age,-0.022634
lbph,0.071061
svi,0.780059
lcp,-0.129597
gleason,0.069151
pgg45,0.004154
intercept,0.405359


In [12]:
# lasso model
from sklearn.linear_model import Lasso
#Grid Search Parameter Tuning
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001])
# create and fit a lasso regression model, testing each alpha
model3 = Lasso()
grid3 = GridSearchCV(estimator=model3, param_grid=dict(alpha=alphas), cv=5,scoring='neg_mean_squared_error')
grid3.fit(X_train, y_train)
print(grid3)
# summarize the results of the grid search
print(grid3.best_score_)
print(grid3.best_estimator_.alpha)

GridSearchCV(cv=5, estimator=Lasso(),
             param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04])},
             scoring='neg_mean_squared_error')
-0.6094617362357264
0.01


In [13]:
# fit the model
lasso1 = Lasso(alpha=0.01)
lasso1.fit(X_train, y_train)

print ("RSS: %.2f for training" % np.sum((lasso1.predict(X_train) - y_train) ** 2))

print ('-'*50)

print ("RSS: %.2f for testing" % np.sum((lasso1.predict(X_test) - y_test) ** 2))

#RSS: 36.61 for training for ridge
#RSS: 6.94 for test for ridge

RSS: 36.84 for training
--------------------------------------------------
RSS: 6.98 for testing


In [14]:
rse=math.sqrt(np.sum((lasso1.predict(X_train)-y_train)**2)/(X_train.shape[0]-X_train.shape[1]-1))
print('RSE of training=',rse)
rse=math.sqrt(np.sum((lasso1.predict(X_test)-y_test)**2)/(X_test.shape[0]-X_test.shape[1]-1))
print('RSE of test=',rse)
#RSE of training= 0.7337228471631819
# RSE of test= 0.7943481318410404

RSE of training= 0.7360692225145374
RSE of test= 0.7964725702243072


In [15]:
colnames = ['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']
result3 = pd.DataFrame(lasso1.coef_).transpose()
result3.columns = colnames
result3['intercept'] = lasso1.intercept_ 
result3 = result3.transpose()
result3.columns = ['coefficient']
result3

Unnamed: 0,coefficient
lcavol,0.566727
lweight,0.499849
age,-0.019827
lbph,0.069023
svi,0.660666
lcp,-0.088302
gleason,0.0
pgg45,0.004676
intercept,0.906338


In [16]:
# the coefficients of gleason is shrank to 0, this feature is ignored from the model

In [17]:
# for predictive power, we chose ridge
# for feature selection, we chose lasso