In [13]:
import pandas as pd
import numpy as np
import itertools

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LarsCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

# Read the tab-separated prostate data set from the working directory
data = pd.read_csv("prostate.data.txt", sep="\t")
print(data.columns)
# Discard the 'Unnamed: 0' column (presumably a saved row index -- confirm)
data = data.drop(columns='Unnamed: 0')

print(data.head())

# The `train` column flags each row: "T" -> training set, "F" -> test set.
# Filter once per split instead of re-evaluating the mask for every array.
train_rows = data[data.train == "T"]
test_rows = data[data.train == "F"]

# Columns that are not predictors: the response and the split flag
non_feature_cols = ['lpsa', 'train']

# Response vectors
y_train = np.array(train_rows['lpsa'])
y_test = np.array(test_rows['lpsa'])

# Design matrices (all remaining columns, in their original order)
X_train = np.array(train_rows.drop(columns=non_feature_cols))
X_test = np.array(test_rows.drop(columns=non_feature_cols))


Index(['Unnamed: 0', 'lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp',
       'gleason', 'pgg45', 'lpsa', 'train'],
      dtype='object')
     lcavol   lweight  age      lbph  svi       lcp  gleason  pgg45      lpsa  \
0 -0.579818  2.769459   50 -1.386294    0 -1.386294        6      0 -0.430783   
1 -0.994252  3.319626   58 -1.386294    0 -1.386294        6      0 -0.162519   
2 -0.510826  2.691243   74 -1.386294    0 -1.386294        7     20 -0.162519   
3 -1.203973  3.282789   58 -1.386294    0 -1.386294        6      0 -0.162519   
4  0.751416  3.432373   62 -1.386294    0 -1.386294        6      0  0.371564   

  train  
0     T  
1     T  
2     T  
3     T  
4     T  


In [18]:

# Ordinary least squares on all predictors.
# `normalize=True` is intentionally not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, and for an unpenalised least-squares fit
# with an intercept it does not alter the coefficients reported on the
# original feature scale.
linreg_model = LinearRegression().fit(X_train, y_train)
linreg_prediction = linreg_model.predict(X_test)

# Mean absolute error on the held-out test rows
linreg_mae = np.mean(np.abs(y_test - linreg_prediction))

# Predictor names in the column order used to build X_train: every column of
# `data` except the response ('lpsa') and the split flag ('train').
feature_names = [col for col in data.columns if col not in ('lpsa', 'train')]

# Intercept followed by one coefficient per predictor, rounded for display.
# `.tolist()` yields plain Python floats so the dict renders the same way
# regardless of the installed NumPy version.
linreg_coefs = dict(
    zip(['Intercept'] + feature_names,
        np.round(np.concatenate((linreg_model.intercept_, linreg_model.coef_),
                                axis=None), 4).tolist())
)

print('Linear Regression MAE: {}'.format(np.round(linreg_mae, 4)))
print('Linear Regression coefficients:')
linreg_coefs

# TODO: present the coefficients as a table

Linear Regression MAE: 0.5234
Linear Regression coefficients:


{'Intercept': 0.4292,
 'lcavol': 0.5765,
 'lweight': 0.614,
 'age': -0.019,
 'lbph': 0.1448,
 'svi': 0.7372,
 'lcp': -0.2063,
 'gleason': -0.0295,
 'pgg45': 0.0095}

In [29]:
# Best-subset selection: fit OLS on every non-empty subset of the predictors
# and record the residual sum of squares (RSS) of each fit on the training data.
#
# NOTE: training RSS cannot increase when predictors are added, so ranking all
# subsets by training RSS will always put the full model first. To choose a
# subset size, compare candidates within each `num_features` and pick the size
# with cross-validation or held-out error instead.
n_features = X_train.shape[1]

# Collect one record per subset and build the frame once afterwards;
# `DataFrame.append` was removed in pandas 2.0 and growing a frame row by row
# is quadratic.
records = []
# Loop over all possible numbers of features to be included
for k in range(1, n_features + 1):
    # Loop over all possible subsets of size k
    for subset in itertools.combinations(range(n_features), k):
        subset = list(subset)
        # Separate name so the full-data `linreg_model` is not overwritten
        subset_model = LinearRegression().fit(X_train[:, subset], y_train)
        y_pred = subset_model.predict(X_train[:, subset])
        RSS = ((y_train - y_pred) ** 2).sum()
        records.append({'num_features': k, 'features': subset, 'RSS': RSS})

results = pd.DataFrame(records, columns=['num_features', 'features', 'RSS'])

# Inspect best combinations (lowest training RSS first)
results = results.sort_values('RSS').reset_index(drop=True)
print(results.columns)
print(results.head())

# Fit best model: refit on the column indices of the top-ranked subset
best_features = results['features'].iloc[0]
best_subset_model = LinearRegression().fit(X_train[:, best_features], y_train)

# Label each coefficient with the predictor it belongs to. The names are looked
# up through the selected column indices so the labels stay correct even when
# the chosen subset is not the complete feature set.
feature_names = [col for col in data.columns if col not in ('lpsa', 'train')]
best_subset_coefs = dict(
    zip(['Intercept'] + [feature_names[i] for i in best_features],
        np.round(np.concatenate((best_subset_model.intercept_, best_subset_model.coef_),
                                axis=None), 3).tolist())
)

print('Best Subset Regression RSS: {}'.format(np.round(results['RSS'].iloc[0], 3)))
print('Best Subset Regression coefficients:')
best_subset_coefs


Index(['RSS', 'features', 'num_features'], dtype='object')
         RSS                  features num_features
0  29.426384  [0, 1, 2, 3, 4, 5, 6, 7]            8
1  29.437300     [0, 1, 2, 3, 4, 5, 7]            7
2  30.414990     [0, 1, 3, 4, 5, 6, 7]            7
3  30.539778        [0, 1, 3, 4, 5, 7]            6
4  30.958630     [0, 1, 2, 3, 4, 5, 6]            7
Best Subset Regression RSS: 29.426
Best Subset Regression coefficients:


{'Intercept': 0.429,
 'lcavol': 0.577,
 'lweight': 0.614,
 'age': -0.019,
 'lbph': 0.145,
 'svi': 0.737,
 'lcp': -0.206,
 'gleason': -0.03,
 'pgg45': 0.009}