In [9]:
import pandas as pd
import numpy as np
import itertools

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LarsCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

# Load data
data = pd.read_csv("prostate.data.txt", sep = "\t")
data = data.drop(columns=['Unnamed: 0', 'train'])

print(data.head())

# Create Data matrix X and response vector y

y = np.array(data['lpsa'])
X = np.array(data.drop(['lpsa'], axis=1))



     lcavol   lweight  age      lbph  svi       lcp  gleason  pgg45      lpsa
0 -0.579818  2.769459   50 -1.386294    0 -1.386294        6      0 -0.430783
1 -0.994252  3.319626   58 -1.386294    0 -1.386294        6      0 -0.162519
2 -0.510826  2.691243   74 -1.386294    0 -1.386294        7     20 -0.162519
3 -1.203973  3.282789   58 -1.386294    0 -1.386294        6      0 -0.162519
4  0.751416  3.432373   62 -1.386294    0 -1.386294        6      0  0.371564


In [10]:

linreg_model = LinearRegression(normalize=True).fit(X, y)
y_pred = linreg_model.predict(X)
R2score = r2_score(y_pred, y)
linreg_coefs = dict(
    zip(['Intercept'] + data.columns.tolist()[:-1], 
        np.round(np.concatenate((linreg_model.intercept_, linreg_model.coef_), 
        axis=None), 4))
)

print('Linear Regression MAE: {}'.format(np.round(R2score, 4)))
print('Linear Regression coefficients:')
linreg_coefs

#make a table

Linear Regression MAE: 0.4926
Linear Regression coefficients:


{'Intercept': 0.1816,
 'lcavol': 0.5643,
 'lweight': 0.622,
 'age': -0.0212,
 'lbph': 0.0967,
 'svi': 0.7617,
 'lcp': -0.1061,
 'gleason': 0.0492,
 'pgg45': 0.0045}

In [33]:
results = pd.DataFrame(columns=['num_features', 'features', 'RSS','R^2'])

# Loop over all possible numbers of features to be included
for k in range(1, X.shape[1] + 1):
    # Loop over all possible subsets of size k
    for subset in itertools.combinations(range(X.shape[1]), k):
        subset = list(subset)
        linreg_model = LinearRegression(normalize=True).fit(X[:, subset], y)
        y_pred = linreg_model.predict(X[:, subset])
        
        RSS = ((y - y_pred) ** 2).sum() 
        R2score = r2_score(y_pred, y)
        results = results.append(pd.DataFrame([{'num_features': k,
                                                'features': subset,
                                                'RSS': RSS, 'R^2': R2score }]))

        
#Find best model for each subset size
models = pd.DataFrame(columns=['num_features', 'features', 'RSS','R^2'])
for k in range(1, X.shape[1] + 1):
    temp = temp = results[results['num_features'] == k].sort_values('R^2', ascending=False)
    models = models.append(temp.head(1))
        
print(models)        
        


         RSS       R^2                  features num_features
0  58.914784  0.146198                       [0]            1
0  51.742176  0.320750                    [0, 1]            2
0  46.568436  0.427549                 [0, 1, 4]            3
0  45.595472  0.446134              [0, 1, 3, 4]            4
0  44.436682  0.467703           [0, 1, 2, 3, 4]            5
0  43.775974  0.479735        [0, 1, 2, 3, 4, 7]            6
0  43.107558  0.491717     [0, 1, 2, 3, 4, 5, 7]            7
0  43.058419  0.492590  [0, 1, 2, 3, 4, 5, 6, 7]            8
