In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [4]:
diabetes = datasets.load_diabetes()

In [5]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [7]:
# Set Predictors
X = pd.DataFrame(diabetes.data, columns  = diabetes.feature_names)
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [9]:
# Set Target variable
y = diabetes.target
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [11]:
# create training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [23]:
# forward feature selection to pick a good mode, add Rsquare, adjusted Rsquare 
show_steps = True
included = []
best = {'feature': '', 'r2': 0, 'a_r2' : 0}
model = LinearRegression()
n = X_test.shape[0]

r2_list = []
adjusted_r2_list = []

while True:
    changed = False
    if show_steps:
        print('')
    excluded = list(set(X.columns) - set(included))
    if show_steps:
        print('(step) Excluded = %s' % ', '.join(excluded))
    for new_column in excluded:
        if show_steps:
            print('(Step) Trying %s...' % new_column)
            print('(Step) - Features = %s' % ', '.join(included + [new_column]))
        fit = model.fit(X_train[included + [new_column]], y_train)
        r2 = fit.score(X_train[included + [new_column]], y_train)
        k = len(included) + 1
        adjusted_r2 = 1 - ((1-r2) * (n-1) /(n-k-1))
        if show_steps:
            print('(Step) - Adjusted R^2: This = %.3f; Best = %.3f' %
                  (adjusted_r2, best['a_r2']))
        if adjusted_r2 > best['a_r2']:
            best = {'feature': new_column, 'r2': r2, 'a_r2': adjusted_r2}
            changed = True
            if show_steps:
                print('(Step) - New Best ! : Feature = %s; R^2 = %.3f; Adjusted R^2 = %e.3f' %
                      (best['feature'], best['r2'], best['a_r2']))
    r2_list.append(best['r2'])
    adjusted_r2_list.append(best['a_r2'])
    if changed:
        included.append(best['feature'])
        excluded = list(set(excluded) - set(best['feature']))
        print('Add feature %-4s with R^2 = %.3f and adjusted R^2 = %.3f' %
              (best['feature'], best['r2'], best['a_r2']))
    else:
        print('*'*50)
        break
print('')
print('Resulting features:')
print(', '.join(included))


(step) Excluded = s6, s1, s3, age, bp, bmi, s4, s2, s5, sex
(Step) Trying s6...
(Step) - Features = s6
(Step) - Adjusted R^2: This = 0.143; Best = 0.000
(Step) - New Best ! : Feature = s6; R^2 = 0.152; Adjusted R^2 = 1.426406e-01.3f
(Step) Trying s1...
(Step) - Features = s1
(Step) - Adjusted R^2: This = 0.029; Best = 0.143
(Step) Trying s3...
(Step) - Features = s3
(Step) - Adjusted R^2: This = 0.138; Best = 0.143
(Step) Trying age...
(Step) - Features = age
(Step) - Adjusted R^2: This = 0.028; Best = 0.143
(Step) Trying bp...
(Step) - Features = bp
(Step) - Adjusted R^2: This = 0.189; Best = 0.143
(Step) - New Best ! : Feature = bp; R^2 = 0.198; Adjusted R^2 = 1.886035e-01.3f
(Step) Trying bmi...
(Step) - Features = bmi
(Step) - Adjusted R^2: This = 0.358; Best = 0.189
(Step) - New Best ! : Feature = bmi; R^2 = 0.366; Adjusted R^2 = 3.584336e-01.3f
(Step) Trying s4...
(Step) - Features = s4
(Step) - Adjusted R^2: This = 0.171; Best = 0.358
(Step) Trying s2...
(Step) - Features = s2


In [None]:
_range = range(1, len(r2_list) + 1)

plt.figure(figsize = (10, 5))
plt.plot(_range, r2_list, label = '$R^2$')
plt.plot(_range, adjusted_r2_list, lable = '$Adjusted \: R^2$')

plt.xlabel('Number of Features')
plt.legend()

plt.show()