In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston

In [2]:
X, Y = load_boston(return_X_y=True)

### Variables in the Boston Housing dataset
1. CRIM - per capita crime rate by town
2. ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
3. INDUS - proportion of non-retail business acres per town.
4. CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
5. NOX - nitric oxides concentration (parts per 10 million)
6. RM - average number of rooms per dwelling
7. AGE - proportion of owner-occupied units built prior to 1940
8. DIS - weighted distances to five Boston employment centres
9. RAD - index of accessibility to radial highways
10. TAX - full-value property-tax rate per USD 10,000
11. PTRATIO - pupil-teacher ratio by town
12. B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13. LSTAT - % lower status of the population
14. MEDV - Median value of owner-occupied homes in USD 1000's

In [3]:
X = pd.DataFrame(X, columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE',
                             'DIS','RAD','TAX','PTRATIO','BK','LSTAT'])

In [4]:
Y = pd.DataFrame(Y,columns=["MEDV"])

In [5]:
df = pd.concat([X,Y],axis=1)

In [6]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BK,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


# Model 1

In [7]:
from sklearn import linear_model
import sklearn.metrics as metrics

In [8]:
# train/test split with scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, shuffle=True)
print(len(X_train))
print(len(X_test))

404
102


In [9]:
# Train the model using the training sets
model = linear_model.LinearRegression()
model.fit(X_train, Y_train)

LinearRegression()

In [10]:
# Make predictions using the testing set
Y_pred = model.predict(X_test)

In [11]:
# The coefficients
print('Coefficients: \n', model.coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % metrics.mean_squared_error(Y_pred, Y_test))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % metrics.r2_score(Y_pred, Y_test))

Coefficients: 
 [[-1.19443447e-01  4.47799511e-02  5.48526168e-03  2.34080361e+00
  -1.61236043e+01  3.70870901e+00 -3.12108178e-03 -1.38639737e+00
   2.44178327e-01 -1.09896366e-02 -1.04592119e+00  8.11010693e-03
  -4.92792725e-01]]
Mean squared error: 33.45
Coefficient of determination: 0.35


# Model 2

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', linear_model.LinearRegression())])
# fit to an order-2 polynomial data
model = model.fit(X_train,Y_train)

In [13]:
# Make predictions using the testing set
Y_pred = model.predict(X_test)

In [14]:
# The coefficients
print('Coefficients: \n', model.named_steps['linear'].coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % metrics.mean_squared_error(Y_pred, Y_test))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % metrics.r2_score(Y_pred, Y_test))

Coefficients: 
 [[ 1.07346026e+09 -2.97510205e+00  2.50017017e-01 -5.75261979e+00
   1.46944572e+01  2.74617636e+01  7.40199263e+00  1.12613783e+00
  -1.25447428e+01  1.58814891e+00  2.57078562e-02  3.99790890e+00
  -1.36816479e-01 -1.46480161e+00  3.05609658e-03  2.84330181e-01
   6.11066785e-01  1.38035009e+00 -2.89001788e-01  1.00766741e-01
  -5.43540783e-03  2.08495666e-03  4.88227797e-01 -4.39740462e-02
   4.34919849e-01 -1.98328444e-04  2.18121807e-02 -9.33846956e-04
  -1.25699281e-03  7.38675176e-02 -1.67405073e+00 -7.68556797e-03
  -1.31535227e-04 -8.45662161e-04 -4.79350216e-04  6.85393430e-04
  -2.28277097e-02  2.07056574e-03 -3.07902288e-03  4.67327370e-02
   5.45977285e-01 -3.49428109e-02  4.66222468e-01  1.84068925e-03
   6.68200261e-02 -2.69658063e-01  3.09657143e-03 -1.36644525e-01
   9.79344685e-03  1.22866400e-02  1.46944572e+01 -3.31269439e+01
  -5.99197428e+00  4.72672498e-02 -1.30941051e-01  3.27906530e-01
  -1.63366151e-02 -9.53696091e-01  9.89732641e-02 -4.2428412

# Model 3

In [15]:
# correlation analysis
corrMatrix = df.corr()
corrMatrix.style.background_gradient(cmap='Blues')

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,BK,LSTAT,MEDV
CRIM,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305
ZN,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
INDUS,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725
CHAS,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526
NOX,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
RM,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
AGE,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955
DIS,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
RAD,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
TAX,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536


In [16]:
X_copy = X.copy()

In [17]:
del(X['CHAS'])
del(X['DIS'])

In [18]:
X.columns

Index(['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'RAD', 'TAX', 'PTRATIO',
       'BK', 'LSTAT'],
      dtype='object')

In [19]:
# train/test split with scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, shuffle=True)
print(len(X_train))
print(len(X_test))

404
102


In [20]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', linear_model.LinearRegression())])
# fit to an order-2 polynomial data
model = model.fit(X_train,Y_train)

In [21]:
# Make predictions using the testing set
Y_pred = model.predict(X_test)

In [22]:
# The coefficients
print('Coefficients: \n', model.named_steps['linear'].coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % metrics.mean_squared_error(Y_pred, Y_test))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % metrics.r2_score(Y_pred, Y_test))

Coefficients: 
 [[-1.44213411e-07 -4.74713749e+00  3.16203733e-02 -3.70675405e+00
   2.37888369e+02  3.13380224e+01  9.47019286e-01  8.73349739e-01
   7.69997432e-02  3.89712231e+00 -4.97919471e-02  3.55537546e-01
   3.37041463e-04  3.74511327e-02  5.67217061e-01 -6.95105691e-01
   1.15866990e-01 -4.38196726e-03  3.69245850e-01 -2.76858453e-02
   1.84009227e-01  1.91967448e-04  1.41968690e-02 -3.33370148e-04
   2.82433055e-04 -8.54772257e-01  8.62906844e-04 -4.07842750e-04
  -9.95112854e-05  1.90825875e-04 -6.25978715e-03  1.09949340e-03
  -3.76316946e-03  2.32433200e-02  1.05579839e+00  1.28323812e-01
   2.89138239e-03  3.75180396e-04  1.56360031e-03  1.09515089e-02
   3.61590065e-03 -4.07639718e-02 -8.18556010e+01  2.73405431e+00
  -6.14655787e-01 -1.19081881e-01  1.03694452e-02 -7.58324618e+00
   1.74782438e-03  1.02857783e+00 -2.94605717e-01 -8.07663282e-02
  -2.86612486e-02 -1.86932961e-02 -6.53232883e-01  2.68872699e-03
  -1.09515683e-01  5.67437326e-04  2.27873657e-02 -6.5185680

# Model 4

In [23]:
X = X_copy.copy()

In [24]:
X.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'BK', 'LSTAT'],
      dtype='object')

In [25]:
# Recursive Feature Elimination - feature selection
from sklearn.feature_selection import RFE
model = linear_model.LinearRegression()
rfe = RFE(model, n_features_to_select=11)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

Num Features: 11
Selected Features: [ True  True  True  True  True  True False  True  True  True  True False
  True]
Feature Ranking: [1 1 1 1 1 1 3 1 1 1 1 2 1]


In [26]:
del(X['AGE'])
del(X['BK'])

In [27]:
# train/test split with scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, shuffle=True)
print(len(X_train))
print(len(X_test))

404
102


In [28]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', linear_model.LinearRegression())])
# fit to an order-2 polynomial data
model = model.fit(X_train,Y_train)

In [29]:
# Make predictions using the testing set
Y_pred = model.predict(X_test)

In [30]:
# The coefficients
print('Coefficients: \n', model.named_steps['linear'].coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % metrics.mean_squared_error(Y_pred, Y_test))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % metrics.r2_score(Y_pred, Y_test))

Coefficients: 
 [[-1.19882601e+09 -5.34066408e+00  8.07771916e-02 -5.18652554e+00
   3.19599258e+01  5.72552518e+01  1.56791897e+01 -1.45821332e+01
   3.77385665e-01  9.90376309e-02  8.38669540e+00 -6.19530364e-01
   2.04182062e-03  5.62619198e-01  1.93044242e-01  1.89739873e+00
  -1.69239805e+00  2.14869705e-01  2.71824702e-01 -1.63077271e-01
   9.89286998e-03 -1.13594693e-01  2.83153510e-02  3.74197032e-04
  -2.49693517e-03  3.19894549e-02 -6.77663274e-01  2.75793442e-02
  -4.96347206e-03  3.12148661e-03  6.79791838e-04 -1.06302793e-02
   7.79745006e-04  3.76135826e-02  8.29577762e-01  3.76533562e+00
   5.06840084e-01  2.31900485e-01  2.33269987e-02 -5.74013690e-04
  -6.68647529e-02 -2.22667404e-02  3.19599258e+01 -3.26990492e+01
  -5.41710674e+00  5.46565185e-01  1.22922951e+00 -7.00752889e-02
  -2.67432953e-01 -2.87724243e-01 -3.98551914e+01  1.04036036e+00
   1.14913219e+01  4.29562886e-01 -4.51084913e-02 -5.91142432e+00
   1.02689418e+00  4.89611953e-01  1.18375718e+00  2.8915397

# Model 5

In [31]:
X = X_copy.copy()

In [32]:
# Recursive Feature Elimination - feature selection
from sklearn.feature_selection import RFE
model = linear_model.LinearRegression()
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

Num Features: 10
Selected Features: [ True  True  True  True  True  True False  True  True False  True False
  True]
Feature Ranking: [1 1 1 1 1 1 4 1 1 2 1 3 1]


In [33]:
X.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'BK', 'LSTAT'],
      dtype='object')

In [34]:
del(X['AGE'])
del(X['TAX'])
del(X['BK'])

In [35]:
# train/test split with scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, shuffle=True)
print(len(X_train))
print(len(X_test))

404
102


In [36]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', linear_model.LinearRegression())])
# fit to an order-2 polynomial data
model = model.fit(X_train,Y_train)

In [37]:
# Make predictions using the testing set
Y_pred = model.predict(X_test)

In [38]:
# The coefficients
print('Coefficients: \n', model.named_steps['linear'].coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % metrics.mean_squared_error(Y_pred, Y_test))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % metrics.r2_score(Y_pred, Y_test))

Coefficients: 
 [[-1.72794910e+11 -1.10063006e+01  7.14335111e-02 -3.66405403e+00
   3.09077617e+01  5.90692903e+01  1.51283885e+01 -1.58886604e+01
   2.99811738e+00  9.27030282e+00 -6.46162245e-01  9.57475573e-04
   1.00549546e+00  3.08164162e-01  1.75596690e+00 -1.97352511e+00
   2.70986621e-01  2.80367011e-01 -2.87856158e-02  2.19766728e-01
   3.47497564e-02  1.92269693e-03  3.08350544e-03  2.40611440e-02
  -5.36977670e-01 -2.72297526e-03 -4.02883927e-03  5.94038512e-03
  -1.69227762e-03  6.58214584e-05  4.54346925e-02  3.78471289e-01
   3.48178165e+00  1.90565218e-01  3.18558354e-01 -6.44860808e-02
  -4.51425124e-02 -3.89901062e-02  3.09077617e+01 -3.61234531e+01
  -5.93959483e+00  1.64102844e+00  2.49356541e-01 -6.12743876e-01
  -2.88089184e-01 -5.16282669e+01 -7.85457988e-01  9.30847857e+00
  -5.50548905e-01 -4.86013033e+00  6.85981862e-01  5.22408641e-01
   1.28796477e+00 -2.86357529e-01 -1.21013852e+00  1.09368978e-01
   3.04762363e-01 -2.74953946e-01 -2.67782023e-02 -7.5765839

# Model 6 

In [39]:
X = X_copy.copy()

In [40]:
# Recursive Feature Elimination - feature selection
from sklearn.feature_selection import RFE
model = linear_model.LinearRegression()
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

Num Features: 10
Selected Features: [ True  True  True  True  True  True False  True  True False  True False
  True]
Feature Ranking: [1 1 1 1 1 1 4 1 1 2 1 3 1]


In [41]:
del(X['AGE'])
del(X['TAX'])
del(X['BK'])

In [42]:
# train/test split with scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1, shuffle=True)
print(len(X_train))
print(len(X_test))

404
102


In [43]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', linear_model.LinearRegression())])
# fit to an order-2 polynomial data
model = model.fit(X_train,Y_train)

In [44]:
# Make predictions using the testing set
Y_pred = model.predict(X_test)

In [45]:
# The coefficients
print('Coefficients: \n', model.named_steps['linear'].coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % metrics.mean_squared_error(Y_pred, Y_test))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % metrics.r2_score(Y_pred, Y_test))

Coefficients: 
 [[ 8.64460359e-10 -6.31022864e+00 -8.25947130e-02 -4.02060888e+00
   3.28436741e+01  8.04986749e+01  2.00232819e+01 -1.27270225e+01
   1.36397827e+00  1.13811127e+01 -3.25906603e-01  2.97663037e-03
   3.77627521e-01  1.45783317e-01  2.47903144e+00 -2.10983716e+00
   1.11092873e-01  1.08629989e-01 -2.01157960e-02  1.91537872e-01
   2.34020416e-02  5.94402614e-04 -1.68405238e-03 -4.57188869e-02
  -4.58064509e-01  3.16345922e-02 -1.05971204e-02  1.10000138e-02
   2.46663329e-03  3.04384064e-04  1.46557494e-02 -9.09347465e-04
   4.19373598e+00  1.92235309e-01  2.08038436e-01 -5.64009097e-02
  -1.18374138e-03 -1.49471410e-02  3.28436741e+01 -3.55775575e+01
  -5.31890828e+00  1.45935996e+00  6.28100059e-02 -9.06639080e-01
  -2.01139656e-01 -4.94021986e+01 -4.45454773e+00  1.05969379e+01
  -3.79185916e-01 -5.28091030e+00  8.55359757e-01  3.36270300e-01
   4.03439220e-01 -2.19063989e-01 -9.39123917e-01 -1.68277748e-01
   4.05169810e-01 -3.12193412e-01  3.84735071e-03  5.1193724

# Model 7

In [46]:
X = X_copy.copy()

In [47]:
# Recursive Feature Elimination - feature selection
from sklearn.feature_selection import RFE
model = linear_model.LinearRegression()
rfe = RFE(model, n_features_to_select=11)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

Num Features: 11
Selected Features: [ True  True  True  True  True  True False  True  True  True  True False
  True]
Feature Ranking: [1 1 1 1 1 1 3 1 1 1 1 2 1]


In [48]:
del(X['AGE'])
del(X['BK'])

In [49]:
# train/test split with scikit-learn
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2, shuffle=True)
print(len(X_train))
print(len(X_test))

404
102


In [50]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                  ('linear', linear_model.LinearRegression())])
# fit to an order-2 polynomial data
model = model.fit(X_train,Y_train)

In [51]:
# Make predictions using the testing set
Y_pred = model.predict(X_test)

In [52]:
# The coefficients
print('Coefficients: \n', model.named_steps['linear'].coef_)
# The mean squared error
print('Mean squared error: %.2f'
      % metrics.mean_squared_error(Y_pred, Y_test))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % metrics.r2_score(Y_pred, Y_test))

Coefficients: 
 [[ 1.16518912e-07 -4.52457118e+00  1.14521993e-01 -4.66159973e+00
   3.25232527e+01  9.78497796e+01  1.70137876e+01 -1.25124621e+01
   1.54898707e+00 -5.96886615e-03  6.60282188e+00 -1.07744294e+00
   2.57571779e-03  3.20594476e-01  3.09311399e-01  2.87876757e+00
  -1.55568816e+00  5.80445018e-02  2.59726364e-02  5.67854652e-02
  -3.15306901e-03 -1.64674070e-02  1.54948334e-02  3.17669935e-04
  -4.75917894e-03 -6.28043125e-02 -1.14257714e-01  3.29692793e-03
  -5.17942208e-03 -1.92718811e-03  4.82613160e-04 -1.12095095e-02
   1.80128953e-05  1.54725640e-02 -5.12234604e-02  4.49789298e+00
   2.01464666e-01  2.08730179e-01  2.09659321e-02 -8.25886752e-05
   1.28830763e-02 -2.31771390e-02  3.25232527e+01 -3.99622726e+01
  -4.87830425e+00  8.67078383e-01 -2.96795852e-01  1.23877481e-02
  -8.52680838e-01 -2.47453152e-01 -8.96592555e+01 -4.34983486e+00
   9.16208430e+00 -8.59937993e-01  2.78898460e-02 -4.32052734e+00
   1.36179042e+00  3.98630139e-01  3.61790008e-01  1.1999630