In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

In [2]:
# Linear Regression over Boston Housing Dataset

In [3]:
# load Boston Housing dataset - goal, predict median home value (circa 1970)
from sklearn.datasets import load_boston
boston = load_boston()

In [4]:
# generate the training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=0)

In [5]:
# import the package for Linear Regression
from sklearn.linear_model import LinearRegression

In [6]:
# create a Linear Regression model
lr = LinearRegression()
# fit the Linear Regression model to the training data
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
# print the list of feature names for the boston data set
print(boston.feature_names)
# print the corresponding coefficient or "weight" for each input feature
# Note: the _ after the name of an attribute of a model, like "coef", means that the
# attribute is learned by the model, not entered as a parameter by the user
print("lr.coef_", lr.coef_)
# print the intercept
print("lr.intercept_", lr.intercept_)
print(max(lr.coef_), min(lr.coef_))

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
lr.coef_ [-1.17735289e-01  4.40174969e-02 -5.76814314e-03  2.39341594e+00
 -1.55894211e+01  3.76896770e+00 -7.03517828e-03 -1.43495641e+00
  2.40081086e-01 -1.12972810e-02 -9.85546732e-01  8.44443453e-03
 -4.99116797e-01]
lr.intercept_ 36.933255457118975
3.7689676985862004 -15.589421129396712


#### QUESTION: What do we observe that might be interesting when looking at the learned weights?

CRIM: Per capita crime rate by town
ZN: Proportion of residential land zoned for lots over 25,000 sq. ft
INDUS: Proportion of non-retail business acres per town
CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX: Nitric oxide concentration (parts per 10 million)
RM: Average number of rooms per dwelling
AGE: Proportion of owner-occupied units built prior to 1940
DIS: Weighted distances to five Boston employment centers
RAD: Index of accessibility to radial highways
TAX: Full-value property tax rate per $10,000
PTRATIO: Pupil-teacher ratio by town
B: 1000(Bk — 0.63)², where Bk is the proportion of [people of African American descent] by town
LSTAT: Percentage of lower status of the population

Q1: The positive coefficients of ZN, CHAS, RM, RAD and the negative coefficients of CRIM, INDUS, NOX, AGE, DIS, TAX, PTRATIO, and LSTAT make sense to me. Though I would not completely agree to the weigh of B, considering the weight of LSTAT, the weight of B can be explained. It is interesting to see that NOX affects the housing price the most (coeff. = -15.589)

In [9]:
# score our linear regression model against the training set
print("Training set score: {:.3f}".format(lr.score(X_train, y_train)))

Training set score: 0.770


In [10]:
# score our linear regression model against the test set
print("Test set score: {:.3f}".format(lr.score(X_test, y_test)))

Test set score: 0.635


#### QUESTION: How does this result compared to kNN?
Q2: Compared to the result of the kNN regressor with the best k value, the training score is a bit less, but close to the training score of kNN regressor (0.770 < 0.782). The testing score is greater for the linear regression model (0.635 > 0.510). ---> greater testing accuracy and less overfitting?

In [11]:
# OBSERVATION: Learning will not succeed if your input data does not represent enough
# information to generalize from.

In [12]:
# Approach One for improvement: extend our feature set
# Take the raw data in the Boston Housing Dataset and engineer a new dataset that includes
# not just the originally collected features but also all products of all features
# This lets the model take into account the interactions between features

In [13]:
# Reminder: original dataset has 506 examples, 13 features per example
boston.data.shape

(506, 13)

In [14]:
# Construct expanded feature set
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
expandedFeatures = MinMaxScaler().fit_transform(boston.data)
expandedFeatures = PolynomialFeatures(degree=2, include_bias=False).fit_transform(expandedFeatures)

In [15]:
# new feature set has 13 original features + 91 possible combinations of two features (with replacements)
expandedFeatures.shape

(506, 104)

In [8]:
boston.target.shape

(506,)

In [16]:
# Create a new training and test set
X_train, X_test, y_train, y_test = train_test_split(expandedFeatures, boston.target, random_state=0)

In [17]:
# Create and fit a Linear Regression model to this data
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# score our linear regression model against the training set and test set
print("Training set score: {:.3f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.3f}".format(lr.score(X_test, y_test)))

Training set score: 0.952
Test set score: 0.607


#### QUESTION: Does our model learn better?
Q3: The training score has increased, so the model learned better with the extended features. However, the testing accuracy has decreased. 

In [19]:
# OBSERVATION: When there is a large discrepancy between the training set performance and the
# test set performance, it is a sign of overfitting - you are learning the training data well,
# to the detriment of generalizing to new examples.

In [20]:
# Approach Two for improvement: address overfitting by prefering models with small weights
# Ridge Regression
# A variation on Linear Regression, Ridge Regression learns weights for each feature that
# both predict the training data but also are as close to zero as possible.
# Controlling the weight values = regularization
# Restricting a model in this way can often address overfitting.

In [21]:
# import Ridge Regression
from sklearn.linear_model import Ridge

In [22]:
# Create the Ridge Regression model and fit it to the data
ridge = Ridge();
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [23]:
# score our ridge regression model against the training set and test set
print("Training set score: {:.3f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.3f}".format(ridge.score(X_test, y_test)))

Training set score: 0.886
Test set score: 0.753


In [24]:
# inspect the learned weights for the linear regression model
print("lr.coef_", lr.coef_)
print("largest weight: ", max(lr.coef_))
print("smallest weight: ", min(lr.coef_))

lr.coef_ [-4.12710947e+02 -5.22432068e+01 -1.31898815e+02 -1.20041365e+01
 -1.55107129e+01  2.87163342e+01  5.47040992e+01 -4.95346659e+01
  2.65823927e+01  3.70620316e+01 -1.18281674e+01 -1.80581965e+01
 -1.95246830e+01  1.22025403e+01  2.98078144e+03  1.50084257e+03
  1.14187325e+02 -1.69700520e+01  4.09613691e+01 -2.42636646e+01
  5.76157466e+01  1.27812142e+03 -2.23986944e+03  2.22825472e+02
 -2.18201083e+00  4.29960320e+01 -1.33981515e+01 -1.93893485e+01
 -2.57541277e+00 -8.10130128e+01  9.66019367e+00  4.91423718e+00
 -8.12114800e-01 -7.64694179e+00  3.37837099e+01 -1.14464390e+01
  6.85083979e+01 -1.73753604e+01  4.28128204e+01  1.13988209e+00
 -7.72696840e-01  5.68255921e+01  1.42875996e+01  5.39551110e+01
 -3.21709644e+01  1.92709675e+01 -1.38852338e+01  6.06343266e+01
 -1.23153942e+01 -1.20041365e+01 -1.77243899e+01 -3.39868183e+01
  7.08999816e+00 -9.22538241e+00  1.71980268e+01 -1.27718431e+01
 -1.19727581e+01  5.73871915e+01 -1.75331865e+01  4.10103194e+00
  2.93666477e+01

In [25]:
# inspect the learned weights for the ridge regression model
print("ridge.coef_", ridge.coef_)
print("largest weight: ", max(ridge.coef_))
print("smallest weight: ", min(ridge.coef_))

ridge.coef_ [-1.41368408e+00 -1.55661895e+00 -1.46543409e+00 -1.26616071e-01
 -7.91193605e-02  8.33161023e+00  2.54975060e-01 -4.94145701e+00
  3.89862268e+00 -1.05866058e+00 -1.58433734e+00  1.05103856e+00
 -4.01220799e+00  3.33720475e-01  3.64725471e-03 -8.49295793e-01
  7.44989267e-01 -1.43106836e+00 -1.62981017e+00 -1.40486294e+00
 -4.47314366e-02 -1.74619880e+00 -1.46715888e+00 -1.33237111e+00
 -1.69154625e+00 -5.06179637e-01  2.62197591e+00 -2.09210002e+00
  1.95074661e-01 -2.75469422e-01  5.11308202e+00 -1.67083739e+00
 -9.81863179e-02  6.34477127e-01 -6.10008281e-01  4.01805897e-02
 -1.27661999e+00 -2.91349679e+00  3.39544035e+00  7.91904036e-01
  1.35260232e+00 -4.03661265e+00  2.32361734e+00 -3.36712926e+00
  1.81279204e+00  3.01566897e+00 -1.89452070e+00 -2.50844073e-01
 -2.89543735e+00 -1.26616071e-01 -5.00217192e+00 -2.43951806e+00
  2.85071846e+00 -8.57081177e-01  2.99141960e+00  2.34589755e+00
  1.31207081e+00  1.71845119e+00 -2.59766697e+00 -1.32370675e+00
 -2.81242223e

#### QUESTION: Did our model learn better? Are we overfitting? Are we generalizing?
Q4: The training score of the ridge regression model is smaller than th etraining score of the extended feature lr model. So, the model is not learning better. However, the testing score increased and the discrepancy between the training score and the testing score is smaller. So, the risk of overfitting is less and the risk of generilization is less. 

In [26]:
# Tuning our Model

In [27]:
# Like with kNN, we can provide Ridge Regression with a parameter for its model
# Parameter indicates how strictly the model should prefer small weights over
# fitting the training data
# Larger alpha values indicate stronger preferences for small weights - MORE regularization
# smaller alpha values indicate greater preference for fitting training data - LESS regularization
# default alpha=1.0

In [28]:
# Try Ridge Regression with an alpha value of 10
ridge = Ridge(alpha=10);
ridge.fit(X_train, y_train)
print("Training set score: {:.3f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.3f}".format(ridge.score(X_test, y_test)))

Training set score: 0.788
Test set score: 0.636


In [29]:
# Try Ridge Regression with an alpha of 0.1
ridge = Ridge(alpha=0.1);
ridge.fit(X_train, y_train)
print("Training set score: {:.3f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.3f}".format(ridge.score(X_test, y_test)))

Training set score: 0.928
Test set score: 0.772


In [None]:
# TODO: Experiment and try to find the best value for alpha that avoids overfitting
# and yields the best performance on the test set.

In [46]:
alp = np.linspace(0,21,500)
for i in alp: 
    ridge = Ridge(alpha=i);
    ridge.fit(X_train, y_train)
    print(i, " Training set score: {:.3f}".format(ridge.score(X_train, y_train)), " Test set score: {:.3f}".format(ridge.score(X_test, y_test)), " ", ridge.score(X_train, y_train) - ridge.score(X_test, y_test))
    #print(ridge.score(X_train, y_train) - ridge.score(X_test, y_test))

0.0  Training set score: 0.952  Test set score: 0.604   0.34820975473289817
0.04208416833667335  Training set score: 0.936  Test set score: 0.758   0.1782000548020476
0.0841683366733467  Training set score: 0.930  Test set score: 0.770   0.15957074042748753
0.12625250501002006  Training set score: 0.926  Test set score: 0.774   0.15177611712453398
0.1683366733466934  Training set score: 0.922  Test set score: 0.775   0.14730691286532716
0.21042084168336672  Training set score: 0.919  Test set score: 0.775   0.14434421845051437
0.2525050100200401  Training set score: 0.916  Test set score: 0.774   0.1422099818881899
0.29458917835671344  Training set score: 0.914  Test set score: 0.773   0.14058850150789348
0.3366733466933868  Training set score: 0.912  Test set score: 0.772   0.13931084657083626
0.3787575150300601  Training set score: 0.909  Test set score: 0.771   0.138277481597178
0.42084168336673344  Training set score: 0.907  Test set score: 0.770   0.13742544236835907
0.46292585170

5.218436873747495  Training set score: 0.820  Test set score: 0.679   0.14170952568194983
5.260521042084169  Training set score: 0.820  Test set score: 0.678   0.14181841408250007
5.302605210420841  Training set score: 0.820  Test set score: 0.678   0.14192705211024248
5.344689378757515  Training set score: 0.819  Test set score: 0.677   0.14203543708872668
5.386773547094188  Training set score: 0.819  Test set score: 0.677   0.14214356649192916
5.428857715430862  Training set score: 0.819  Test set score: 0.676   0.14225143793799755
5.470941883767535  Training set score: 0.818  Test set score: 0.676   0.14235904918326747
5.513026052104208  Training set score: 0.818  Test set score: 0.675   0.14246639811654294
5.5551102204408815  Training set score: 0.817  Test set score: 0.675   0.14257348275362425
5.597194388777555  Training set score: 0.817  Test set score: 0.674   0.14268030123207587
5.6392785571142285  Training set score: 0.817  Test set score: 0.674   0.14278685180621786
5.681362

10.226452905811623  Training set score: 0.787  Test set score: 0.634   0.1527585298882529
10.268537074148297  Training set score: 0.787  Test set score: 0.634   0.15283603882641938
10.31062124248497  Training set score: 0.787  Test set score: 0.634   0.15291332290734716
10.352705410821644  Training set score: 0.786  Test set score: 0.633   0.15299038295682277
10.394789579158317  Training set score: 0.786  Test set score: 0.633   0.1530672197984917
10.43687374749499  Training set score: 0.786  Test set score: 0.633   0.1531438342538023
10.478957915831664  Training set score: 0.786  Test set score: 0.633   0.15322022714195427
10.521042084168338  Training set score: 0.786  Test set score: 0.632   0.1532963992798475
10.56312625250501  Training set score: 0.785  Test set score: 0.632   0.15337235148203587
10.605210420841683  Training set score: 0.785  Test set score: 0.632   0.1534480845606796
10.647294589178356  Training set score: 0.785  Test set score: 0.631   0.1535235993255033
10.68937

15.865731462925853  Training set score: 0.763  Test set score: 0.601   0.1614295318734289
15.907815631262526  Training set score: 0.763  Test set score: 0.601   0.16148317875642149
15.949899799599198  Training set score: 0.762  Test set score: 0.601   0.16153668659523224
15.991983967935871  Training set score: 0.762  Test set score: 0.601   0.16159005585332864
16.034068136272545  Training set score: 0.762  Test set score: 0.600   0.1616432869920339
16.07615230460922  Training set score: 0.762  Test set score: 0.600   0.1616963804705378
16.118236472945892  Training set score: 0.762  Test set score: 0.600   0.16174933674590863
16.160320641282567  Training set score: 0.762  Test set score: 0.600   0.16180215627310346
16.20240480961924  Training set score: 0.761  Test set score: 0.600   0.16185483950498303
16.24448897795591  Training set score: 0.761  Test set score: 0.599   0.1619073868923181
16.286573146292586  Training set score: 0.761  Test set score: 0.599   0.1619597988838054
16.3286

20.705410821643287  Training set score: 0.746  Test set score: 0.579   0.16678873273590167
20.74749498997996  Training set score: 0.746  Test set score: 0.579   0.1668289704830328
20.789579158316634  Training set score: 0.746  Test set score: 0.579   0.16686911094128565
20.831663326653306  Training set score: 0.746  Test set score: 0.579   0.16690915439107246
20.87374749498998  Training set score: 0.745  Test set score: 0.579   0.16694910111166827
20.915831663326653  Training set score: 0.745  Test set score: 0.578   0.16698895138121594
20.95791583166333  Training set score: 0.745  Test set score: 0.578   0.167028705476731
21.0  Training set score: 0.745  Test set score: 0.578   0.16706836367411004


#### QUESTION: What was the best alpha value found? How did you find it?
Q5: When α = 1.3226, the testing score is 0.744 and the discrepancy between the training and testing scores is 0.13271. This selection is based on the mininum value of discrepancy between the training and testing scores. As α gets smaller, the performance of the prediction gets better. Though as α gets smaller, the discrepancy between the training and testing scores fluctuates, the discrepancy does not vary much (from 0.13271 to 0.1661). I tested the model on 500 α values between 1 and 20 (1 inclusive). ---> alp = np.linspace(0,20,500)