In [41]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import math
from itertools import combinations

In [6]:
# Read boston.csv (path relative to the working directory) into a DataFrame.
boston = pd.read_csv(filepath_or_buffer="boston.csv")
# Preview the first six rows as the cell output.
boston.head(n=6)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7


# 1. Run a univariate Linear Regression with the feature that should have the best predictive power. 

In [38]:
def get_uni_R2(X, y=None):
    """Return the in-sample R^2 of a one-feature linear regression.

    Parameters
    ----------
    X : pandas.Series
        Single predictor column; reshaped to an (n_samples, 1) array
        before fitting.
    y : pandas.Series, optional
        Target values. When omitted, ``boston["MEDV"]`` is looked up at
        call time, so the default follows the currently loaded frame
        instead of whatever ``boston`` held when this cell was executed.

    Returns
    -------
    float
        ``LinearRegression().score`` evaluated on the same data that was
        used for fitting.
    """
    if y is None:
        y = boston["MEDV"]
    # Build the 2-D feature matrix once and reuse it for fit and score.
    features = X.values.reshape(-1, 1)
    target = y.values
    return LinearRegression().fit(features, target).score(features, target)

# Score every candidate predictor on its own. The target column is excluded
# up front rather than being regressed on itself and discarded afterwards.
unilr_r2 = pd.Series(
    {col: get_uni_R2(boston[col]) for col in boston.columns if col != "MEDV"}
)
# Display the feature(s) with the highest univariate R^2; ties are all kept.
# Series.max skips NaN, unlike the builtin max applied to a Series.
unilr_r2[unilr_r2 == unilr_r2.max()]

LSTAT    0.544146
dtype: float64

In [127]:
# Refit the single-feature model on LSTAT and measure its in-sample error.
lstat_column = boston["LSTAT"].values.reshape(-1, 1)
medv_values = boston["MEDV"].values
lstat_model = LinearRegression().fit(lstat_column, medv_values)
lstat_residuals = np.subtract(lstat_model.predict(lstat_column), medv_values)
MSE_1 = np.nanmean(np.square(lstat_residuals))
print(f"MSE: {MSE_1}")

MSE: 38.48296722989414


LSTAT provides the best predictive power in a univariate linear regression ($R^2 \approx 0.544$), with an in-sample MSE of 38.48296722989414.

# 2. Run a multivariate Linear Regression with the two features that should have the best predictive power.

In [50]:
# Candidate predictors: every column of the frame except the target.
feature_lst = boston.columns.tolist()
feature_lst.remove("MEDV")
# All unordered pairs of candidate predictors, materialised as a list of tuples.
two_feature_comb = [pair for pair in combinations(feature_lst, 2)]

In [124]:
def get_multi2_R2(comb, y=None):
    """Return the in-sample R^2 of a linear regression on two columns of ``boston``.

    Parameters
    ----------
    comb : sequence of str
        Column names; ``comb[0]`` and ``comb[1]`` select the two predictors.
    y : pandas.Series, optional
        Target values. When omitted, ``boston["MEDV"]`` is resolved at call
        time, so predictors and target are always taken from the same
        version of the frame (the predictors are read at call time too).

    Returns
    -------
    float
        ``LinearRegression().score`` evaluated on the data used for fitting.
    """
    if y is None:
        y = boston["MEDV"]
    X = boston[[comb[0], comb[1]]]
    return LinearRegression().fit(X, y).score(X, y)

# R^2 for every two-feature model, keyed by the (feature, feature) tuple.
multi2_r2 = pd.Series({pair: get_multi2_R2(pair) for pair in two_feature_comb})
# Boolean mask selecting the pair(s) that reach the highest score.
best_pair_mask = multi2_r2.values == max(multi2_r2)
multi2_r2[best_pair_mask]

RM  LSTAT    0.638562
dtype: float64

In [125]:
# Refit on the LSTAT/RM pair and report the in-sample mean squared error.
best_pair_X = boston[["LSTAT", "RM"]]
best_pair_model = LinearRegression().fit(best_pair_X, boston["MEDV"])
best_pair_residuals = np.subtract(best_pair_model.predict(best_pair_X), boston["MEDV"])
MSE_2 = np.nanmean(np.square(best_pair_residuals))
print(f"MSE: {MSE_2}")

MSE: 30.51246877729947


The combination of LSTAT and RM provides the best predictive power among two-feature (multivariate) linear regressions ($R^2 \approx 0.639$), with an in-sample MSE of 30.51246877729947.

# 3. Using the metric you have used before for feature selection: Determine whether you would want to run a Ridge, Lasso or ElasticNet regression next. Run the regression and comment on your choice.

In [102]:
from sklearn import linear_model

In [134]:
# Ridge regression with default settings on all candidate features.
ridge = linear_model.Ridge()
ridge.fit(boston[feature_lst], boston["MEDV"])
ridge_residuals = np.subtract(ridge.predict(boston[feature_lst]), boston["MEDV"])
# In-sample mean squared error, shown as the cell output.
np.nanmean(np.square(ridge_residuals))

22.044452243088198

In [135]:
# Lasso regression with default settings on all candidate features.
Lasso = linear_model.Lasso()
Lasso.fit(boston[feature_lst], boston["MEDV"])
lasso_residuals = np.subtract(Lasso.predict(boston[feature_lst]), boston["MEDV"])
# In-sample mean squared error, shown as the cell output.
np.nanmean(np.square(lasso_residuals))

26.79609915726647

13