# Regression

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import statsmodels.api as sm
from statsmodels.tools import eval_measures

## Helper Functions

In [2]:
def evaluate(model, X_train, y_train, X_test, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print('=== Train Statistics ===')
    print('R^2 Score:', r2_score(y_train, y_pred_train))
    print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred_train))
    print('Mean Squared Error:', mean_squared_error(y_train, y_pred_train))
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_train, y_pred_train)))

    print()

    print('=== Test Statistics ===')
    print('R^2 Score:', r2_score(y_test, y_pred_test))
    print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_test))
    print('Mean Squared Error:', mean_squared_error(y_test, y_pred_test))
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_test)))

## Load Data

In [3]:
dataset = load_boston(as_frame=True)
dataset

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


## Train-Test Split

In [4]:
X = dataset.drop('medv', axis=1)
y = dataset['medv']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

print('Shape of X_train', X_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_test', y_test.shape)

Shape of X_train (404, 13)
Shape of X_test (102, 13)
Shape of y_train: (404,)
Shape of y_test (102,)


## Linear Regression

In [6]:
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

print(linear_regression)

LinearRegression()


In [7]:
evaluate(linear_regression, X_train, y_train, X_test, y_test)

=== Train Statistics ===
R^2 Score: 0.7415244219726306
Mean Absolute Error: 3.226962133564672
Mean Squared Error: 21.19713992222643
Root Mean Squared Error: 4.604035178213393

=== Test Statistics ===
R^2 Score: 0.7263451459702531
Mean Absolute Error: 3.3677909837965907
Mean Squared Error: 25.419587126821646
Root Mean Squared Error: 5.0417841214020305


## Bonus: Linear Regression Evaluation using Hypothesis Testing

In [9]:
X2_train = sm.add_constant(X_train)
X2_test = sm.add_constant(X_test)

print('Shape of X2_train', X2_train.shape)
print('Shape of X2_test', X2_test.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_test', y_test.shape)


Shape of X2_train (404, 14)
Shape of X2_test (102, 14)
Shape of y_train: (404,)
Shape of y_test (102,)


In [10]:
sm_linear_regression = sm.OLS(y_train, X2_train)
results = sm_linear_regression.fit()
results.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.742
Model:,OLS,Adj. R-squared:,0.733
Method:,Least Squares,F-statistic:,86.07
Date:,"Sat, 10 Apr 2021",Prob (F-statistic):,7.71e-106
Time:,02:06:52,Log-Likelihood:,-1190.1
No. Observations:,404,AIC:,2408.0
Df Residuals:,390,BIC:,2464.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,35.5549,5.499,6.466,0.000,24.743,46.366
crim,-0.1160,0.037,-3.130,0.002,-0.189,-0.043
zn,0.0471,0.016,3.021,0.003,0.016,0.078
indus,0.0083,0.070,0.118,0.906,-0.129,0.145
chas,3.2340,1.011,3.199,0.001,1.247,5.221
nox,-16.6866,4.146,-4.025,0.000,-24.837,-8.536
rm,3.8841,0.455,8.529,0.000,2.989,4.779
age,-0.0109,0.015,-0.739,0.460,-0.040,0.018
dis,-1.5413,0.224,-6.875,0.000,-1.982,-1.101

0,1,2,3
Omnibus:,140.148,Durbin-Watson:,1.751
Prob(Omnibus):,0.0,Jarque-Bera (JB):,573.629
Skew:,1.487,Prob(JB):,2.7400000000000003e-125
Kurtosis:,8.023,Cond. No.,14900.0


## Support Vector Regression

In [11]:
svr = SVR(kernel='linear', C=1)
svr.fit(X_train, y_train)

print(svr)

SVR(C=1, kernel='linear')


In [12]:
evaluate(svr, X_train, y_train, X_test, y_test)

=== Train Statistics ===
R^2 Score: 0.7119965694119231
Mean Absolute Error: 3.093280934499329
Mean Squared Error: 23.618668590849474
Root Mean Squared Error: 4.859904175068627

=== Test Statistics ===
R^2 Score: 0.6746277508908065
Mean Absolute Error: 3.2179733740549548
Mean Squared Error: 30.223575840470946
Root Mean Squared Error: 5.497597278854731


## Decision Tree Regressor

In [13]:
decision_tree = DecisionTreeRegressor(criterion='mse', max_features='auto')
decision_tree.fit(X_train, y_train)

print(decision_tree)

DecisionTreeRegressor(max_features='auto')


In [14]:
evaluate(decision_tree, X_train, y_train, X_test, y_test)

=== Train Statistics ===
R^2 Score: 1.0
Mean Absolute Error: 0.0
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0

=== Test Statistics ===
R^2 Score: 0.7247300962442302
Mean Absolute Error: 3.143137254901961
Mean Squared Error: 25.56960784313726
Root Mean Squared Error: 5.056639975629792


## K-Nearest Neighbor Regressor

In [15]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

print(knn)

KNeighborsRegressor()


In [16]:
evaluate(knn, X_train, y_train, X_test, y_test)

=== Train Statistics ===
R^2 Score: 0.699394320949916
Mean Absolute Error: 3.4297524752475246
Mean Squared Error: 24.652157425742573
Root Mean Squared Error: 4.965093898985454

=== Test Statistics ===
R^2 Score: 0.49138219139185624
Mean Absolute Error: 4.77764705882353
Mean Squared Error: 47.24511372549019
Root Mean Squared Error: 6.873508109072848
