In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


In [55]:
df = pd.read_csv('boston.csv')
df.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  price    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [57]:
from sklearn.metrics import r2_score, mean_squared_error

def calculate_metrics(actual, predicted, num_features):

    r_squared = r2_score(actual, predicted)
    n = len(actual)
    adjusted_r_squared = 1 - ((1 - r_squared) * (n - 1) / (n - num_features - 1))
    mse = mean_squared_error(actual, predicted)
    
    return r_squared, adjusted_r_squared, mse


In [58]:
y = df['price']
X = df.drop('price', axis=1)

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [60]:
model = LinearRegression()
model.fit(X_train, y_train)

In [61]:
y_hat_lr = model.predict(X_test)
df_predictions = pd.DataFrame({'y_test':y_test, 'y_hat_test_lr':y_hat_lr, 'resid': y_test - y_hat_lr})
df_predictions.head(3)

Unnamed: 0,y_test,y_hat_test_lr,resid
198,34.6,34.016513,0.583487
229,31.5,31.051895,0.448105
502,20.6,22.338845,-1.738845


In [62]:
r_squared, adjusted_r_squared, mse = calculate_metrics(df_predictions['y_test'], df_predictions['y_hat_test_lr'], 13)
print(f'r2={r_squared}')
print(f'r2 adjusted={adjusted_r_squared}')
print(f'mse={mse}')

r2=0.7057919873264542
r2 adjusted=0.6780767397557579
mse=29.79884430147881


In [86]:
pf = PolynomialFeatures(degree=2)
model = LinearRegression()
model.fit(pf.fit_transform(X_train), y_train)


In [87]:
y_hat_pf = model.predict(pf.transform(X_test))
df_predictions_pf = pd.DataFrame({'y_test':y_test, 'y_hat_pf':y_hat_pf, 'resid': y_test - y_hat_pf})
df_predictions_pf.head(3)

Unnamed: 0,y_test,y_hat_pf,resid
198,34.6,34.706588,-0.106588
229,31.5,27.429078,4.070922
502,20.6,14.044683,6.555317


In [90]:
r_squared, adjusted_r_squared, mse = calculate_metrics(df_predictions_pf['y_test'], df_predictions_pf['y_hat_pf'], 105)
print(f'r2={r_squared}')
print(f'r2 adjusted={adjusted_r_squared}')
print(f'mse={mse}')

r2=0.7263552942048883
r2 adjusted=0.10173150923778551
mse=27.71609076113282
