
# Importing Packages

In [4]:
# import packages for data cleaning and processing  
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

# import modules for preprocessing
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from mlxtend.evaluate import bias_variance_decomp

# pd.set_option('display.max_rows', 200)
# pd.set_option('display.max_columns', 200)

In [5]:
def get_bias_variance(model, X_train, y_train, X_test, y_test, loss_type):
    avg_expected_loss, avg_bias, avg_variance = bias_variance_decomp(model, X_train.values, y_train.values, X_test.values, y_test.values, loss=loss_type, random_seed=42)
    print('Average expected loss: %.3e' % avg_expected_loss)
    print('Average bias: %.3e' % avg_bias)
    print('Average variance: %.3e' % avg_variance)
    return

def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def train_test_metrics(y_train, y_test, y_train_pred, y_test_pred):
	print('Training R^2 Score: ', round(r2_score(y_train, y_train_pred), 4))
	print('Training RMSE: %d' % rmse(y_train, y_train_pred))
	print('Testing R^2 Score: ', round(r2_score(y_test, y_test_pred), 4))
	print('Testing RMSE: %d' % rmse(y_test, y_test_pred))
	return

In [6]:
df = pd.read_csv('data/kc_house_data.csv')

In [7]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
df.describe()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.370842,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,7.656873,1788.390691,291.509045,1971.005136,84.402258,98077.939805,47.560053,-122.213896,1986.552492,12768.455652
std,2876566000.0,367127.2,0.930062,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,1.175459,828.090978,442.575043,29.373411,401.67924,53.505026,0.138564,0.140828,685.391304,27304.179631
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.5718,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.7776,-121.315,6210.0,871200.0


In [9]:
df.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

# Feature Selection

In [10]:
# Extract sale date for later processing
df['sale_date'] = [x[:8] for x in df.date]
df.sale_date = df.sale_date.apply(lambda x: datetime.strptime(x, '%Y%m%d'))
df.drop(columns='date', inplace=True)
df.drop(['id'], inplace=True, axis=1)

# Replace anomalous bedroom value and check values in column
df.replace({'bedrooms': {33: 3}}, inplace=True)
df.replace({'bathrooms': {0: 0.25}}, inplace=True)

# Create new feature to incorporate age at the time of sale
df['sale_age'] = df.sale_date.dt.year - df[['yr_built', 'yr_renovated']].max(axis=1)
df.replace({'sale_age': {-1: 0}}, inplace=True)

# Create new feature for age from year built
df['age'] = df.sale_date.dt.year - df.yr_built
df.replace({'age': {-1: 0}}, inplace=True)

# Create binary variables for whether there has been a renovation, the property has a bathroom, and has been viewed
df['renovated'] = df.yr_renovated.apply(lambda x: x if x==0 else 1)
df['basement'] = df.sqft_basement.apply(lambda x: x if x==0 else 1)
df['viewed'] = df.view.apply(lambda x: x if x==0 else 1)

# Drop original columms as well as the sale_date columns since it is in datetime format
df.drop(['yr_built', 'yr_renovated', 'sale_date', 'sqft_basement', 'view'], inplace=True, axis=1)

# Drop latitude and longitude as zip code has stronger correlation
df.drop(['lat', 'long'], inplace=True, axis=1)

In [11]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,zipcode,sqft_living15,sqft_lot15,sale_age,age,renovated,basement,viewed
0,221900.0,3,1.0,1180,5650,1.0,0,3,7,1180,98178,1340,5650,59,59,0,0,0
1,538000.0,3,2.25,2570,7242,2.0,0,3,7,2170,98125,1690,7639,23,63,1,1,0
2,180000.0,2,1.0,770,10000,1.0,0,3,6,770,98028,2720,8062,82,82,0,0,0
3,604000.0,4,3.0,1960,5000,1.0,0,5,7,1050,98136,1360,5000,49,49,0,1,0
4,510000.0,3,2.0,1680,8080,1.0,0,3,8,1680,98074,1800,7503,28,28,0,0,0


# Dummy Variables and Polynomial/Interaction Features

In [12]:
# Grab indices of columns for creating dummy variables and create dataframe with dummy variables
dum_feat = df[['bedrooms', 'bathrooms', 'floors', 'condition', 'grade', 'zipcode']]
dum_index = dum_feat.columns

# Create dummy variables then drop one of the dummy variables, as well as original categorical variable used in creating the dummy variables
df_dum = pd.get_dummies(data=dum_feat, columns=dum_index, drop_first=True, prefix=['bdr', 'bth', 'flr', 'cnd', 'grd', 'zip'])
df_dum.head()

Unnamed: 0,bdr_1,bdr_2,bdr_3,bdr_4,bdr_5,bdr_6,bdr_7,bdr_8,bdr_9,bdr_10,...,zip_98146,zip_98148,zip_98155,zip_98166,zip_98168,zip_98177,zip_98178,zip_98188,zip_98198,zip_98199
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Set target variable
y = df['price']

# Baseline Model

In [14]:
poly_feat_1 = df.drop(['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade', 'zipcode'], axis=1)
X = pd.concat([poly_feat_1, df_dum], axis=1)
X.head()

Unnamed: 0,sqft_living,sqft_lot,waterfront,sqft_above,sqft_living15,sqft_lot15,sale_age,age,renovated,basement,...,zip_98146,zip_98148,zip_98155,zip_98166,zip_98168,zip_98177,zip_98178,zip_98188,zip_98198,zip_98199
0,1180,5650,0,1180,1340,5650,59,59,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2570,7242,0,2170,1690,7639,23,63,1,1,...,0,0,0,0,0,0,0,0,0,0
2,770,10000,0,770,2720,8062,82,82,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1960,5000,0,1050,1360,5000,49,49,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1680,8080,0,1680,1800,7503,28,28,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, shuffle=True)

In [16]:
lr = LinearRegression().fit(X_train, y_train)
loss, bias, var = bias_variance_decomp(lr, X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', random_seed=42)
print('Average expected loss: %.3e' % loss)
print('Average bias: %.3e' bias)
print('Average variance: %.3e' % var)
y_pred_train_lr = lr.predict(X_train)
y_pred_test_lr = lr.predict(X_test)
train_test_metrics(y_train, y_test, y_pred_train_lr, y_pred_test_lr)


Average expected loss: 3.218e+10
Average bias: 3.052e+10
Average variance: 1.668e+09
Training R^2 Score:  0.8352
Training RMSE: 146745
Testing R^2 Score:  0.815
Testing RMSE: 167237


# Degree-2 Polynomial Features Model

In [19]:
# Grab columns for polynominal and interaction features from the original dataframe without dummy variables
poly_feat_2 = df.drop(['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade', 'zipcode'], axis=1)
# Use PolynomialFeatures to create binomial and interaction features
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly_data_2 = poly_2.fit_transform(poly_feat_2)
poly_columns_2 = poly_2.get_feature_names(poly_feat_2.columns)
df_poly_2 = pd.DataFrame(poly_data_2, columns=poly_columns_2)
# Concatenating two dataframes together for input into linear regression model
X_poly_2 = pd.concat([df_poly_2, df_dum], axis=1)
X_poly_2.head()

Unnamed: 0,sqft_living,sqft_lot,waterfront,sqft_above,sqft_living15,sqft_lot15,sale_age,age,renovated,basement,...,zip_98146,zip_98148,zip_98155,zip_98166,zip_98168,zip_98177,zip_98178,zip_98188,zip_98198,zip_98199
0,1180.0,5650.0,0.0,1180.0,1340.0,5650.0,59.0,59.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,2570.0,7242.0,0.0,2170.0,1690.0,7639.0,23.0,63.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,770.0,10000.0,0.0,770.0,2720.0,8062.0,82.0,82.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1960.0,5000.0,0.0,1050.0,1360.0,5000.0,49.0,49.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,1680.0,8080.0,0.0,1680.0,1800.0,7503.0,28.0,28.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_poly_2, y, random_state=42, test_size=0.2, shuffle=True)
lr_2 = LinearRegression().fit(X_train_2, y_train_2)
loss_2, bias_2, var_2 = bias_variance_decomp(lr_2, X_train_2.values, y_train_2.values, X_test_2.values, y_test_2.values, loss='mse', random_seed=42)
print('Average expected loss: %.3e' % loss_2)
print('Average bias: %.3e' bias_2)
print('Average variance: %.3e' % var_2)
y_pred_train_lr_2 = lr_2.predict(X_train_2)
y_pred_test_lr_2 = lr_2.predict(X_test_2)
train_test_metrics(y_train_2, y_test_2, y_pred_train_lr_2, y_pred_test_lr_2)

Average expected loss: 2.595e+10
Average bias: 2.460e+10
Average variance: 1.350e+09
Training R^2 Score:  0.8675
Training RMSE: 131554
Testing R^2 Score:  0.8362
Testing RMSE: 157363


# Degree-3 Polynomial Features Model

In [21]:
poly_feat_3 = df.drop(['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade', 'zipcode'], axis=1)
poly_3 = PolynomialFeatures(degree=3, include_bias=False)
poly_data_3 = poly_3.fit_transform(poly_feat_3)
poly_columns_3 = poly_3.get_feature_names(poly_feat_3.columns)
df_poly_3 = pd.DataFrame(poly_data_3, columns=poly_columns_3)
X_poly_3 = pd.concat([df_poly_3, df_dum], axis=1)
X_poly_3.head()

Unnamed: 0,sqft_living,sqft_lot,waterfront,sqft_above,sqft_living15,sqft_lot15,sale_age,age,renovated,basement,...,zip_98146,zip_98148,zip_98155,zip_98166,zip_98168,zip_98177,zip_98178,zip_98188,zip_98198,zip_98199
0,1180.0,5650.0,0.0,1180.0,1340.0,5650.0,59.0,59.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,2570.0,7242.0,0.0,2170.0,1690.0,7639.0,23.0,63.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,770.0,10000.0,0.0,770.0,2720.0,8062.0,82.0,82.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1960.0,5000.0,0.0,1050.0,1360.0,5000.0,49.0,49.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,1680.0,8080.0,0.0,1680.0,1800.0,7503.0,28.0,28.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_poly_3, y, random_state=42, test_size=0.2, shuffle=True)
lr_3 = LinearRegression().fit(X_train_3, y_train_3)
loss_3, bias_3, var_3 = bias_variance_decomp(lr_3, X_train_3.values, y_train_3.values, X_test_3.values, y_test_3.values, loss='mse', random_seed=42)
print('Average expected loss: %.3e' % loss_3)
print('Average bias: %.3e' bias_3)
print('Average variance: %.3e' % var_3)
y_pred_train_lr_3 = lr_3.predict(X_train_3)
y_pred_test_lr_3 = lr_3.predict(X_test_3)
train_test_metrics(y_train_3, y_test_3, y_pred_train_lr_3, y_pred_test_lr_3)

Average expected loss: 1.711e+11
Average bias: 5.695e+10
Average variance: 1.141e+11
Training R^2 Score:  0.1216
Training RMSE: 338762
Testing R^2 Score:  0.5435
Testing RMSE: 262698


# Degree-4 Polynomal Features Model

In [23]:
poly_feat_4 = df.drop(['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade', 'zipcode'], axis=1)
poly_4 = PolynomialFeatures(degree=4, include_bias=False)
poly_data_4 = poly_4.fit_transform(poly_feat_4)
poly_columns_4 = poly_4.get_feature_names(poly_feat_4.columns)
df_poly_4 = pd.DataFrame(poly_data_4, columns=poly_columns_4)
X_poly_4 = pd.concat([df_poly_4, df_dum], axis=1)
X_poly_4.head()

Unnamed: 0,sqft_living,sqft_lot,waterfront,sqft_above,sqft_living15,sqft_lot15,sale_age,age,renovated,basement,...,zip_98146,zip_98148,zip_98155,zip_98166,zip_98168,zip_98177,zip_98178,zip_98188,zip_98198,zip_98199
0,1180.0,5650.0,0.0,1180.0,1340.0,5650.0,59.0,59.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
1,2570.0,7242.0,0.0,2170.0,1690.0,7639.0,23.0,63.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,770.0,10000.0,0.0,770.0,2720.0,8062.0,82.0,82.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1960.0,5000.0,0.0,1050.0,1360.0,5000.0,49.0,49.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,1680.0,8080.0,0.0,1680.0,1800.0,7503.0,28.0,28.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_poly_4, y, random_state=42, test_size=0.2, shuffle=True)
lr_4 = LinearRegression().fit(X_train_4, y_train_4)
loss_4, bias_4, var_4 = bias_variance_decomp(lr_4, X_train_4.values, y_train_4.values, X_test_4.values, y_test_4.values, loss='mse', random_seed=42)
print('Average expected loss: %.3e' % loss_4)
print('Average bias: %.3e' bias_4)
print('Average variance: %.3e' % var_4)
y_pred_train_lr_4 = lr_4.predict(X_train_4)
y_pred_test_lr_4 = lr_4.predict(X_test_4)
train_test_metrics(y_train_4, y_test_4, y_pred_train_lr_4, y_pred_test_lr_4)

Average expected loss: 3.801e+14
Average bias: 4.626e+12
Average variance: 3.755e+14
Training R^2 Score:  -230.2572
Training RMSE: 5496675
Testing R^2 Score:  -8270.3959
Testing RMSE: 35361595


# 

In [26]:
loss, bias, var = bias_variance_decomp(lr, X_train.values, y_train.values, X_test.values, y_test.values, loss='mse', random_seed=42)
loss_2, bias_2, var_2 = bias_variance_decomp(lr_2, X_train_2.values, y_train_2.values, X_test_2.values, y_test_2.values, loss='mse', random_seed=42)
loss_3, bias_3, var_3 = bias_variance_decomp(lr_3, X_train_3.values, y_train_3.values, X_test_3.values, y_test_3.values, loss='mse', random_seed=42)
loss_4, bias_4, var_4 = bias_variance_decomp(lr_4, X_train_4.values, y_train_4.values, X_test_4.values, y_test_4.values, loss='mse', random_seed=42)

In [33]:
data = [['%.3e' % bias, '%.3e' % var, round(r2_score(y_train, y_pred_train_lr), 4), round(r2_score(y_test, y_pred_test_lr), 4), int(rmse(y_train, y_pred_train_lr)), int(rmse(y_test, y_pred_test_lr))],
        ['%.3e' % bias_2, '%.3e' % var_2, round(r2_score(y_train_2, y_pred_train_lr_2), 4), round(r2_score(y_test_2, y_pred_test_lr_2), 4), int(rmse(y_train_2, y_pred_train_lr_2)), int(rmse(y_test_2, y_pred_test_lr_2))],
        ['%.3e' % bias_3, '%.3e' % var_3, round(r2_score(y_train_3, y_pred_train_lr_3), 4), round(r2_score(y_test_3, y_pred_test_lr_3), 4), int(rmse(y_train_3, y_pred_train_lr_3)), int(rmse(y_test_3, y_pred_test_lr_3))],
        ['%.3e' % bias_4, '%.3e' % var_4, round(r2_score(y_train_4, y_pred_train_lr_4), 4), round(r2_score(y_test_4, y_pred_test_lr_4), 4), int(rmse(y_train_4, y_pred_train_lr_4)), int(rmse(y_test_4, y_pred_test_lr_4))]]

index = ["Baseline", "Poly-2", "Poly-3", "Poly-4"]
columns = ["Average Bias", "Average Variance", "Training R^2 Score", "Testing R^2 Score", "Training RMSE", "Testing RMSE"]

poly_feat_df = pd.DataFrame(data=data, index=index, columns=columns)

In [34]:
poly_feat_df

Unnamed: 0,Average Bias,Average Variance,Training R^2 Score,Testing R^2 Score,Training RMSE,Testing RMSE
Baseline,30520000000.0,1668000000.0,0.8352,0.815,146745,167237
Poly-2,24600000000.0,1350000000.0,0.8675,0.8362,131554,157363
Poly-3,56950000000.0,114100000000.0,0.1216,0.5435,338762,262698
Poly-4,4626000000000.0,375500000000000.0,-230.2572,-8270.3959,5496675,35361595


# Baseline Model 2

In [35]:
y = df['price']
dum_feat = df[['bathrooms', 'condition', 'grade', 'zipcode']]
dum_index = dum_feat.columns
df_dum = pd.get_dummies(data = dum_feat, columns = dum_index, drop_first = True, prefix = ['bth', 'cnd', 'grd', 'zip'])
poly_feat = df.drop(['price', 'bedrooms', 'bathrooms', 'floors', 'condition', 'grade', 'zipcode', 'sqft_lot15', 'sqft_above', 'sqft_lot'], axis = 1)
poly = PolynomialFeatures(degree = 2, include_bias = False)
poly_data = poly.fit_transform(poly_feat)
poly_columns = poly.get_feature_names(poly_feat.columns)
df_poly = pd.DataFrame(poly_data, columns = poly_columns)
X = pd.concat([df_poly, df_dum], axis = 1)

In [39]:
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X, y, random_state = 42, test_size = 0.2)
lr_5 = LinearRegression().fit(X_train_5, y_train_5)
y_pred_train_lr_5 = lr_5.predict(X_train_5)
y_pred_test_lr_5 = lr_5.predict(X_test_5)
loss_5, bias_5, var_5 = bias_variance_decomp(lr_5, X_train_5.values, y_train_5.values, X_test_5.values, y_test_5.values, loss='mse', random_seed=42)
print('Average expected loss: %.3e' % loss_5)
print('Average bias: %.3e' % bias_5)
print('Average variance: %.3e' % var_5)
train_test_metrics(y_train_5, y_test_5, y_pred_train_lr_5, y_pred_test_lr_5)


Average expected loss: 2.831e+10
Average bias: 2.723e+10
Average variance: 1.080e+09
Training R^2 Score:  0.8673
Training RMSE: 131678
Testing R^2 Score:  0.8086
Testing RMSE: 170105


# Ridge Regression Model

In [40]:
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train_5)
X_test_scaled = ss.transform(X_test_5)
X_train_sc = pd.DataFrame(X_train_scaled, columns=X_train_5.columns)
X_test_sc = pd.DataFrame(X_test_scaled, columns=X_test_5.columns)

In [41]:
ridge = Ridge(alpha=1).fit(X_train_sc, y_train_5)
y_pred_train_ridge = ridge.predict(X_train_sc)
y_pred_test_ridge = ridge.predict(X_test_sc)
loss_6, bias_6, var_6 = bias_variance_decomp(ridge, X_train_5.values, y_train_5.values, X_test_5.values, y_test_5.values, loss='mse', random_seed=42)
print('Average expected loss: %.3e' % loss_5)
print('Average bias: %.3e' % bias_5)
print('Average variance: %.3e' % var_5)
get_bias_variance(ridge, X_train_sc, y_train_5, X_test_sc, y_test_5, 'mse')
train_test_metrics(y_train_5, y_test_5, y_pred_train_ridge, y_pred_test_ridge)

Average expected loss: 2.832e+10
Average bias: 2.725e+10
Average variance: 1.073e+09
Training R^2 Score:  0.8673
Training RMSE: 131679
Testing R^2 Score:  0.8084
Testing RMSE: 170176


# Lasso Regression Model

In [None]:
lasso = Lasso(alpha=1).fit(X_train_sc, y_train_2)
y_pred_lasso_tr = lasso.predict(X_train_sc)
y_pred_lasso_tt = lasso.predict(X_test_sc)
get_bias_variance(lasso, X_train_sc, y_train_2, X_test_sc, y_test_2, 'mse')
train_test_metrics(y_train_2, y_test_2, y_pred_lasso_tr, y_pred_lasso_tt)

In [63]:
train_rmse_lasso = []
test_rmse_lasso = []
alphas_lasso = []

for alpha in np.linspace(0, 1000, num=20):
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train_sc, y_train_2)
    train_pred = lasso.predict(X_train_sc)
    train_rmse_lasso.append(rmse(y_train_2, train_pred))
    test_pred = lasso.predict(X_test_sc)
    test_rmse_lasso.append(rmse(y_test_2, test_pred))
    alphas_lasso.append(alpha)

fig, ax = plt.subplots()
ax.plot(alphas_lasso, train_rmse_lasso, label="Train")
ax.plot(alphas_lasso, test_rmse_lasso, label="Test")
ax.set_xlabel("Alpha")
ax.set_ylabel("RMSE")
optimal_alpha = alphas_lasso[np.argmin(test_rmse_lasso)]
ax.axvline(optimal_alpha, color="black", linestyle="--")
print(f'Optimal Alpha Value: {int(optimal_alpha)}')

NameError: name 'X_tr_sc' is not defined

In [64]:
best_lasso = Lasso(alpha=600).fit(X_train_sc, y_train_2)
y_pred_best_lasso_tr = best_lasso.predict(X_train_sc)
y_pred_best_lasso_tt = best_lasso.predict(X_test_sc)
get_bias_variance(best_lasso, X_train_sc, y_train_2, X_test_sc, y_test_2, 'mse')
train_test_metrics(y_train_2, y_test_2, y_pred_best_lasso_tr, y_pred_best_lasso_tt)

NameError: name 'X_tr_sc' is not defined

# Decision Tree Regressor

In [13]:
dtr = DecisionTreeRegressor(random_state=42).fit(X_train_5, y_train_5)
y_pred_dt_train = dtr.predict(X_train_5) 
y_pred_dt_test = dtr.predict(X_test_5)
get_bias_variance(dtr, X_train_5, y_train_5, X_test_5, y_test_5, 'mse')
train_test_metrics(y_train_5, y_test_5, y_pred_dt_train, y_pred_dt_test)


Average expected loss: 6.088e+10
Average bias: 3.170e+10
Average variance: 2.918e+10
Training R^2 Score:  0.9998
Training RMSE: 4839
Testing R^2 Score:  0.585
Testing RMSE: 250484


# Random Forest Regressor

In [14]:
rf = RandomForestRegressor(n_estimators=100, 
                           max_features="auto", 
                           max_depth=100, 
                           min_samples_leaf=4, 
                           min_samples_split=10, 
                           random_state=1).fit(X_train_5, y_train_5)
y_pred_rf_train = rf.predict(X_train_5)
y_pred_rf_test = rf.predict(X_test_5)
get_bias_variance(rf, X_train_5, y_train_5, X_test_5, y_test_5, 'mse')
train_test_metrics(y_train_5, y_test_5, y_pred_rf_train, y_pred_rf_test)

KeyboardInterrupt: 

# DataFrame of Metrics

In [None]:
data = [[int(rmse(y_tr, y_tr_pred)), int(rmse(y_tt, y_tt_pred)), bias(y_tr, y_tr_pred), variance(y_tr_pred), bias(y_tt, y_tt_pred), variance(y_tt_pred)],
        [int(rmse(y_tr_2, y_tr_pred_2)), int(rmse(y_tt_2, y_tt_pred_2)), bias(y_tr_2, y_tr_pred_2), variance(y_tr_pred_2), bias(y_tt_2, y_tt_pred_2), variance(y_tt_pred_2)],
        [int(rmse(y_tr_3, y_tr_pred_3)), int(rmse(y_tt_3, y_tt_pred_3)), bias(y_tr_3, y_tr_pred_3), variance(y_tr_pred_3), bias(y_tt_3, y_tt_pred_3), variance(y_tt_pred_3)],
        [int(rmse(y_tr_4, y_tr_pred_4)), int(rmse(y_tt_4, y_tt_pred_4)), bias(y_tr_4, y_tr_pred_4), variance(y_tr_pred_4), bias(y_tt_4, y_tt_pred_4), variance(y_tt_pred_4)],
        [int(rmse(y_tr_5, y_tr_pred_5)), int(rmse(y_tt_5, y_tt_pred_5)), bias(y_tr_5, y_tr_pred_5), variance(y_tr_pred_5), bias(y_tt_5, y_tt_pred_5), variance(y_tt_pred_5)]]

index = ["Baseline 2", "Ridge", "Lasso", "Decision Tree", "Random Forest"]

columns = ["Average Bias", "Average Variance", "", "Train Variance", "Test Bias", "Test Variance"]

poly_feat_df = pd.DataFrame(data=data, index=index, columns=columns)