In [111]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from plotnine import *
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import mean_squared_error, r2_score
from tpot import TPOTRegressor
from itertools import combinations

In [112]:
ames = pd.read_csv("AmesHousing.csv")

# Question 1

Consider four possible models for predicting house prices:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [113]:
X = ames[["Bldg Type", "Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

## Model 1: Using only the size and number of rooms.

In [114]:
# create model to predict size and number of rooms

lr = LinearRegression()

ct = ColumnTransformer(
  [
    ("standardize", 
    StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline1 = Pipeline(
  [
  ("preprocessing", ct),
  ("linear_regression", lr)]
)

lr_fitted1 = lr_pipeline1.fit(X_train, y_train)

y_pred_test1 = lr_pipeline1.predict(X_test)

mse_test1 = mean_squared_error(y_test, y_pred_test1)
rmse1 = np.sqrt(mse_test1)
rsquared_test1 = r2_score(y_test, y_pred_test1)

print("Model 1: (size and number of rooms)")
print("Test RMSE:", rmse1.__round__(2))
print("Test R^2: ", rsquared_test1.__round__(2))


Model 1: (size and number of rooms)
Test RMSE: 51020.35
Test R^2:  0.58
Model 1: (size and number of rooms)
Test RMSE: 51020.35
Test R^2:  0.58


## Model 2: (size, number of rooms, building type)

In [115]:
# create model Using size, number of rooms, and building type.

lr = LinearRegression()

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"]),
    ("standardize", 
    StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline2 = Pipeline(
  [
  ("preprocessing", ct),
  ("linear_regression", lr)]
)

lr_fitted2 = lr_pipeline2.fit(X_train, y_train)

y_pred_test2 = lr_pipeline2.predict(X_test)

mse_test2 = mean_squared_error(y_test, y_pred_test2)
rmse2 = np.sqrt(mse_test2)
rsquared_test2 = r2_score(y_test, y_pred_test2)

print("Model 2: (size, number of rooms, building type)")
print("Test RMSE:", rmse2.__round__(2))
print("Test R^2: ", rsquared_test2.__round__(2))


Model 2: (size, number of rooms, building type)
Test RMSE: 49737.74
Test R^2:  0.6
Model 2: (size, number of rooms, building type)
Test RMSE: 49737.74
Test R^2:  0.6


## Model 3: Using size and building type, and their interaction.

In [116]:
# create model Using size and building type, and their interaction.

lr = LinearRegression()

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline3 = Pipeline(
  [
  ("preprocessing", ct),
  ("interaction", PolynomialFeatures(degree = 2, interaction_only = True, include_bias=False)),
  ("linear_regression", lr)]
)

lr_fitted3 = lr_pipeline3.fit(X_train, y_train)

y_pred_test3 = lr_pipeline3.predict(X_test)

mse_test3 = mean_squared_error(y_test, y_pred_test3)
rmse3 = np.sqrt(mse_test3)
rsquared_test3 = r2_score(y_test, y_pred_test3)

print("Model 3: (size, building type, and their interaction)")
print("Test RMSE:", rmse3.__round__(2))
print("Test R^2: ", rsquared_test3.__round__(2))


Model 3: (size, building type, and their interaction)
Test RMSE: 49190.44
Test R^2:  0.61
Model 3: (size, building type, and their interaction)
Test RMSE: 49190.44
Test R^2:  0.61


## Model 4: Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

In [117]:
# create model sing a 5-degree polynomial on size, 
# a 5-degree polynomial on number of rooms, and also building type.

lr = LinearRegression()

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    
  ],
  remainder = "drop"
)

lr_pipeline4 = Pipeline(
  [
  ("preprocessing", ct),
  ("polynomial", PolynomialFeatures(degree=5, include_bias=False)),
  ("linear_regression", lr)]
)

lr_fitted4 = lr_pipeline4.fit(X_train, y_train)

y_pred_test4 = lr_pipeline4.predict(X_test)

mse_test4 = mean_squared_error(y_test, y_pred_test4)
rmse4 = np.sqrt(mse_test4)
rsquared_test4 = r2_score(y_test, y_pred_test4)

print("Model 4: (size, building type, and their interaction)")
print("Test RMSE:", rmse4.__round__(2))
print("Test R^2: ", rsquared_test4.__round__(2))


Model 4: (size, building type, and their interaction)
Test RMSE: 62731.13
Test R^2:  0.37
Model 4: (size, building type, and their interaction)
Test RMSE: 62731.13
Test R^2:  0.37


In [118]:
# print RMSE's 

print("Model 1 RMSE: ", rmse1.__round__(2))
print("Model 2 RMSE: ", rmse2.__round__(2))
print("Model 3 RMSE: ", rmse3.__round__(2))
print("Model 4 RMSE: ", rmse4.__round__(2))

Model 1 RMSE:  51020.35
Model 2 RMSE:  49737.74
Model 3 RMSE:  49190.44
Model 4 RMSE:  62731.13
Model 1 RMSE:  51020.35
Model 2 RMSE:  49737.74
Model 3 RMSE:  49190.44
Model 4 RMSE:  62731.13


Based on the RMSE values, we would choose Model 3 (Using size and building type, and their interaction) as it has the lowest RMSE value and therefore suggesting that the model’s predictions are closer to the actual values, meaning it has better predictive performance

# Question 2
Once again consider four modeling options for house price:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [119]:
score1 = cross_val_score(lr_pipeline1, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score1 = np.sqrt(-score1)
cv_score1 = score1.mean()

score2 = cross_val_score(lr_pipeline2, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score2 = np.sqrt(-score2)
cv_score2 = score2.mean()

score3 = cross_val_score(lr_pipeline3, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score3 = np.sqrt(-score3)
cv_score3 = score3.mean()

score4 = cross_val_score(lr_pipeline4, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score4 = np.sqrt(-score4)
cv_score4 = score4.mean()

print("Model 1 CV score: ", cv_score1)
print("Model 2 CV score: ", cv_score2)
print("Model 3 CV score: ", cv_score3)
print("Model 4 CV score: ", cv_score4)

Model 1 CV score:  56663.06248423473
Model 2 CV score:  54753.3363274388
Model 3 CV score:  54040.16033128493
Model 4 CV score:  207601.3846530061
Model 1 CV score:  56663.06248423473
Model 2 CV score:  54753.3363274388
Model 3 CV score:  54040.16033128493
Model 4 CV score:  207601.3846530061


Looking at the Cross-validated scores, Model 3 is still the preferred model as it has the lowest value. This agrees with the conclusion from earlier.

# Question 3
Consider one hundred modeling options for house price:

* House size, trying degrees 1 through 10
* Number of rooms, trying degrees 1 through 10
* Building Type
Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

In [120]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial_size", PolynomialFeatures(), ["Gr Liv Area"]),
    ("polynomial_room", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial_size__degree': np.arange(1, 11),
           'preprocessing__polynomial_room__degree': np.arange(1, 11)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

neg_mse_scores = gscv_fitted.cv_results_['mean_test_score']
rmse_scores = np.sqrt(-neg_mse_scores)

results_df = pd.DataFrame(data = {
    "polynomial_size_degree": np.tile(np.arange(1, 11), 10),
    "polynomial_room_degree": np.repeat(np.arange(1, 11), 10),
    "mean_test_score": rmse_scores
})

best_model = results_df.loc[results_df['mean_test_score'].idxmin()]
best_model




In [110]:
gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)


results_df = pd.DataFrame(data = {
    "polynomial_size_degree": np.tile(np.arange(1, 11), 10),
    "polynomial_room_degree": np.repeat(np.arange(1, 11), 10),
    "mean_test_score": gscv_fitted.cv_results_['mean_test_score']
})

best_model = results_df.loc[results_df['mean_test_score'].idxmax()]
best_model

polynomial_size_degree    3.000000
polynomial_room_degree    1.000000
mean_test_score           0.557641
Name: 2, dtype: float64

The best model is a model with a degree 3 polynomial on size and 1 degree on room.
It has an RMSE of 52896.32167 and an R^2 of 0.5576

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

The downside is that it is very computationally intensive and takes a lot of time and power. Using other statistical techniques can help optimize and improve efficiency.