In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, FunctionTransformer

In [3]:
# Load the Ames housing data and keep two numeric predictors of the sale price.
ames = pd.read_csv("https://www.dropbox.com/scl/fi/g0n5le5p6fr136ggetfsf/AmesHousing.csv?rlkey=jlr9xtz1o6u5rghfo29a5c02f&dl=1")
X = ames[["Gr Liv Area", "TotRms AbvGrd"]]
y = ames["SalePrice"]

# Seed the shuffle so the split is the same on every Restart & Run All;
# without a seed each run produces different coefficients and scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Standardize by hand with the TRAINING mean and standard deviation.
X_train_s = (X_train - X_train.mean())/X_train.std()

lr = LinearRegression()
lr_fitted = lr.fit(X_train_s, y_train)
lr_fitted.coef_

array([ 75022.70126879, -22304.36352472])

In [4]:
# NOTE: X_test is passed in unscaled here, while lr_fitted was fit on the
# standardized X_train_s; the score below shows the effect of that mismatch.
y_preds = lr_fitted.predict(X_test)

r2_score(y_test, y_preds)

-2186577.713755723

### This is problematic: the model was fit on standardized training data, but we did not standardize the test data before predicting.

In [5]:
# A single hypothetical house to predict for: 889 sq ft of living area, 6 rooms.
new_house_values = {
    "Gr Liv Area": [889],
    "TotRms AbvGrd": [6],
}
new_house = pd.DataFrame(new_house_values)
new_house

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,889,6


In [6]:
# Standardizing one row by its OWN statistics: the sample standard deviation
# of a single observation is NaN, so every standardized entry is NaN too.
own_mean = new_house.mean()
own_std = new_house.std()
new_house_s = new_house.sub(own_mean).div(own_std)
new_house_s

Unnamed: 0,Gr Liv Area,TotRms AbvGrd
0,,


#### We have to put our test data through the **exact** same calculations we put the training data through. This means, if we normalize the training data, we will normalize the testing data by subtracting the TRAINING mean and dividing by the TRAINING standard deviation.

In [7]:
# Scale the test rows with the statistics learned from the training rows.
train_mean, train_std = X_train.mean(), X_train.std()
X_test_s = (X_test - train_mean) / train_std

y_preds = lr_fitted.predict(X_test_s)
r2_score(y_true=y_test, y_pred=y_preds)

0.4287509270693476

In [8]:
# The new observation goes through the same training-based standardization.
new_house_s = new_house.sub(X_train.mean()).div(X_train.std())
lr_fitted.predict(new_house_s)

array([94057.18476423])

#### Pipelines set up a procedure for everything that happens to the data (excluding cleaning) before a model is trained.

In [9]:
# Pipeline() requires (name, estimator) tuples; passing bare estimators fails.
# make_pipeline() takes the estimators directly and generates the step names
# (lower-cased class names) for us.
lr_pipeline = make_pipeline(StandardScaler(), LinearRegression())

lr_pipeline

#### We can name our steps in our pipeline:

In [10]:
# Each step is a (name, estimator) pair; the names are how we address steps later.
named_steps = [
    ("standardize", StandardScaler()),
    ("linear_regression", LinearRegression()),
]
lr_pipeline = Pipeline(steps=named_steps)

lr_pipeline

In [11]:
# Fitting the pipeline learns the scaling from X_train and then fits the regression;
# predict() re-applies that same scaling to X_test before predicting.
lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train)
y_preds = lr_pipeline_fitted.predict(X_test)

r2_score(y_true=y_test, y_pred=y_preds)

0.42875092706934737

In [12]:
# Raw (unscaled) input is fine here: the fitted pipeline runs its own
# "standardize" step on new_house before the regression predicts.
lr_pipeline_fitted.predict(new_house)

array([94057.18476423])

#### Column Transformers:
- Like a pipeline, but is applied to only specific columns in the dataframe
- remainder = "drop" says to get rid of all the extra columns which are not specified in the column transformer.

In [13]:
from sklearn.compose import ColumnTransformer

# Dummify the categorical column, standardize the two numeric ones,
# and drop every column that is not listed.
categorical_cols = ["Bldg Type"]
numeric_cols = ["Gr Liv Area", "TotRms AbvGrd"]

ct = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), categorical_cols),
        ("standardize", StandardScaler(), numeric_cols),
    ],
    remainder="drop",
)

lr_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

lr_pipeline

In [14]:
# Keep every column except the target; the column transformer inside the
# pipeline selects the predictors it needs.
X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]

# Seed the shuffle so the split (and the transformed rows shown below) is
# the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

lr_fitted = lr_pipeline.fit(X_train, y_train)
lr_fitted

#### We can fit a column transformer directly on the dataset to see what the transformed dataset will look like.

In [15]:
# fit() returns the transformer itself, so ct_fitted and ct are the same fitted object.
ct_fitted = ct.fit(X_train)

ct_fitted.transform(X_train)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.66741485,  0.35986586],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.6290322 , -0.27314078],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.42736266, -0.27314078],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         4.16494089,  0.35986586],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.87676704,  0.9928725 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.03743762,  0.35986586]])

In [16]:
# transform() only: the held-out rows are encoded and scaled with what was learned from X_train.
ct.transform(X_test)

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.66549419,  0.35986586],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -1.36656652, -0.90614742],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.46001392,  0.35986586],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.36013948, -0.27314078],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.93822823,  0.35986586],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.42928332, -0.27314078]])

### Challenges:
- We used to be able to call `fitted_model.coef_` to get out the coefficients of a model.
- In a pipeline you have to go through the named step to get the coefficients, e.g. `fitted_pipeline.named_steps["linear_regression"].coef_` (use whatever name you gave the step).
- Most of the outputs are numpy arrays, which can make the results hard to read. We can use `.set_output(transform="pandas")` to get the output as a pandas dataframe instead.


In [17]:
pipeline_steps = [
    ("preprocessing", ct),
    ("linear_regression", LinearRegression()),
]
lr_pipeline = Pipeline(pipeline_steps)
# set_output() on the pipeline is forwarded to its steps, so `ct` itself
# returns DataFrames from here on.
lr_pipeline.set_output(transform="pandas")

ct.fit_transform(X_train)

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,standardize__Gr Liv Area,standardize__TotRms AbvGrd
1969,1.0,0.0,0.0,0.0,0.0,0.667415,0.359866
1237,1.0,0.0,0.0,0.0,0.0,-0.629032,-0.273141
1948,1.0,0.0,0.0,0.0,0.0,-0.427363,-0.273141
2810,1.0,0.0,0.0,0.0,0.0,0.093137,0.359866
696,1.0,0.0,0.0,0.0,0.0,-0.379346,-0.273141
...,...,...,...,...,...,...,...
2357,1.0,0.0,0.0,0.0,0.0,-0.373584,-0.906147
1350,1.0,0.0,0.0,0.0,0.0,-0.419680,-0.273141
2737,1.0,0.0,0.0,0.0,0.0,4.164941,0.359866
55,1.0,0.0,0.0,0.0,0.0,0.876767,0.992873


#### Notice that our column names got changed. There is a step label attached to each new column name as well.

##### Structure for transformed dummy column variables:
`[step name]__[variable name]_[category]`



#### Interaction Terms:

In [18]:
# interaction_only=True yields the bias column, each input column, and their
# pairwise product (no squared terms).
interaction_cols = ["Gr Liv Area", "TotRms AbvGrd"]

ct_inter = ColumnTransformer(
    transformers=[
        ("interaction", PolynomialFeatures(interaction_only=True), interaction_cols),
    ],
    remainder="drop",
)
ct_inter.set_output(transform="pandas")

ct_inter.fit_transform(X_train)

Unnamed: 0,interaction__1,interaction__Gr Liv Area,interaction__TotRms AbvGrd,interaction__Gr Liv Area TotRms AbvGrd
1969,1.0,1851.0,7.0,12957.0
1237,1.0,1176.0,6.0,7056.0
1948,1.0,1281.0,6.0,7686.0
2810,1.0,1552.0,7.0,10864.0
696,1.0,1306.0,6.0,7836.0
...,...,...,...,...
2357,1.0,1309.0,5.0,6545.0
1350,1.0,1285.0,6.0,7710.0
2737,1.0,3672.0,7.0,25704.0
55,1.0,1960.0,8.0,15680.0


#### To do an interaction term with a dummy variable, we must do two column transformers, because we need to get the output of the first dummify column transformer before we can feed it to the second interaction term transformer.

In [19]:
# Stage 1: dummify building type and pass every other column through untouched.
ct_dummies = ColumnTransformer(
    transformers=[("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"])],
    remainder="passthrough",
)
ct_dummies.set_output(transform="pandas")

# Stage 2: interact a passed-through numeric column with one of the dummy columns.
# The column names carry the stage-1 prefixes ("remainder__", "dummify__").
stage_two_cols = ["remainder__TotRms AbvGrd", "dummify__Bldg Type_1Fam"]
ct_inter = ColumnTransformer(
    transformers=[
        ("interaction", PolynomialFeatures(interaction_only=True), stage_two_cols),
    ],
    remainder="drop",
)
ct_inter.set_output(transform="pandas")

X_train_dummified = ct_dummies.fit_transform(X_train)
X_train_dummified

Unnamed: 0,dummify__Bldg Type_1Fam,dummify__Bldg Type_2fmCon,dummify__Bldg Type_Duplex,dummify__Bldg Type_Twnhs,dummify__Bldg Type_TwnhsE,remainder__Order,remainder__PID,remainder__MS SubClass,remainder__MS Zoning,remainder__Lot Frontage,...,remainder__Screen Porch,remainder__Pool Area,remainder__Pool QC,remainder__Fence,remainder__Misc Feature,remainder__Misc Val,remainder__Mo Sold,remainder__Yr Sold,remainder__Sale Type,remainder__Sale Condition
1969,1.0,0.0,0.0,0.0,0.0,1970,535476360,60,RL,80.0,...,0,0,,MnWw,,0,7,2007,WD,Normal
1237,1.0,0.0,0.0,0.0,0.0,1238,535152200,20,RL,74.0,...,95,0,,,,0,5,2008,WD,Normal
1948,1.0,0.0,0.0,0.0,0.0,1949,535378080,50,RL,60.0,...,0,0,,,,0,8,2007,WD,Abnorml
2810,1.0,0.0,0.0,0.0,0.0,2811,907410110,20,RL,84.0,...,0,0,,,,0,7,2006,New,Partial
696,1.0,0.0,0.0,0.0,0.0,697,902105050,50,RM,90.0,...,0,0,,MnPrv,,0,7,2009,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2357,1.0,0.0,0.0,0.0,0.0,2358,527377030,20,RL,78.0,...,0,648,Fa,GdPrv,,0,1,2006,WD,Normal
1350,1.0,0.0,0.0,0.0,0.0,1351,903236010,50,RM,50.0,...,0,0,,MnPrv,,0,1,2008,WD,Normal
2737,1.0,0.0,0.0,0.0,0.0,2738,905427030,75,RL,60.0,...,0,0,,,,0,12,2006,WD,Normal
55,1.0,0.0,0.0,0.0,0.0,56,528240070,60,RL,,...,0,0,,,,0,5,2010,WD,Normal


In [20]:
# Feed the dummified frame to the second transformer to build the interaction column.
ct_inter.fit_transform(X_train_dummified)

Unnamed: 0,interaction__1,interaction__remainder__TotRms AbvGrd,interaction__dummify__Bldg Type_1Fam,interaction__remainder__TotRms AbvGrd dummify__Bldg Type_1Fam
1969,1.0,7.0,1.0,7.0
1237,1.0,6.0,1.0,6.0
1948,1.0,6.0,1.0,6.0
2810,1.0,7.0,1.0,7.0
696,1.0,6.0,1.0,6.0
...,...,...,...,...
2357,1.0,5.0,1.0,5.0
1350,1.0,6.0,1.0,6.0
2737,1.0,7.0,1.0,7.0
55,1.0,8.0,1.0,8.0


## Practice Activity: Pipelines
Consider four possible models for predicting house prices:

- Using only the size and number of rooms.
- Using size, number of rooms, and building type.
- Using size and building type, and their interaction.
- Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all four models.



In [21]:
# One seeded split, reused by every practice model so they are scored on the same test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

In [22]:
# Empty results table; one row per model is appended as each pipeline is scored.
rmse_df = pd.DataFrame({column: [] for column in ("RMSE", "Predictor Vars")})

In [23]:
# Model 1: using only size and number of rooms.

lr = LinearRegression()

# "passthrough" forwards the listed columns unchanged (no identity lambda needed);
# all other columns are dropped, which is ColumnTransformer's default remainder.
ct = ColumnTransformer([
    ("keep", "passthrough", ["Gr Liv Area", "TotRms AbvGrd"])
])

pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", lr)]
)

pipeline_1_fitted = pipeline_1.fit(X_train, y_train)
preds_1 = pipeline_1_fitted.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse_1 = np.sqrt(mean_squared_error(y_test, preds_1))

# Append this model's score as the next row of the results table.
new_row = [rmse_1, "Gr Liv Area, TotRms AbvGrd"]
rmse_df.loc[len(rmse_df.index)] = new_row
rmse_df

Unnamed: 0,RMSE,Predictor Vars
0,53916.474342,"Gr Liv Area, TotRms AbvGrd"


In [24]:
# Model 2: using size, number of rooms, and building type.

lr = LinearRegression()
enc = OneHotEncoder()

# Dummify building type; "passthrough" forwards the numeric columns unchanged.
ct = ColumnTransformer([
    ("one_hot", enc, ["Bldg Type"]),
    ("keep", "passthrough", ["Gr Liv Area", "TotRms AbvGrd"])
])

pipeline_2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", lr)]
)

pipeline_2_fitted = pipeline_2.fit(X_train, y_train)
preds_2 = pipeline_2_fitted.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse_2 = np.sqrt(mean_squared_error(y_test, preds_2))

# Append this model's score as the next row of the results table.
new_row = [rmse_2, "Gr Liv Area, TotRms AbvGrd, Bldg Type"]
rmse_df.loc[len(rmse_df.index)] = new_row
rmse_df

Unnamed: 0,RMSE,Predictor Vars
0,53916.474342,"Gr Liv Area, TotRms AbvGrd"
1,51902.382754,"Gr Liv Area, TotRms AbvGrd, Bldg Type"


In [25]:
# Model 3: using size and building type, and their interaction.

lr = LinearRegression()
poly = PolynomialFeatures(interaction_only=True)
enc = OneHotEncoder(sparse_output=False)

# Stage 1: dummify building type and forward living area unchanged.
ct_1 = ColumnTransformer([
    ("one_hot", enc, ["Bldg Type"]),
    ("keep", "passthrough", ["Gr Liv Area"])
])

# Stage 2: one interaction block per building-type dummy, each pairing that
# dummy with living area. Column names carry the stage-1 prefixes.
bldg_types = ["1Fam", "2fmCon", "Duplex", "Twnhs", "TwnhsE"]
ct_2 = ColumnTransformer([
    (f"inter_term_{position}", poly, [f"one_hot__Bldg Type_{bldg_type}", "keep__Gr Liv Area"])
    for position, bldg_type in enumerate(bldg_types, start=1)
])

# Pandas output is required so that stage 2 can select stage-1 columns by name.
pipeline_3 = Pipeline(
  [("one_hot_enc", ct_1),
   ("inter_term", ct_2),
  ("linear_regression", lr)]
).set_output(transform = "pandas")

pipeline_3_fitted = pipeline_3.fit(X_train, y_train)
preds_3 = pipeline_3_fitted.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse_3 = np.sqrt(mean_squared_error(y_test, preds_3))

# Append this model's score as the next row of the results table.
new_row = [rmse_3, "Gr Liv Area, Bldg Type, and Interactions"]
rmse_df.loc[len(rmse_df.index)] = new_row
rmse_df



Unnamed: 0,RMSE,Predictor Vars
0,53916.474342,"Gr Liv Area, TotRms AbvGrd"
1,51902.382754,"Gr Liv Area, TotRms AbvGrd, Bldg Type"
2,51468.847174,"Gr Liv Area, Bldg Type, and Interactions"


In [26]:
# Model 4: using a 5-degree polynomial on size, a 5-degree polynomial on
# number of rooms, and also building type.

lr = LinearRegression()
poly = PolynomialFeatures(5)
enc = OneHotEncoder(sparse_output=False)

# Each numeric column gets its own degree-5 expansion, so no cross terms
# between size and number of rooms are created.
ct_1 = ColumnTransformer([
    ("one_hot", enc, ["Bldg Type"]),
    ("poly_deg5_1", poly, ["Gr Liv Area"]),
    ("poly_deg5_2", poly, ["TotRms AbvGrd"])
])

pipeline_4 = Pipeline(
  [("col_transform", ct_1),
  ("linear_regression", lr)]
).set_output(transform = "pandas")

pipeline_4_fitted = pipeline_4.fit(X_train, y_train)
preds_4 = pipeline_4_fitted.predict(X_test)
# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse_4 = np.sqrt(mean_squared_error(y_test, preds_4))

# Append this model's score as the next row of the results table.
new_row = [rmse_4, "Gr Liv Area Degree 5, TotRms AbvGrd Degree 5, Bldg Type"]
rmse_df.loc[len(rmse_df.index)] = new_row
rmse_df

Unnamed: 0,RMSE,Predictor Vars
0,53916.474342,"Gr Liv Area, TotRms AbvGrd"
1,51902.382754,"Gr Liv Area, TotRms AbvGrd, Bldg Type"
2,51468.847174,"Gr Liv Area, Bldg Type, and Interactions"
3,54087.215042,"Gr Liv Area Degree 5, TotRms AbvGrd Degree 5, ..."


In [27]:
# Lowest test RMSE first.
rmse_df.sort_values("RMSE")

Unnamed: 0,RMSE,Predictor Vars
2,51468.847174,"Gr Liv Area, Bldg Type, and Interactions"
1,51902.382754,"Gr Liv Area, TotRms AbvGrd, Bldg Type"
0,53916.474342,"Gr Liv Area, TotRms AbvGrd"
3,54087.215042,"Gr Liv Area Degree 5, TotRms AbvGrd Degree 5, ..."


#### The model that performed best was the model with Gr Liv Area, Bldg Type, and interactions between the two.

#### Cross-Validation
Procedure for 5-fold cross-validation:
1. Randomly divide the houses into 5 sets. Call these fold1, fold2, ..., fold5.
2. Make fold1 the test set, and fold2-fold5 the train set
3. Fit the model on the houses in the training set, predict the prices of the houses in the test set, and record the resulting R-squared.
4. Repeat 2 and 3 and let each fold have a turn as the test set.
5. Take the average of the 5 different R-squared values

In [28]:
from sklearn.model_selection import cross_val_score

# Cross-validation does its own splitting, so it is given the full data set.
y = ames["SalePrice"]
X = ames.drop("SalePrice", axis = 1)

ct = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)
lr_pipeline_1.set_output(transform="pandas")

# One R-squared per fold (10 folds).
scores = cross_val_score(lr_pipeline_1, X, y, cv=10, scoring='r2')
scores

array([0.53292087, 0.48425809, 0.21608789, 0.55091396, 0.39478537,
       0.38370869, 0.65638132, 0.4604257 , 0.65604063, 0.46110802])

In [29]:
# Average the per-fold R-squared values into a single cross-validated estimate.
np.mean(scores)

0.47966305427780015

### Practice Activity: Cross Validation
Once again consider four modeling options for house price:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [30]:
# Same order as the rows of rmse_df (model 1 through model 4).
cv_pipelines = [pipeline_1, pipeline_2, pipeline_3, pipeline_4]

# The scorer returns NEGATIVE RMSE per fold; abs() turns it back into an RMSE
# before averaging over the 5 folds.
rmse_df["5CV_RMSE"] = [
    abs(cross_val_score(pipe, X, y, cv=5, scoring="neg_root_mean_squared_error")).mean()
    for pipe in cv_pipelines
]


In [31]:
# Lowest cross-validated RMSE first.
rmse_df.sort_values("5CV_RMSE")

Unnamed: 0,RMSE,Predictor Vars,5CV_RMSE
2,51468.847174,"Gr Liv Area, Bldg Type, and Interactions",53440.886755
1,51902.382754,"Gr Liv Area, TotRms AbvGrd, Bldg Type",54168.081429
0,53916.474342,"Gr Liv Area, TotRms AbvGrd",55806.326349
3,54087.215042,"Gr Liv Area Degree 5, TotRms AbvGrd Degree 5, ...",56303.178211


#### The model we prefer is still the model with interactions. Averaging over the folds shows that it actually performs somewhat worse than the single train/test split at the beginning suggested, but my earlier conclusion still holds.

#### Model Tuning

In [32]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("polynomial", PolynomialFeatures(), ["Gr Liv Area"]),
    ],
    remainder="drop",
)

lr_pipeline_poly = Pipeline(
    steps=[
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression()),
    ]
)
lr_pipeline_poly.set_output(transform="pandas")

# Parameter path: <pipeline step>__<transformer name>__<parameter>; degrees 1 through 9.
degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(estimator=lr_pipeline_poly, param_grid=degrees, cv=5, scoring='r2')

In [33]:
# Run the search: every candidate degree is cross-validated on the full data.
gscv_fitted = gscv.fit(X, y)

# cv_results_ is a dict with timings, parameters and scores for every candidate.
gscv_fitted.cv_results_

{'mean_fit_time': array([0.00771232, 0.00778399, 0.01406164, 0.00550132, 0.00396399,
        0.00402951, 0.00446267, 0.0043016 , 0.00435343]),
 'std_fit_time': array([4.77763727e-03, 3.23396414e-03, 6.75467624e-03, 3.05752405e-03,
        6.73467962e-05, 2.67173524e-05, 2.70040524e-04, 6.07819446e-05,
        1.25571478e-04]),
 'mean_score_time': array([0.00332651, 0.00205212, 0.00509195, 0.00174155, 0.00168881,
        0.0016964 , 0.00180655, 0.00178132, 0.00177422]),
 'std_score_time': array([2.12556488e-03, 4.11382879e-04, 2.90563536e-03, 8.96660040e-05,
        1.90637087e-05, 4.34785549e-06, 1.17592665e-04, 5.91488770e-05,
        4.33593143e-05]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'params': [{'preprocessing__polynomial__degree': 1},
  {'preprocessing__polynomial__degree

### This gives us too much information. Instead we want to access the cross-validated metric.

In [34]:
# Mean cross-validated score for each candidate, in the order the candidates were tried.
gscv_fitted.cv_results_['mean_test_score']

array([ 0.52988868,  0.5314061 ,  0.55123636,  0.54211913,  0.45186012,
        0.33383744,  0.02932172, -0.96809611, -4.54559382])

In [35]:
# Pair each candidate degree with its mean cross-validated score.
degree_scores = {
    "degrees": np.arange(1, 10),
    "scores": gscv_fitted.cv_results_['mean_test_score'],
}
pd.DataFrame(degree_scores)

Unnamed: 0,degrees,scores
0,1,0.529889
1,2,0.531406
2,3,0.551236
3,4,0.542119
4,5,0.45186
5,6,0.333837
6,7,0.029322
7,8,-0.968096
8,9,-4.545594


### Practice Activity: Grid Search
Consider one hundred modeling options for house price:

1. House size, trying degrees 1 through 10
2. Number of rooms, trying degrees 1 through 10
3. Building Type

Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [36]:
# Separate polynomial expansions so each variable's degree can be tuned independently.
ct_poly = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("polynomial_house_size", PolynomialFeatures(), ["Gr Liv Area"]),
        ("polynomial_room_num", PolynomialFeatures(), ["TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_poly = Pipeline(
    steps=[
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression()),
    ]
)
lr_pipeline_poly.set_output(transform="pandas")

# 10 degrees x 10 degrees = 100 candidate models.
candidate_degrees = np.arange(1, 11)
degrees = {
    "preprocessing__polynomial_house_size__degree": candidate_degrees,
    "preprocessing__polynomial_room_num__degree": candidate_degrees,
}

gscv = GridSearchCV(
    estimator=lr_pipeline_poly,
    param_grid=degrees,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

In [37]:
# Cross-validate every combination in the parameter grid on the full data.
gscv_fitted = gscv.fit(X, y)

In [38]:
results = gscv_fitted.cv_results_

# One record per candidate: the two degrees tried and the mean CV score,
# negated because the scorer reports NEGATIVE RMSE.
data_to_frame = [
    {
        'Gr Liv Area Deg': params['preprocessing__polynomial_house_size__degree'],
        'TotRms AbvGrd Deg': params['preprocessing__polynomial_room_num__degree'],
        'RMSE': -mean_score,
    }
    for params, mean_score in zip(results['params'], results['mean_test_score'])
]

df = pd.DataFrame(data_to_frame)

# Five best candidates (lowest RMSE first).
df.sort_values(by=["RMSE"]).head()

Unnamed: 0,Gr Liv Area Deg,TotRms AbvGrd Deg,RMSE
20,3,1,52781.985438
33,4,4,52808.685447
34,4,5,52831.98798
21,3,2,52837.445387
36,4,7,52956.07659


In [42]:
# Rebuild the pipeline with the degrees from the top row of the grid-search
# table (3 for living area, 1 for number of rooms) and fit it on the training split.
ct_poly = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("polynomial_house_size", PolynomialFeatures(3), ["Gr Liv Area"]),
        ("polynomial_room_num", PolynomialFeatures(1), ["TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_poly = Pipeline(
    steps=[
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression()),
    ]
)
lr_pipeline_poly.set_output(transform="pandas")

fitted_lr_pipeline = lr_pipeline_poly.fit(X_train, y_train)

In [43]:
# Score the refitted pipeline on the held-out test split.
y_preds = fitted_lr_pipeline.predict(X_test)
r2_score(y_true=y_test, y_pred=y_preds)

0.5885255839334983

### Q1: The best model uses a degree 3 polynomial on Gr Liv Area and a degree 1 (linear) term for TotRms AbvGrd.

### Q2: The grid search could end up taking a long time to run, and laying out the results at the end in a for loop was a little annoying. Instead, we could start with a few spaced-out tuning values, say 1, 4, 6, and 10, for each of the variables. Then, if the values the model settled on were 4 for var1 and 6 for var2, we could try 2, 3, 4, 5 for var1 and 5, 6, 7, 8, 9 for var2, narrowing in on the optimal values step by step.