In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

In [None]:
# Load the Ames housing data. The absolute path is environment-specific;
# adjust it when running outside the original environment.
ames = pd.read_csv(filepath_or_buffer="/content/AmesHousing.csv")

# Stand-alone ordinary-least-squares estimator.
lr = LinearRegression()


## Cross-validation

In [None]:
# Predictors are every column except the sale price; the sale price is the target.
X = ames.drop(columns="SalePrice")
y = ames["SalePrice"]

# One-hot encode building type and standardize the two numeric columns.
# Every column not named here is dropped before the regression.
ct = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
).set_output(transform="pandas")

# 5-fold cross-validated R^2: one score per fold.
scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring="r2")
scores

array([0.53197809, 0.53225302, 0.43051812, 0.56616942, 0.60636221])

In [None]:
# Average R^2 across the cross-validation folds.
np.mean(scores)

0.5334561732637108

Using only the size and number of rooms.

In [None]:
# Preprocessing: keep only the two numeric columns, standardized.
ct = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

# Evaluate both metrics in a single 5-fold pass so the pipeline is fitted
# once per fold instead of once per fold per metric.
cv_results = cross_validate(
  lr_pipeline, X, y, cv=5,
  scoring=["r2", "neg_root_mean_squared_error"]
)
scores = cv_results["test_r2"]
scores1 = cv_results["test_neg_root_mean_squared_error"]

print(scores.mean())
# The scorer reports negated RMSE; abs() turns the mean back into a positive error.
print(abs(scores1.mean()))

0.504208752508862
55806.32634926364


Using size, number of rooms, and building type.

In [None]:
# Preprocessing: one-hot encode building type and standardize the two
# numeric columns; everything else is dropped.
ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")


lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

# Evaluate both metrics in a single 5-fold pass so the pipeline is fitted
# once per fold instead of once per fold per metric.
cv_results = cross_validate(
  lr_pipeline, X, y, cv=5,
  scoring=["r2", "neg_root_mean_squared_error"]
)
scores = cv_results["test_r2"]
scores1 = cv_results["test_neg_root_mean_squared_error"]

print(scores.mean())
# The scorer reports negated RMSE; abs() turns the mean back into a positive error.
print(abs(scores1.mean()))

0.5334561732637108
54137.64338938809


Using size and building type, and their interaction.

In [None]:
# Stage 1: one-hot encode building type and standardize living area.
# Pandas output is required so stage 2 can select columns by their
# "<transformer>__<column>" names.
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
  ("standardize", StandardScaler(), ["Gr Liv Area"])],
  remainder = "drop"
).set_output(transform = "pandas")

# Stage 2: one size-by-type interaction per building-type dummy.
# include_bias=False drops the constant column each PolynomialFeatures would
# otherwise emit; LinearRegression already fits an intercept, so those five
# identical columns added nothing to the model.
ct_inter = ColumnTransformer(
  [
    ("interaction", PolynomialFeatures(interaction_only = True, include_bias = False), ["standardize__Gr Liv Area", "dummify__Bldg Type_1Fam"]),
    ("interaction1", PolynomialFeatures(interaction_only = True, include_bias = False), ["standardize__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
    ("interaction2", PolynomialFeatures(interaction_only = True, include_bias = False), ["standardize__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction3", PolynomialFeatures(interaction_only = True, include_bias = False), ["standardize__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction4", PolynomialFeatures(interaction_only = True, include_bias = False), ["standardize__Gr Liv Area", "dummify__Bldg Type_TwnhsE"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
  [("preprocessing", ct_dummies),
   ("preprocessing2", ct_inter),
  ("linear_regression", LinearRegression())]
)

# Evaluate both metrics in a single 5-fold pass so the pipeline is fitted
# once per fold instead of once per fold per metric.
cv_results = cross_validate(
  lr_pipeline, X, y, cv=5,
  scoring=["r2", "neg_root_mean_squared_error"]
)
scores = cv_results["test_r2"]
scores1 = cv_results["test_neg_root_mean_squared_error"]

print(scores.mean())
# The scorer reports negated RMSE; abs() turns the mean back into a positive error.
print(abs(scores1.mean()))

0.544770254238968
53436.84123518415


Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.


In [None]:
# Stage 1: one-hot encode building type and standardize both numeric columns.
# Pandas output is required so stage 2 can select columns by their
# "<transformer>__<column>" names.
ct_dummies = ColumnTransformer(
  [("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
  ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])],
  remainder = "drop"
).set_output(transform = "pandas")

# Stage 2: degree-5 polynomial expansion of each standardized numeric column;
# the dummy columns pass through untouched. include_bias=False avoids two
# constant columns that would merely duplicate LinearRegression's intercept.
ct_poly = ColumnTransformer(
  [
    ("poly", PolynomialFeatures(degree=5, interaction_only = False, include_bias = False), ["standardize__TotRms AbvGrd"]),
    ("poly1", PolynomialFeatures(degree=5, interaction_only = False, include_bias = False), ["standardize__Gr Liv Area"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

lr_pipeline = Pipeline(
  [("preprocessing", ct_dummies),
   ("preprocessing2", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

# Evaluate both metrics in a single 5-fold pass so the pipeline is fitted
# once per fold instead of once per fold per metric.
cv_results = cross_validate(
  lr_pipeline, X, y, cv=5,
  scoring=["r2", "neg_root_mean_squared_error"]
)
scores = cv_results["test_r2"]
scores1 = cv_results["test_neg_root_mean_squared_error"]

print(scores.mean())
# The scorer reports negated RMSE; abs() turns the mean back into a positive error.
print(abs(scores1.mean()))


0.5106643234404126
55176.965943380856


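Model 3 (size, building type, and their interaction) is the best of the four: it has the highest mean cross-validated R² (about 0.545) and the lowest RMSE (about 53,437).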

## Tuning

In [None]:
# Stage 1: one-hot encode building type and standardize living area.
# Standardizing BEFORE the polynomial expansion matters: high powers of the
# raw, unscaled column give an ill-conditioned design matrix and an unstable
# least-squares fit. Pandas output lets stage 2 select columns by name.
ct_scale = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Stage 2: polynomial expansion of the standardized column; the dummy columns
# pass through. The degree is left at its default here and set by the grid
# search. include_bias=False because LinearRegression fits its own intercept.
ct_poly = ColumnTransformer(
  [
    ("polynomial", PolynomialFeatures(include_bias = False), ["standardize__Gr Liv Area"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

# The polynomial stage keeps the step name "preprocessing" so the grid key
# 'preprocessing__polynomial__degree' is unchanged.
lr_pipeline_poly = Pipeline(
  [("scaling", ct_scale),
   ("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

# Candidate polynomial degrees 1 through 9.
degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [None]:
# Run the grid search; fit() returns the search object itself, so the fitted
# search is bound to its own name afterwards.
gscv.fit(X, y)
gscv_fitted = gscv

# Full dictionary of per-candidate cross-validation results.
gscv_fitted.cv_results_

{'mean_fit_time': array([0.03542948, 0.04030037, 0.0178256 , 0.02168918, 0.01830306,
        0.01798182, 0.01780739, 0.01843557, 0.01857958]),
 'std_fit_time': array([0.01371656, 0.02684485, 0.00032444, 0.0042769 , 0.00038145,
        0.00046289, 0.00026831, 0.00049416, 0.00029878]),
 'mean_score_time': array([0.01473002, 0.0159565 , 0.00955772, 0.01210489, 0.01078916,
        0.00986848, 0.00998712, 0.01048956, 0.01044073]),
 'std_score_time': array([0.00413865, 0.00793411, 0.00020166, 0.00523731, 0.00197782,
        0.00019139, 0.00086691, 0.0018496 , 0.00138233]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'params': [{'preprocessing__polynomial__degree': 1},
  {'preprocessing__polynomial__degree': 2},
  {'preprocessing__polynomial__degree': 3},
  {'preprocessing__polynomial__degree': 4},
  {'preprocessing

In [None]:
# Mean cross-validated R^2 for each candidate, in grid order.
mean_r2 = gscv_fitted.cv_results_["mean_test_score"]
mean_r2

array([ 0.52988868,  0.5314061 ,  0.55123644,  0.54201499,  0.45186012,
        0.33383743,  0.02932179, -0.96809594, -4.54560239])

In [None]:
# Pair each mean CV R^2 with the degree that produced it. The degrees are read
# from the fitted search's own parameter records rather than re-typed, so the
# two columns cannot drift out of alignment if the grid is edited.
grid_results = gscv_fitted.cv_results_
pd.DataFrame(
    data={
        "degrees": [
            candidate["preprocessing__polynomial__degree"]
            for candidate in grid_results["params"]
        ],
        "scores": grid_results["mean_test_score"],
    }
)

Unnamed: 0,degrees,scores
0,1,0.529889
1,2,0.531406
2,3,0.551236
3,4,0.542015
4,5,0.45186
5,6,0.333837
6,7,0.029322
7,8,-0.968096
8,9,-4.545602


Consider one hundred modeling options for house price:

House size, trying degrees 1 through 10
Number of rooms, trying degrees 1 through 10
Building Type
Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [None]:
# Stage 1: one-hot encode building type and standardize both numeric columns.
# Standardizing BEFORE the polynomial expansion matters: high powers of the
# raw, unscaled columns give an ill-conditioned design matrix and an unstable
# least-squares fit. Pandas output lets stage 2 select columns by name.
ct_scale = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
).set_output(transform = "pandas")

# Stage 2: a separate polynomial expansion per standardized column; the dummy
# columns pass through. Degrees are set by the grid search.
# include_bias=False because LinearRegression fits its own intercept.
ct_poly = ColumnTransformer(
  [
    ("polynomial", PolynomialFeatures(include_bias = False), ["standardize__Gr Liv Area"]),
    ("polynomial1", PolynomialFeatures(include_bias = False), ["standardize__TotRms AbvGrd"])
  ],
  remainder = "passthrough"
).set_output(transform = "pandas")

# The polynomial stage keeps the step name "preprocessing" so both grid keys
# below are unchanged.
lr_pipeline_poly = Pipeline(
  [("scaling", ct_scale),
   ("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

# 10 x 10 grid: degrees 1 through 10 for each numeric column.
polyreg = {
    'preprocessing__polynomial__degree': np.arange(1, 11),
    'preprocessing__polynomial1__degree': np.arange(1, 11)
}

gscv = GridSearchCV(lr_pipeline_poly, polyreg, cv = 5, scoring='r2')


In [None]:
# Run the grid search; fit() returns the search object itself, so the fitted
# search is bound to its own name afterwards.
gscv.fit(X, y)
gscv_fitted = gscv

# Full dictionary of per-candidate cross-validation results.
gscv_fitted.cv_results_

{'mean_fit_time': array([0.04425426, 0.06943216, 0.07240076, 0.0327909 , 0.05632915,
        0.13203063, 0.05879312, 0.04782686, 0.05704975, 0.04358191,
        0.02778916, 0.02272754, 0.02304897, 0.02228627, 0.02206755,
        0.02794442, 0.05036769, 0.04991555, 0.04387555, 0.04174156,
        0.0302577 , 0.02248573, 0.02204604, 0.02384973, 0.02198901,
        0.06249747, 0.03957171, 0.0369669 , 0.03943949, 0.03897867,
        0.02883897, 0.02368398, 0.02212739, 0.02308865, 0.02867188,
        0.04308267, 0.04893417, 0.04248314, 0.04383059, 0.05138197,
        0.0338861 , 0.02277637, 0.02472539, 0.07093019, 0.07801738,
        0.07731805, 0.11630349, 0.08644938, 0.16447401, 0.07585015,
        0.03334169, 0.02432942, 0.03817716, 0.06889057, 0.04788246,
        0.0881856 , 0.03047986, 0.05204849, 0.05215492, 0.06698046,
        0.03267765, 0.04819798, 0.03401084, 0.06696324, 0.06819367,
        0.05424232, 0.0583796 , 0.04070411, 0.04199553, 0.06096087,
        0.03009272, 0.05052242,

In [None]:
# Mean cross-validated R^2 for each candidate, in grid order.
mean_r2 = gscv_fitted.cv_results_["mean_test_score"]
mean_r2

array([ 5.32882439e-01,  5.37471938e-01,  5.57640609e-01,  5.49239651e-01,
        4.51860123e-01,  3.33837438e-01,  2.93217770e-02, -9.68094802e-01,
       -4.54560839e+00, -1.61879353e+01,  5.32382847e-01,  5.33567353e-01,
        5.56857257e-01,  5.50157883e-01,  4.51860121e-01,  3.33837438e-01,
        2.93217540e-02, -9.68096194e-01, -4.54560441e+00, -1.61879378e+01,
        5.35924169e-01,  5.34134134e-01,  5.54039049e-01,  5.50627573e-01,
        5.05207779e-01,  3.33837438e-01,  2.93216723e-02, -9.68095979e-01,
       -4.54560441e+00, -1.61879378e+01,  5.41528749e-01,  5.35417599e-01,
        5.50392432e-01,  5.56932107e-01,  4.96715171e-01,  3.33837440e-01,
        2.93216921e-02, -9.68095979e-01, -4.54560441e+00, -1.61879378e+01,
        5.41066183e-01,  5.30267305e-01,  5.46549255e-01,  5.56413549e-01,
        4.92694119e-01,  3.33837437e-01,  2.93216921e-02, -9.68095979e-01,
       -4.54560441e+00, -1.61879378e+01,  5.34862257e-01,  5.33313563e-01,
        5.45170683e-01,  

In [None]:
# One row per grid candidate, one column per tuned parameter.
params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])

# Show the first few candidates; without this the cell displays nothing.
params_df.head()

Unnamed: 0,preprocessing__polynomial1__degree,preprocessing__polynomial__degree
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5


In [None]:
# Attach the mean CV R^2 to a copy of the parameter table, then list the
# candidates from best to worst.
results_df = params_df.copy()
results_df["scores"] = gscv_fitted.cv_results_["mean_test_score"]
results_df.sort_values("scores", ascending=False)

Unnamed: 0,preprocessing__polynomial1__degree,preprocessing__polynomial__degree,scores
2,1,3,0.557641
33,4,4,0.556932
12,2,3,0.556857
43,5,4,0.556414
22,3,3,0.554039
...,...,...,...
89,9,10,-16.188760
99,10,10,-16.188760
90,10,1,-184.221203
91,10,2,-189.473656


Q1: In the results above, the best model uses the number of rooms at degree 1 and house size (living area) at degree 3; it has the highest mean cross-validated R² of the 100 candidates, about 0.5576.