In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
ames = pd.read_csv("AmesHousing.csv")
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [4]:
y = ames["SalePrice"]
X = ames.drop(columns="SalePrice", axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [6]:
#model 1: Using only the size and number of rooms

from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
  [
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


In [7]:
lr_pipeline = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")
lr_fitted = lr_pipeline.fit(X_train, y_train)


In [8]:
y_pred_model1  = lr_fitted.predict(X_test)
r2_score(y_test, y_pred_model1)

0.5591948812206817

In [9]:
#model 2


ct_2= ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

model2_pipeline = Pipeline(
  [("preprocessing", ct_2),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")
model2_fitted = model2_pipeline.fit(X_train, y_train)
y_pred_model2  = model2_fitted.predict(X_test)
r2_score(y_test, y_pred_model2)

0.5992139661113314

In [10]:
#model 3

ct_step_1= ColumnTransformer(
    [
      ("standardize", StandardScaler(), ["Gr Liv Area"]),
      ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
    ],
    remainder = "drop"
).set_output(transform="pandas")

# transformed = ct_step_1.fit_transform(X_train)
# transformed

In [11]:
ct_step_2 = ColumnTransformer(
    [
      ("interaction1", PolynomialFeatures(interaction_only= True), ["standardize__Gr Liv Area", "dummify__Bldg Type_1Fam"]),
      ("interaction2", PolynomialFeatures(interaction_only= True), ["standardize__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
      ("interaction3", PolynomialFeatures(interaction_only= True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
      ("interaction4", PolynomialFeatures(interaction_only= True), ["standardize__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
      ("interaction5", PolynomialFeatures(interaction_only= True), ["standardize__Gr Liv Area", "dummify__Bldg Type_TwnhsE"])
    ],
    remainder = "passthrough"
).set_output(transform="pandas")

In [14]:
# ct_step_2.fit_transform(transformed)

In [12]:
model3_pipeline = Pipeline(
  [("preprocessing", ct_step_1),
   ("interacion", ct_step_2),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")
model3_fitted = model3_pipeline.fit(X_train, y_train)
y_pred_model3  = model3_fitted.predict(X_test)
r2_score(y_test, y_pred_model3)

0.6113459026161346

In [23]:
#model 4  5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.


ct_step_1= ColumnTransformer(
    [
      ("standardize", StandardScaler(), ["TotRms AbvGrd"]),
      ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])
    ],
    remainder = "drop"
).set_output(transform="pandas")

transformed = ct_step_1.fit_transform(X_train)

ct_step_2 = ColumnTransformer(
    [
      ("polynomial1", PolynomialFeatures(degree=5), ["standardize__TotRms AbvGrd"]),
      ("polynomial2", PolynomialFeatures(degree=5), ["dummify__Bldg Type_1Fam"])
    ],
    remainder = "drop"
).set_output(transform="pandas")

# ct_step_2.fit_transform(transformed)
model4_pipeline = Pipeline(
  [("preprocessing", ct_step_1),
   ("interacion", ct_step_2),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

In [24]:
model4_fitted = model4_pipeline.fit(X_train, y_train)
y_pred_model4  = model4_fitted.predict(X_test)
r2_score(y_test, y_pred_model4)

0.26390785279973217

In [14]:
from sklearn.model_selection import cross_val_score

X = ames.drop("SalePrice", axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")


scores = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring='r2')
scores

array([0.53197809, 0.53225302, 0.43051812, 0.56616942, 0.60636221])

In [15]:
scores.mean()


0.5334561732637108

In [25]:
score1 = (cross_val_score(lr_pipeline, X, y, cv=5, scoring='r2')).mean()
score2 = (cross_val_score(model2_pipeline, X, y, cv=5, scoring='r2')).mean()
score3 = (cross_val_score(model3_pipeline, X, y, cv=5, scoring='r2')).mean()
score4 = (cross_val_score(model4_pipeline, X, y, cv=5, scoring='r2')).mean()


In [27]:
R2Scores = pd.DataFrame({"Model" : ["Model 1", "Model 2", "Model 3", "Model 4"], "R2 Score" : [score1, score2, score3, score4]})
R2Scores.sort_values("R2 Score", ascending= False)

#This states that model 3 is the best with an R2 of .5448, without cross val, model 3 was the best but with a higher r2 score

Unnamed: 0,Model,R2 Score
2,Model 3,0.54477
1,Model 2,0.533456
0,Model 1,0.504209
3,Model 4,0.224686


In [28]:
from sklearn.model_selection import GridSearchCV

ct_poly = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial", PolynomialFeatures(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline_poly = Pipeline(
  [("preprocessing", ct_poly),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial__degree': np.arange(1, 10)}

gscv = GridSearchCV(lr_pipeline_poly, degrees, cv = 5, scoring='r2')

In [29]:
gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_

{'mean_fit_time': array([0.05461555, 0.06137147, 0.04283214, 0.03633232, 0.0313343 ,
        0.04798107, 0.02136884, 0.01854639, 0.01887398]),
 'std_fit_time': array([0.0095543 , 0.01507286, 0.01364535, 0.0131133 , 0.01267348,
        0.02434488, 0.0063961 , 0.00076719, 0.00099813]),
 'mean_score_time': array([0.0315938 , 0.03137927, 0.02152319, 0.02084579, 0.01313572,
        0.01510296, 0.00986218, 0.00997286, 0.01015286]),
 'std_score_time': array([0.01646285, 0.00840299, 0.00571476, 0.00783358, 0.0030269 ,
        0.00632613, 0.0002177 , 0.00046236, 0.00029043]),
 'param_preprocessing__polynomial__degree': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=999999),
 'params': [{'preprocessing__polynomial__degree': 1},
  {'preprocessing__polynomial__degree': 2},
  {'preprocessing__polynomial__degree': 3},
  {'preprocessing__polynomial__degree': 4},
  {'preprocessing

In [30]:
gscv_fitted.cv_results_['mean_test_score']

array([ 0.52988868,  0.5314061 ,  0.55123644,  0.54201499,  0.45186012,
        0.33383743,  0.02932179, -0.96809594, -4.54560239])

In [31]:
pd.DataFrame(data = {"degrees": np.arange(1, 10), "scores": gscv_fitted.cv_results_['mean_test_score']})

Unnamed: 0,degrees,scores
0,1,0.529889
1,2,0.531406
2,3,0.551236
3,4,0.542015
4,5,0.45186
5,6,0.333837
6,7,0.029322
7,8,-0.968096
8,9,-4.545602


In [None]:
#Consider one hundred modeling options for house price:

# House size, trying degrees 1 through 10
# Number of rooms, trying degrees 1 through 10
# Building Type
# Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

# Q1: Which model performed the best?

# Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [41]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
        ("polynomial_size", PolynomialFeatures(include_bias= False), ["Gr Liv Area"]),
        ("polynomial_rooms", PolynomialFeatures(include_bias = False), ["TotRms AbvGrd"])
    ],
).set_output(transform = "pandas")
ct.fit_transform(X_train)

tuning_model = Pipeline(
    [("preprocessing", ct),
    ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial_size__degree': np.arange(1, 11),
        'preprocessing__polynomial_rooms__degree': np.arange(1, 11)
}

gscv = GridSearchCV(tuning_model, degrees, cv=5, scoring='r2')
gscv_fitted = gscv.fit(X, y)



In [58]:
results = pd.DataFrame({"degrees": gscv_fitted.cv_results_['params'], "scores": gscv_fitted.cv_results_['mean_test_score']})
print(results.sort_values("scores", ascending= False).head(1))


                                             degrees    scores
2  {'preprocessing__polynomial_rooms__degree': 1,...  0.557641


In [55]:
#downside is that it took a bit of time for the result to be computed