In [12]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


#Reading the data

In [13]:
house_df = pd.read_csv("/Users/conniechou/Downloads/AmesHousing.csv")
house_df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


# Model 1: using only size and number of rooms

In [14]:
#size = GrLivArea
#number of rooms = TotRms AbvGrd
X = house_df.drop("SalePrice", axis = 1) #keep all columns except ur y
y = house_df["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [15]:
lr = LinearRegression()
std_s = StandardScaler()

ct = ColumnTransformer(
    [("standardize", std_s, ["Gr Liv Area", "TotRms AbvGrd"])],
    remainder = "drop" #drop the columns we dont need
)

lr_pipeline = Pipeline(
  [("standardize", ct),
  ("linear_regression", lr)]
)

lr_pipeline_fitted = lr_pipeline.fit(X_train, y_train) #fit the model with training data (learn from training data)
y_preds = lr_pipeline_fitted.predict(X_test)

In [16]:
rmse1 = mean_squared_error(y_test, y_preds, squared = False)
rmse1

51648.26945316397

#Model 2: Using size, number of rooms, and building type

In [17]:
enc = OneHotEncoder(sparse_output = False)

ct = ColumnTransformer(
    [("standardize", std_s, ["Gr Liv Area", "TotRms AbvGrd"]),
     ("dummify", enc, ["Bldg Type"])],
    remainder = "drop" #drop the columns we dont need
)

lr_pipeline2 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", lr)]
)

lr_pipeline_fitted2 = lr_pipeline2.fit(X_train, y_train) #fit the model with training data (learn from training data)
y_preds2 = lr_pipeline_fitted2.predict(X_test)

In [18]:
rmse2 = mean_squared_error(y_test, y_preds2, squared = False)
rmse2

50172.44036368484

#Model 3: using size and building type, and their interaction

In [19]:
#size = numerical
#building type = dummy
enc = OneHotEncoder(sparse_output = False)

#first transform building type into dummy in its own column transformer
ct_dummy = ColumnTransformer(
    [("dummify", enc, ["Bldg Type"])], remainder = "passthrough"
).set_output(transform = "pandas")

ct_dummy.fit_transform(X_train)

#then another column transformer to have interaction between size and dummified building type
ct_interact = ColumnTransformer(
    [("interaction", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_1Fam"]),
    ("interaction2", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_2fmCon"]),
    ("interaction3", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Duplex"]),
    ("interaction4", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_Twnhs"]),
    ("interaction5", PolynomialFeatures(interaction_only = True), ["remainder__Gr Liv Area", "dummify__Bldg Type_TwnhsE"])],
).set_output(transform = "pandas")

lr_pipeline_inter = Pipeline(
    [("dummify", ct_dummy),
     ("interacting", ct_interact),
      ("linear regression", lr)]
).set_output(transform = "pandas")

#fitting model on training data
lr_pipeline_fitted3 = lr_pipeline_inter.fit(X_train, y_train) #fit the model with training data (learn from training data)
y_preds3 = lr_pipeline_fitted3.predict(X_test)

In [20]:
rmse3 = mean_squared_error(y_test, y_preds3, squared = False)
rmse3

49781.53976645589

#Model 4: using a 5-degree polynomial on size, a 5 degree polynomial on number of rooms, and also building type

In [21]:
#size = numerical
#building type = dummy
enc = OneHotEncoder(sparse_output = False)

#first transform building type into dummy in its own column transformer
ct_preprocess = ColumnTransformer(
    [("dummify", enc, ["Bldg Type"]),
     ("polynomial", PolynomialFeatures(degree = 5), ["Gr Liv Area", "TotRms AbvGrd"])], remainder = "drop"
).set_output(transform = "pandas")

ct_preprocess.fit_transform(X_train)

lr_pipeline_poly = Pipeline(
    [("preprocessing", ct_preprocess),
      ("linear regression", lr)]
).set_output(transform = "pandas")

#fitting model on training data
lr_pipeline_fitted4 = lr_pipeline_poly.fit(X_train, y_train) #fit the model with training data (learn from training data)
y_preds4 = lr_pipeline_fitted4.predict(X_test)

In [22]:
rmse4 = mean_squared_error(y_test, y_preds4, squared = False)
rmse4

51389.070588149676

The model that performed the best (that had the lowest RSME) is model 2.

#Cross Validation

In [23]:
#rsme cross validation for model 1
scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error')
-scores.mean()

55806.32634926364

In [24]:
#rsme cross validation for model 2
scores2 = cross_val_score(lr_pipeline2, X, y, cv=5, scoring='neg_root_mean_squared_error')
-scores2.mean()

54168.08142919383

In [25]:
#rsme cross validation for model 3
scores3 = cross_val_score(lr_pipeline_inter, X, y, cv=5, scoring='neg_root_mean_squared_error')
-scores3.mean()

53430.92197532802

In [26]:
#rsme cross validation for model 4
scores4 = cross_val_score(lr_pipeline_poly, X, y, cv=5, scoring='neg_root_mean_squared_error')
-scores4.mean()

61241.88398053136

TO INTERPRET RSME, TAKE THE ABSOLUTE VALUE
After taking the absolute value, we can see that model 3 performed better.

# Tuning

In [27]:
ct_tuning = ColumnTransformer(
  [("size_polynomial", PolynomialFeatures(), ["Gr Liv Area"]),
  ("rooms_polynomial", PolynomialFeatures(), ["TotRms AbvGrd"]),
   ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"])],
  remainder = "drop"
)

lr_pipeline_tuning = Pipeline(
  [("preprocessing", ct_tuning),
  ("linear_regression", lr)]
).set_output(transform="pandas")

degrees = {'preprocessing__size_polynomial__degree': np.arange(1, 11),
           "preprocessing__rooms_polynomial__degree": np.arange(1, 11)} #create a dictionary of the polynomials

gscv = GridSearchCV(lr_pipeline_tuning, degrees, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)

gscv_fitted.cv_results_ #ugly format, so put it into a pandas dataframe for easy reading

{'mean_fit_time': array([0.01564851, 0.01349711, 0.01461496, 0.01630807, 0.01728597,
        0.01646156, 0.01602664, 0.01663923, 0.01701298, 0.01436744,
        0.01526542, 0.02240667, 0.01383085, 0.0141654 , 0.02059355,
        0.01515145, 0.01384864, 0.01615996, 0.0185174 , 0.01423745,
        0.01394234, 0.01714735, 0.01984382, 0.01696134, 0.01560931,
        0.01402168, 0.01396179, 0.01443186, 0.01429801, 0.01438317,
        0.0132998 , 0.0135251 , 0.01380186, 0.01384964, 0.01410298,
        0.01509743, 0.02086859, 0.02109132, 0.01418896, 0.01688285,
        0.02102151, 0.01427021, 0.0140626 , 0.01425033, 0.0163013 ,
        0.02027321, 0.02021923, 0.01442204, 0.01431928, 0.01522188,
        0.01379457, 0.01471171, 0.01388254, 0.01390982, 0.01416588,
        0.01574149, 0.015025  , 0.01469984, 0.01494622, 0.01488891,
        0.01382432, 0.04547791, 0.01425533, 0.01417475, 0.01472316,
        0.01573882, 0.01598706, 0.01653366, 0.01745577, 0.01924624,
        0.01570139, 0.0145916 ,

In [29]:
gscv_df = pd.DataFrame(gscv_fitted.cv_results_)
gscv_df = gscv_df[["param_preprocessing__rooms_polynomial__degree",
                   "param_preprocessing__size_polynomial__degree",
                   "mean_test_score"]]
gscv_df.rename(columns = {
    "param_preprocessing__rooms_polynomial__degree" : "room_degrees",
    "param_preprocessing__size_polynomial__degree" : "size_degrees",
    "mean_test_score": "r2"
}, inplace = True)

gscv_df.sort_values(by = "r2", ascending = False).head()

Unnamed: 0,room_degrees,size_degrees,r2
2,1,3,0.557641
12,2,3,0.556857
33,4,4,0.556855
43,5,4,0.556531
22,3,3,0.554039



The best final model is the model with the polynomial degree of 3 for the size of the house and a polynomial degree of 1 for the number of rooms. The R^2 value is 0.557641.