In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### Part A

In [73]:
# Load the train and test datasets into a Pandas DataFrame.
trainData = pd.read_csv("/home/cwbenton/projects/STAT-4140/HW/HW3/house-prices/train.csv")
testData = pd.read_csv("/home/cwbenton/projects/STAT-4140/HW/HW3/house-prices/test.csv")

### Part B

In [74]:
# Explore categorical and continuous input variables appropriately
#trainData.info()

In [75]:
features = trainData.drop(columns=["SalePrice"])
target = trainData["SalePrice"]

# Grab all the categorical features (ordinal data is categorical, but coded as numbers)
categorical_features = trainData.select_dtypes(include=object).columns
categorical_features = categorical_features.union(["MSSubClass", "OverallQual", "OverallCond"])

# Grab all the continuous features (some categorical data are coded as numbers)
continuous_features = trainData.select_dtypes(exclude=object).columns
continuous_features = continuous_features.drop(["Id", "MSSubClass", "OverallQual", "OverallCond"])

In [76]:
# View the categorical features
print(categorical_features)

# View the continuous features
print(continuous_features)

Index(['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd',
       'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond',
       'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC',
       'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig',
       'LotShape', 'MSSubClass', 'MSZoning', 'MasVnrType', 'MiscFeature',
       'Neighborhood', 'OverallCond', 'OverallQual', 'PavedDrive', 'PoolQC',
       'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street',
       'Utilities'],
      dtype='object')
Index(['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'Kitch

In [77]:
# See if there are NaN values for our variables. We will have to do some imputing.
trainData.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [78]:
'''
We can impute all continuous features with the median. As for categorical imputation, we may need
to try different methods. It may not be wise to do most_frequent for all categorical variables. Let's
take a look at them.
'''
toReplace = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
for col in toReplace:
    trainData.fillna({col: "NA"}, inplace=True)

'''
We will imupte the rest of the continuous variables with the median, however, for houses without
a garage, I will replace the garage built year with the year that the house was built. Also, the
MasVnrType variable uses 'None' instead of 'NA'
'''
trainData.fillna({'GarageYrBlt': trainData['YearBuilt']}, inplace=True)
trainData.fillna({'MasVnrType': "None"}, inplace=True)

# We use the fillna({col : value}, inplace=True) notation because Pandas documentation says it will be standard in Pandas 3.0.

In [79]:
# See if any NaN values remain
trainData.isnull().sum()

# The remaining nulls can be imputed with the pipeline created below.

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

### Part C/D: Preprocessing and Feature Engineering

In [80]:
'''
Upon looking at all histograms for the continuous features, there is skew. This should make sense
since some houses are larger than others (and some can be really big). Therefore, for imputation, we
will be using the median for all these continuous features.
Also, due to the outliers, we will be using RobustScaler to scale our continuous features.
'''
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Create a pipeline for continuous imputation.
continuous_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Create a pipeline for categorical imputation.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine these two pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('continuous', continuous_pipeline, continuous_features),
        ('categorical', categorical_pipeline, categorical_features)
    ])

# Create a pipeline with the preprocessing steps and additional steps if needed
cleaning_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# View the pipeline
cleaning_pipeline

In [81]:
# Fit our pipeline to the model.
train_data_preprocessed = cleaning_pipeline.fit_transform(trainData)

# Store column names after encoding
categorical_columns_encoded = preprocessor.named_transformers_['categorical']['onehot'].get_feature_names_out(categorical_features)

In [82]:
# Put the preprocessed data to a dataframe. Grab the column names and replace them
train_data_preprocessed = pd.DataFrame(train_data_preprocessed.toarray(), columns=list(continuous_features) + list(categorical_columns_encoded))
train_data_preprocessed

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,Street_Grvl,Street_Pave,Utilities_AllPub,Utilities_NoSeWa
0,-0.210526,-0.254076,0.652174,0.243243,1.193303,0.452790,0.0,-0.559829,-0.269652,-0.453608,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.578947,0.030015,0.065217,-0.486486,0.000000,0.834679,0.0,-0.330769,0.538308,0.343643,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,-0.052632,0.437624,0.608696,0.216216,0.986301,0.143910,0.0,-0.074359,-0.142289,-0.327933,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,-0.473684,0.017663,-1.260870,-0.648649,0.000000,-0.235170,0.0,0.106838,-0.468657,-0.247423,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,0.789474,1.181201,0.586957,0.162162,2.130898,0.381186,0.0,0.021368,0.305473,0.113893,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.368421,-0.385746,0.565217,0.162162,0.000000,-0.538435,0.0,0.812821,-0.076617,-0.263132,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1456,0.842105,0.913167,0.108696,-0.162162,0.724505,0.570727,163.0,0.190598,1.095522,1.936181,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1457,-0.157895,-0.107831,-0.695652,0.324324,0.000000,-0.152334,0.0,0.682906,0.319403,0.198331,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1458,-0.052632,0.058918,-0.500000,0.054054,0.000000,-0.469638,1029.0,-0.816239,0.172139,-0.017673,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


### Part E

In [83]:
# Split the train dataset into training and validation, and identify the target variable.
# Here, the target variable is SalePrice -- we want to see if we can predict if a passenger survived.
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(train_data_preprocessed, trainData["SalePrice"], test_size=0.20)

### Part F

In [84]:
# Implement Linear Regression, find best hyperparameters, and report the appropriate evaluation metrics.
from sklearn.linear_model import LinearRegression

# View parameters for Linear Regression
LinearRegression().get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [85]:
# Identify best hyperparameters through a grid.
model_linreg = LinearRegression()

# Explore possible hyperparameters.
param_grid_linreg = {
    'copy_X': [True, False],
    'fit_intercept': [True, False],
}

In [86]:
# Use grid search
from sklearn.model_selection import GridSearchCV

grid_f = GridSearchCV(model_linreg, param_grid_linreg, cv=5)

In [87]:
# Fit the training data to our model.
grid_f.fit(X_train, Y_train)

In [88]:
# See the best parameters
grid_f.best_params_

{'copy_X': True, 'fit_intercept': True}

In [89]:
# Predicted sale prices using our validation set.
y_pred_f = grid_f.predict(X_val)

In [90]:
from sklearn.metrics import mean_squared_error

# For regression, we show the MSE, not the accuracy.
mse_f = mean_squared_error(Y_val, y_pred_f)
print("Root Mean Squared Error:", np.sqrt(mse_f))

# Very low RMSE value -- this is good, but is this too good?

Root Mean Squared Error: 1.106966618393692e-08


### Part G

In [91]:
# Implement Random Forest Regression, find best hyperparameters, and report the appropriate evaluation metrics.
from sklearn.ensemble import RandomForestRegressor

# View parameters for Random Forest Regression
RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [92]:
# Identify best hyperparameters through a grid.
model_rfr = RandomForestRegressor()

# Explore possible hyperparameters.
param_grid_rfr = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['absolute_error', 'squared_error', 'poisson', 'friedman_mse'],
    'max_leaf_nodes': [None, 10, 20, 50],
    'n_jobs': [-1]
}

In [93]:
# Use grid search
grid_g = GridSearchCV(model_rfr, param_grid_rfr, cv=5)

In [94]:
# Fit the training data to our model.
grid_g.fit(X_train, Y_train)

In [95]:
# See the best parameters
grid_g.best_params_

{'criterion': 'poisson',
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'n_estimators': 200,
 'n_jobs': -1}

In [96]:
# Predicted sale prices using our validation set.
y_pred_g = grid_g.predict(X_val)

In [97]:
# For regression, we show the MSE, not the accuracy.
mse_g = mean_squared_error(Y_val, y_pred_g)
print("Root Mean Squared Error:", np.sqrt(mse_g))

# On average, our model is $30,000 off the true sale price.

Root Mean Squared Error: 29884.688468695625
