## 1. Data reading & splitting

In [1]:
import seaborn as sns
import pandas as pd

#load dataset
url = '/Users/G/WBS Bootcamp/7. Supervised Machine Learning/Data/iter-7/housing_prices.csv'
housing = pd.read_csv(url)

#splitting
from sklearn.model_selection import train_test_split
X = housing.copy()
y = X.pop("SalePrice")

X.drop("Id", axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

## 2. Data Preparation (categorical features)

In [2]:
#isolation of the categorical features
ordered_categories_column_names = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                   'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence']


ExterQual_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
ExterCond_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtQual_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtCond_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
BsmtExposure_cat = ['NA', 'No', 'Mn', 'Av', 'Gd']
BsmtFinType1_cat = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
BsmtFinType2_cat = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
HeatingQC_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
KitchenQual_cat = ['Po', 'Fa', 'TA', 'Gd', "Ex"]
FireplaceQu_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
GarageFinish_cat = ['NA', 'Unf', 'RFn', 'Fin']
GarageQual_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
GarageCond_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
PoolQC_cat = ['Na', 'Po', 'Fa', 'TA', 'Gd', "Ex"]
Fence_cat = ['Na', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

ordinal_cats_list = [ExterQual_cat, ExterCond_cat, BsmtQual_cat, BsmtCond_cat, BsmtExposure_cat, BsmtFinType1_cat,
                     BsmtFinType2_cat, HeatingQC_cat, KitchenQual_cat, FireplaceQu_cat, GarageFinish_cat, GarageQual_cat, GarageCond_cat, PoolQC_cat, Fence_cat]

## 3. Pipeline preparation

In [3]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer



#preparation for the pipes
X_cat_columns = X.select_dtypes(exclude='number').copy()

X_num_columns = X.select_dtypes(include='number').copy()
X_cat_ordered_columns = X_cat_columns[ordered_categories_column_names]
X_cat_unordered_columns = X_cat_columns.drop(
    ordered_categories_column_names, axis=1)

# Numerical pipe
numeric_pipe = make_pipeline(SimpleImputer())

# Unorderd categorical pipe
unordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore')
)

# Orderd categorical pipeline
ordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Na"),
    OrdinalEncoder(categories=ordinal_cats_list,
                   handle_unknown='use_encoded_value', unknown_value=-1)
)

# Create a ColumnTransformer to apply the transformers to the appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, X_num_columns.columns),
        ('unordered', unordered_categoric_pipe, X_cat_unordered_columns.columns),
        ('ordered', ordered_categoric_pipe, X_cat_ordered_columns.columns)
    ])

In [4]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

# Combine the preprocessor and the model into a single pipeline
pipeline = make_pipeline(preprocessor, LinearRegression())
pipeline


# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

print(f"Improved Model Train RMSE: {train_rmse:.12f}")
print(f"Improved Model Test RMSE: {test_rmse:.12f}")

# Evaluate the model using R^2 score
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Improved Model Train R^2 Score: {train_r2:.2f}")
print(f"Improved Model Test R^2 Score: {test_r2:.2f}")


Improved Model Train RMSE: 22014.776989079150
Improved Model Test RMSE: 27200.412983119062
Improved Model Train R^2 Score: 0.92
Improved Model Test R^2 Score: 0.88


## Decision Tree

In [14]:
# initialize the pipe
dtR = DecisionTreeRegressor()
full_pipeline_dtR = make_pipeline(
    preprocessor, dtR)
dtR

In [15]:
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# train a DecisionTree with GridSearch cross validation
# parameter grid
param_grid_dtR = {
    'columntransformer__num__simpleimputer__strategy': ['mean', 'median'],
    'decisiontreeregressor__max_depth': range(2, 140),
    'decisiontreeregressor__min_samples_leaf': range(3, 10, 2),
    'decisiontreeregressor__min_samples_split': range(3, 40, 5),
    'decisiontreeregressor__max_leaf_nodes': range(1, 100),
    'decisiontreeregressor__max_features': range(1,79)
}

# define cross validation
#search_dt = GridSearchCV(full_pipeline_dt,param_grid_dt,cv=5,verbose=0)
search_dtR = RandomizedSearchCV(full_pipeline_dtR, param_grid_dtR,
                                n_iter=2, cv=7, scoring='neg_mean_absolute_percentage_error', random_state=123, verbose=0)


# fit
search_dtR.fit(X_train, y_train)
y_train_pred_dtR = search_dtR.predict(X_train)
y_test_pred_dtR = search_dtR.predict(X_test)

train_rmse = mean_squared_error(y_train, y_train_pred_dtR, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred_dtR, squared=False)

print(f"decisionTreeRegressor Train RMSE: {train_rmse:.2f}")
print(f"decisionTreeRegressor Test RMSE: {test_rmse:.2f}")

#print(f"The best parameters are {search_dtR.best_params_}")

# Evaluate the model using R^2 score
train_r2 = r2_score(y_train, y_train_pred_dtR)
test_r2 = r2_score(y_test, y_test_pred_dtR)

print(f"Train R^2 Score: {train_r2:.2f}")
print(f"Model Test R^2 Score: {test_r2:.2f}")

decisionTreeRegressor Train RMSE: 34235.21
decisionTreeRegressor Test RMSE: 39244.27
Train R^2 Score: 0.82
Model Test R^2 Score: 0.75


## Random forest

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# initialize the pipe
full_pipeline_RF = make_pipeline(
    preprocessor, StandardScaler(), RandomForestRegressor(random_state = 123))
full_pipeline_RF

In [22]:


# train a DecisionTree with GridSearch cross validation
# parameter grid
param_grid_RF = {
"columntransformer__num__simpleimputer__strategy":["mean", "median"],
    "standardscaler__with_mean":[True, False],
    "standardscaler__with_std":[True, False],
    "randomforestregressor__n_estimators": [100, 200, 300],
    "randomforestregressor__max_depth": [None, 5, 11],
    "randomforestregressor__min_samples_split": [2, 5, 10]
}

# define cross validation
#
search_RF = RandomizedSearchCV(full_pipeline_RF, param_grid_RF,
                                n_iter=2, cv=7, scoring='neg_mean_absolute_percentage_error', random_state=123, verbose=0)


# fit
search_RF.fit(X_train, y_train)
y_train_pred_RF = search_RF.predict(X_train)
y_test_pred_RF = search_RF.predict(X_test)

train_rmse = mean_squared_error(y_train, y_train_pred_RF, squared=False)
test_rmse = mean_squared_error(y_test, y_test_pred_RF, squared=False)

print(f"decisionTreeRegressor Train RMSE: {train_rmse:.2f}")
print(f"decisionTreeRegressor Test RMSE: {test_rmse:.2f}")

#print(f"The best parameters are {search_dtR.best_params_}")

# Evaluate the model using R^2 score
train_r2 = r2_score(y_train, y_train_pred_RF)
test_r2 = r2_score(y_test, y_test_pred_RF)

print(f"Train R^2 Score: {train_r2:.2f}")
print(f"Model Test R^2 Score: {test_r2:.2f}")

decisionTreeRegressor Train RMSE: 11900.07
decisionTreeRegressor Test RMSE: 26553.36
Train R^2 Score: 0.98
Model Test R^2 Score: 0.89
