# Hyperparameter Tuning with Pipelines for House Prices

## Import the basic libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time



## Import the sklearn libraries that will be used

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.pipeline import Pipeline

## Import the models

In [3]:
from sklearn.ensemble import RandomForestRegressor


## Import the metrics

In [4]:
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_error

# Load the dataset

In [5]:
housesales_train = pd.read_csv("./Datasets/House_sales_train.csv")
# housesales_test = pd.read_csv("./Datasets/House_sales_test.csv")

In [6]:
# housesales_test.shape

In [7]:
#Combine the datasets
# Concatenate train and test datasets vertically
# HouseSales_combined = pd.concat([housesales_train, housesales_test], ignore_index=True)
HouseSales_combined = housesales_train.copy()

In [8]:
# HouseSales_combined.info()

In [9]:
# HouseSales_combined.tail()

In [10]:
#drop the ID
HouseSales_combined = HouseSales_combined.drop(columns=['Id'])

In [11]:
# Get the number of unique values for each feature
unique_vals =  HouseSales_combined.nunique().sort_values(ascending=False)

# unique_vals = pd.DataFrame(unique_vals)
# pd.set_option('display.max_rows', None)  # Display all rows
# print(unique_vals)

In [12]:
# Get the data types of each column
data_types = HouseSales_combined.dtypes

# Select only the numerical columns
numerical_features = HouseSales_combined.select_dtypes(include=['int64', 'float64'])
# numerical_features.info()

In [13]:
numerical_features.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [14]:
# Replace all NaN values with a specified value (e.g., 0)
updatednumerical_features = numerical_features.fillna(0)

In [15]:
# updatednumerical_features['SalePrice'].info()

In [16]:
non_numeric_features = housesales_train.select_dtypes(include=['object'])
non_numeric_features = non_numeric_features.fillna('nil')
# non_numeric_features.info()

## Data Engineering

## Feature Encoding : Categorical, ordinal


In [17]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [18]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output= False).set_output(transform='pandas')
ohetransform = ohe.fit_transform(non_numeric_features)

In [19]:
# Combine the numeric and non numeric datasets
HouseSalesEncoded = pd.concat([updatednumerical_features, ohetransform], axis=1)
HouseSalesEncoded.shape

(1460, 305)

## Define the inputs and outputs

In [20]:
X,y = HouseSalesEncoded.loc[:, ~HouseSalesEncoded.columns.isin(['SalePrice'])],HouseSalesEncoded['SalePrice']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=42)

# Hyperparameter tuning  with RandomSearchCV and GridSearchCV

In [22]:
# Define a pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor())
])

In [23]:
# Define a grid of hyperparameters to search
grid_param = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [5, 10, 20],
    'rf__min_samples_split': [2, 5, 10]
}

In [24]:
cv=3

In [25]:
def Grid_search_CV_model(pipeline, grid_param, cv, X_train, y_train):
    grid_search = RandomizedSearchCV(pipeline, 
                                   param_distributions=param_grid, 
                                   cv=cv, 
                                   scoring='neg_mean_squared_error', # This will get the score
                                   n_jobs=-1,
                                  verbose=10 
                                  )
    
    # Fit the GridSearchCV
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    search_results = grid_search.cv_results_
    return best_model,search_results

In [26]:
price_grid_estimator, price_grid_results = Grid_search_CV_model(pipeline, grid_param, cv, X_train, y_train)

NameError: name 'param_grid' is not defined

In [None]:
# Print the best parameters
print("Best Parameters:", price_grid_estimator.steps[1])

In [None]:

# Make predictions
y_pred =grid_search.predict(X_test)