In [464]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Read the data in

In [465]:
data=pd.read_csv('train.csv')

In [466]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Missing Data

In [467]:
# Assuming you have a pandas DataFrame named 'data'
columns_with_missing_values = data.columns[data.isnull().any()].tolist()
missing_values_count = data[columns_with_missing_values].isnull().sum()

print("Columns with missing values and their counts:")
for column in columns_with_missing_values:
    print(f"{column}: {missing_values_count[column]}")


Columns with missing values and their counts:
LotFrontage: 259
Alley: 1369
MasVnrType: 872
MasVnrArea: 8
BsmtQual: 37
BsmtCond: 37
BsmtExposure: 38
BsmtFinType1: 37
BsmtFinType2: 38
Electrical: 1
FireplaceQu: 690
GarageType: 81
GarageYrBlt: 81
GarageFinish: 81
GarageQual: 81
GarageCond: 81
PoolQC: 1453
Fence: 1179
MiscFeature: 1406


# Preprocessing intial dataset

In [468]:
def preprocess_inputs(df):
    df=df.copy()
    #dropping columns with a lot of missing data and Id column as this doesn't provide useful information
    df=df.drop(['Alley','MasVnrType','Fence','MiscFeature','PoolQC','FireplaceQu','Id'],axis=1)
    
    # Splitting into target and independent variables
    X=df.drop(['SalePrice'],axis=1)
    y=df['SalePrice']
    
    #train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [469]:
X_train, X_test, y_train, y_test=preprocess_inputs(data)

# Identifying data types

In [470]:
#classify features by type

#identifying binary features
binary_features=[column for column in X_train.select_dtypes('object').columns if len(X_train[column].unique()) == 2]

#identifying nominal features
nominal_features=[column for column in X_train.select_dtypes('object').columns if len(X_train[column].unique()) > 2]

#identifying numerical features
numerical_features=X_train.select_dtypes(include='number').columns.tolist()

In [471]:
# Find missing values for binary features
missing_binary = X_train[binary_features].isnull().sum() 

# Find missing values for nominal features
missing_nominal = X_train[nominal_features].isnull().sum()

# Find missing values for numerical features
missing_numerical = X_train[numerical_features].isnull().sum()


# Constructing pipelines

In [472]:
#Construct transformers to handle each type of features
binary_transformer=Pipeline(steps=[
    ('ordinal',OrdinalEncoder(categories='auto')) #  no need to impute for binary variables as no missing values
])

nominal_transformer=Pipeline(steps=[
    ('nominal_impute',SimpleImputer(strategy='most_frequent')),
    ('nominal_encode',OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer=Pipeline(steps=[
    ('numerical',SimpleImputer(strategy='median'))
])

In [473]:
preprocessor=ColumnTransformer(transformers=[
    ('binary',binary_transformer, binary_features),
    ('nominal',nominal_transformer, nominal_features),
    ('numerical',numerical_transformer,numerical_features)
],sparse_threshold=0)

# Random Forest Pipeline

In [474]:
# Build the final pipeline
rf_model=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('regressor',RandomForestRegressor(random_state=42))
])

In [475]:
rf_model.fit(X_train,y_train)

In [476]:
acc=rf_model.score(X_test,y_test)
print('Test Accuracy: {:.2f}%'.format(acc*100))

Test Accuracy: 89.64%


# XGBoost Pipeline

In [477]:
xgb_model=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('scaler',StandardScaler()),
    ('regressor',xgb.XGBRegressor(learning_rate= 0.01,max_depth= 4,n_estimators=500,eval_metric='rmsle'))
])

In [478]:
xgb_model.fit(X_train,y_train)

In [479]:
acc=xgb_model.score(X_test,y_test)
print('Test Accuracy: {:.2f}%'.format(acc*100))

Test Accuracy: 91.26%


## Hyperparameter tuning

In [480]:
param_grid = {"regressor__max_depth":    [4, 5, 6],
              "regressor__n_estimators": [500, 600, 700],
              "regressor__learning_rate": [0.01, 0.015]}


grid_search = GridSearchCV(xgb_model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

5968.09s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
5968.09s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
5968.11s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
5968.11s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
5968.12s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
5968.13s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
5968.12s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
5968.13s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make th

Best Parameters: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 4, 'regressor__n_estimators': 500}
Best Score: nan
