In [82]:
# This Python 3 environment comes with many helpful analytics libraries installed
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from category_encoders import CatBoostEncoder
import matplotlib.pyplot as plt

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import optuna


/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


# Import Dataset

In [86]:
# Load the competition training data; it carries the SalePrice target column
# that is split off into `y` further down.
train_dataset = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
# Preview the first rows as the cell output.
train_dataset.head()
#train_dataset.describe()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [87]:
# Load the competition test data that the final submission is predicted for.
test_dataset = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')
# Preview the first rows as the cell output.
test_dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# Feature Selection

In [88]:
# Target vector: the sale price to predict.
y = train_dataset['SalePrice']
# Predictor frame: everything except the target and the 'Id' column.
X = train_dataset.drop(['SalePrice', 'Id'], axis=1)
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [89]:
# Keep the ids aside for the submission file and drop them from the features.
test_Id = test_dataset['Id']
test_df = test_dataset.drop('Id', axis=1)
test_Id.head()

0    1461
1    1462
2    1463
3    1464
4    1465
Name: Id, dtype: int64

# Missing Values

In [90]:
# Column groups are derived from the training features and reused for the test frame.
numerical_columns = X.select_dtypes(include=[np.number]).columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Apply the same sentinel fill to both frames: -1 for missing numbers and
# 'No Attribute' for missing categories.
for frame in (X, test_df):
    frame[numerical_columns] = frame[numerical_columns].fillna(-1)
    frame[categorical_columns] = frame[categorical_columns].fillna('No Attribute')

# Splitting The Training Set

In [91]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Defining Column Transformers

In [93]:
# Numeric columns are standardised with StandardScaler.
numeric_transformers = Pipeline(steps=[
    ('scaling', StandardScaler()),
])

# Categorical columns are encoded with CatBoostEncoder (seeded for repeatability).
category_transformers = Pipeline(steps=[
    ('catboosting', CatBoostEncoder(cols=categorical_columns, random_state=0)),
])

# Route each column group to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformers, numerical_columns),
        ('cat', category_transformers, categorical_columns),
    ]
)


# Defining Hyperparameters

In [94]:
def objective(trial):
    """Optuna objective for an XGBoost + CatBoost voting ensemble.

    Samples one set of hyperparameters per regressor from ``trial``, places both
    regressors in a ``VotingRegressor`` behind the shared ``preprocessor``, and
    cross-validates the resulting pipeline on ``X_train`` / ``y_train`` with
    ``cv=5``.

    Parameters
    ----------
    trial : optuna trial object
        Source of the suggested hyperparameter values.

    Returns
    -------
    float
        Mean ``neg_mean_squared_error`` over the folds (closer to zero is better,
        so the study should maximise it).
    """
    # XGBoost search space (suggested under 'xgb_'-prefixed names).
    xgb_params = dict(
        learning_rate=trial.suggest_float("xgb_learning_rate", 0.0001, 0.1, log=True),
        max_depth=trial.suggest_int("xgb_max_depth", 3, 12),
        subsample=trial.suggest_float("xgb_subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0),
        n_estimators=trial.suggest_int("xgb_n_estimators", 50, 300),
    )

    # CatBoost search space (suggested under 'cat_'-prefixed names).
    cat_params = dict(
        learning_rate=trial.suggest_float("cat_learning_rate", 0.0001, 0.1, log=True),
        depth=trial.suggest_int("cat_depth", 3, 10),
        iterations=trial.suggest_int("cat_iterations", 100, 500),
        l2_leaf_reg=trial.suggest_float("cat_l2_leaf_reg", 0.0001, 0.1, log=True),
        subsample=trial.suggest_float("cat_subsample", 0.5, 1.0),
        random_strength=trial.suggest_float("cat_random_strength", 0.0001, 0.1),
    )

    ensemble = VotingRegressor([
        ('xgb', XGBRegressor(objective='reg:squarederror', **xgb_params)),
        ('cat', CatBoostRegressor(loss_function='RMSE', verbose=0, **cat_params)),
    ])
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('voting_regressor', ensemble),
    ])

    fold_scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
    )
    return fold_scores.mean()

# Running Optuna

In [95]:
# Seed the sampler so the search proposes the same hyperparameters on every
# Restart & Run All. TPESampler is Optuna's default sampler, so only the seed
# changes; trials run sequentially here, which the seed relies on to be repeatable.
sampler = optuna.samplers.TPESampler(seed=42)

# `objective` returns the mean negative MSE, hence direction='maximize'.
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=30)

best_params = study.best_params
print(best_params)

[I 2025-03-17 10:52:50,808] A new study created in memory with name: no-name-6deada1f-d8a9-41a1-a42c-1563e6d078b0
[I 2025-03-17 10:52:59,325] Trial 0 finished with value: -4284542432.5883217 and parameters: {'xgb_learning_rate': 0.004716124509503882, 'xgb_max_depth': 3, 'xgb_subsample': 0.8258129845540894, 'xgb_colsample_bytree': 0.6138608480747312, 'xgb_n_estimators': 116, 'cat_learning_rate': 0.00013849513135541128, 'cat_depth': 5, 'cat_iterations': 225, 'cat_l2_leaf_reg': 0.00046790009693396714, 'cat_subsample': 0.7373126075670826, 'cat_random_strength': 0.01950787019027997}. Best is trial 0 with value: -4284542432.5883217.
[I 2025-03-17 10:53:21,048] Trial 1 finished with value: -1515326687.9420571 and parameters: {'xgb_learning_rate': 0.01776518297694825, 'xgb_max_depth': 6, 'xgb_subsample': 0.5280063947789086, 'xgb_colsample_bytree': 0.8774627209128701, 'xgb_n_estimators': 258, 'cat_learning_rate': 0.0016553808547928861, 'cat_depth': 6, 'cat_iterations': 250, 'cat_l2_leaf_reg': 0

{'xgb_learning_rate': 0.039915037982779476, 'xgb_max_depth': 4, 'xgb_subsample': 0.7936051348786975, 'xgb_colsample_bytree': 0.5493023918345434, 'xgb_n_estimators': 235, 'cat_learning_rate': 0.09475616476724075, 'cat_depth': 7, 'cat_iterations': 290, 'cat_l2_leaf_reg': 0.0001549388893135788, 'cat_subsample': 0.6871894844921922, 'cat_random_strength': 0.04421569705230182}


# Training The Model

In [99]:
# Rebuild each estimator's keyword arguments from the best trial. The study
# stores values under the prefixed names used in `objective` ('xgb_*' and
# 'cat_*'), so stripping the prefix recovers the estimator parameter names
# without listing every key by hand.
tuned_params = study.best_params
best_xgb_params = {
    name[len('xgb_'):]: value
    for name, value in tuned_params.items()
    if name.startswith('xgb_')
}
best_cat_params = {
    name[len('cat_'):]: value
    for name, value in tuned_params.items()
    if name.startswith('cat_')
}

xgb_2 = XGBRegressor(**best_xgb_params, objective='reg:squarederror')
cat_2 = CatBoostRegressor(**best_cat_params, loss_function='RMSE', verbose=0)

pipeline_2 = Pipeline([
    ('preprocessor', preprocessor),
    ('voting_regressor', VotingRegressor([('xgb', xgb_2), ('cat', cat_2)])),
])

# Fit on the training split only and evaluate on the held-out split.
pipeline_2.fit(X_train, y_train)

# For a regressor, `score` is the coefficient of determination (R^2), not a
# classification accuracy, so the message names the metric accordingly.
test_score = pipeline_2.score(X_test, y_test)
print(f"The Model R^2 score on the hold-out set is {test_score}")


The Model Accuracy is 0.9133041058560977


# Making Predictions

In [105]:
# Predict sale prices for the competition test set.
y_pred = pipeline_2.predict(test_df)

# Pair each prediction with its original Id and write the submission file.
result = pd.DataFrame({'Id': test_Id, 'SalePrice': y_pred})
result.to_csv('submission.csv', index=False)