# **Preprocessing & Pipeline**

#### **Read Data**

In [1]:
import pandas as pd

data = pd.read_csv(r'C:\Users\gasse\OneDrive\Desktop\Project 1\cleaned_data.csv')
data

Unnamed: 0,suburb,rooms,type,price,method,sellerg,distance,bedroom2,bathroom,car,landsize,yearbuilt,councilarea,regionname,year,month,day,season,street_name
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra,Northern Metropolitan,2016,3,12,Spring,Turner
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,1900.0,Yarra,Northern Metropolitan,2016,4,2,Spring,Bloomburg
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,1900.0,Yarra,Northern Metropolitan,2017,4,3,Spring,Charles
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,,Yarra,Northern Metropolitan,2017,4,3,Spring,Federation
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,2014.0,Yarra,Northern Metropolitan,2016,4,6,Spring,Park
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,4,h,1245000.0,S,Barry,16.7,4.0,2.0,2.0,652.0,1981.0,,South-Eastern Metropolitan,2017,8,26,Summer,Strada
13576,Williamstown,3,h,1031000.0,SP,Williams,6.8,3.0,2.0,2.0,333.0,1995.0,,Western Metropolitan,2017,8,26,Summer,Merrett
13577,Williamstown,3,h,1170000.0,S,Raine,6.8,3.0,2.0,4.0,436.0,1997.0,,Western Metropolitan,2017,8,26,Summer,Power
13578,Williamstown,4,h,2500000.0,PI,Sweeney,6.8,4.0,1.0,5.0,866.0,1920.0,,Western Metropolitan,2017,8,26,Summer,Verdon


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   suburb       13580 non-null  object 
 1   rooms        13580 non-null  int64  
 2   type         13580 non-null  object 
 3   price        13580 non-null  float64
 4   method       13580 non-null  object 
 5   sellerg      13580 non-null  object 
 6   distance     13580 non-null  float64
 7   bedroom2     13580 non-null  float64
 8   bathroom     13580 non-null  float64
 9   car          13518 non-null  float64
 10  landsize     13580 non-null  float64
 11  yearbuilt    8205 non-null   float64
 12  councilarea  12211 non-null  object 
 13  regionname   13580 non-null  object 
 14  year         13580 non-null  int64  
 15  month        13580 non-null  int64  
 16  day          13580 non-null  int64  
 17  season       13580 non-null  object 
 18  street_name  13580 non-null  object 
dtypes: f

## **Numerical Pipeline** 

- **Select Numerical Columns**

In [3]:
num_col = data.select_dtypes(include= 'number').columns
num_col

Index(['rooms', 'price', 'distance', 'bedroom2', 'bathroom', 'car', 'landsize',
       'yearbuilt', 'year', 'month', 'day'],
      dtype='object')

In [4]:
data.select_dtypes(include= 'number').head()

Unnamed: 0,rooms,price,distance,bedroom2,bathroom,car,landsize,yearbuilt,year,month,day
0,2,1480000.0,2.5,2.0,1.0,1.0,202.0,,2016,3,12
1,2,1035000.0,2.5,2.0,1.0,0.0,156.0,1900.0,2016,4,2
2,3,1465000.0,2.5,3.0,2.0,0.0,134.0,1900.0,2017,4,3
3,3,850000.0,2.5,3.0,2.0,1.0,94.0,,2017,4,3
4,4,1600000.0,2.5,3.0,1.0,2.0,120.0,2014.0,2016,4,6


In [5]:
data.select_dtypes(include= 'number').isna().sum()

rooms           0
price           0
distance        0
bedroom2        0
bathroom        0
car            62
landsize        0
yearbuilt    5375
year            0
month           0
day             0
dtype: int64

### **Select Pipelines**

- **Pipeline 1 -> ( car , yearbuilt ) -> impute missing by  Most frequent ->  Scaling By Standard Scaling**

In [6]:
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

imputer = SimpleImputer(strategy='most_frequent')
scaler = StandardScaler()

num_pipeline1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

num_pipeline1

0,1,2
,steps,"[('imputer', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


- **Pipeline 2 --> (  rooms , distance ,  bedroom2 ,  bathroom  ,  landsize ,  year ,  month ,  day ) --> Scaling By Standard Scaling**

In [7]:
scaler2 = StandardScaler()

num_pipeline2 = Pipeline(steps=[
    ('scaler', StandardScaler())
])

num_pipeline2

0,1,2
,steps,"[('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [8]:
data = data.drop(columns=['street_name'])

In [9]:
data.head()

Unnamed: 0,suburb,rooms,type,price,method,sellerg,distance,bedroom2,bathroom,car,landsize,yearbuilt,councilarea,regionname,year,month,day,season
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra,Northern Metropolitan,2016,3,12,Spring
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,1900.0,Yarra,Northern Metropolitan,2016,4,2,Spring
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,1900.0,Yarra,Northern Metropolitan,2017,4,3,Spring
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,,Yarra,Northern Metropolitan,2017,4,3,Spring
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,2014.0,Yarra,Northern Metropolitan,2016,4,6,Spring


## **Categorical Pipeline**

- **Select Categorical Columns**

In [10]:
cat_col =data.select_dtypes(include= 'object').columns
cat_col

Index(['suburb', 'type', 'method', 'sellerg', 'councilarea', 'regionname',
       'season'],
      dtype='object')

In [11]:
data.select_dtypes(include= 'object').head()

Unnamed: 0,suburb,type,method,sellerg,councilarea,regionname,season
0,Abbotsford,h,S,Biggin,Yarra,Northern Metropolitan,Spring
1,Abbotsford,h,S,Biggin,Yarra,Northern Metropolitan,Spring
2,Abbotsford,h,SP,Biggin,Yarra,Northern Metropolitan,Spring
3,Abbotsford,h,PI,Biggin,Yarra,Northern Metropolitan,Spring
4,Abbotsford,h,VB,Nelson,Yarra,Northern Metropolitan,Spring


In [12]:
for col in cat_col:
    print(f'Col : {col}')
    print(data[col].nunique())
    print('-' * 20)

Col : suburb
314
--------------------
Col : type
3
--------------------
Col : method
5
--------------------
Col : sellerg
268
--------------------
Col : councilarea
33
--------------------
Col : regionname
8
--------------------
Col : season
4
--------------------


In [13]:
data.select_dtypes(include= 'object').isna().sum()

suburb            0
type              0
method            0
sellerg           0
councilarea    1369
regionname        0
season            0
dtype: int64

### **Select Pipelines**

- **Pipeline 1 ( Councilarea ) --> Impute Using Simple Imputer --> Encoding Using Target Encoding**

In [14]:
from category_encoders import TargetEncoder

imputer_cat = SimpleImputer(strategy='most_frequent')
encoder1 = TargetEncoder()

cat_pipeline1 = Pipeline(steps=[
    ('imputer', imputer_cat),
    ('encoder', encoder1)
])

cat_pipeline1

0,1,2
,steps,"[('imputer', ...), ('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,


- **Pipeline 2 ( suburb - sellerg ) --> Encoding Using Target Encoder**

In [15]:
encoder2 = TargetEncoder()

cat_pipeline2 = Pipeline(steps=[
    ('encoder', encoder2)
])

cat_pipeline2

0,1,2
,steps,"[('encoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,


- **Pipeline 3 ( type - method - regionname - season ) --> Encoding Using One Hot Encoder**

In [16]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop= 'first', sparse_output=False)

cat_pipeline3 = Pipeline(steps=[
    ('OneHotEncoder', ohe)
])

cat_pipeline3

0,1,2
,steps,"[('OneHotEncoder', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


## **Column Transformer**

In [17]:
from sklearn.compose import ColumnTransformer

preprocessing = ColumnTransformer(transformers=[
    ('num_pipeline1', num_pipeline1, ['car', 'yearbuilt']),
    ('num_pipeline2', num_pipeline2, ['rooms' , 'distance' , 'bedroom2' , 'bathroom'  , 'landsize' , 'year' , 'month' , 'day']),
    ('cat_pipeline1', cat_pipeline1, ['councilarea']),  
    ('cat_pipeline2', cat_pipeline2, ['suburb', 'sellerg']),  
    ('cat_pipeline3', cat_pipeline3, ['type', 'method', 'regionname', 'season'])

],
    remainder='passthrough'
)

preprocessing

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


#### **Spilt Data To ( x , y )**

In [18]:
X = data.drop('price', axis=1)
y = data['price']

### **Select Best Model**

In [19]:
# Import Models
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor
import warnings
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('ignore')

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

models = [
    ('Linear Regression', LinearRegression(n_jobs=-1)),
    ('Knn', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
    ('Xgboost', XGBRegressor()),
    ('CatBoost', CatBoostRegressor(verbose=0)),
    ('LightGBM', LGBMRegressor(force_col_wise=True))
]

for name, reg in models:

    model_pipeline = Pipeline(
        steps=[('Preprocessing', preprocessing), ('Model', reg)])

    # 2) train on train set
    model_pipeline.fit(X_train, y_train)

    # 3) predict on train and test
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # 4) compute R2
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(name)
    print('Train R2 Score :', round(train_r2 * 100, 2))
    print('Test  R2 Score :', round(test_r2 * 100, 2))
    print('-' * 50)




Linear Regression
Train R2 Score : 65.99
Test  R2 Score : 67.14
--------------------------------------------------
Knn
Train R2 Score : 66.0
Test  R2 Score : 52.5
--------------------------------------------------
Decision Tree
Train R2 Score : 100.0
Test  R2 Score : 63.76
--------------------------------------------------
Random Forest
Train R2 Score : 96.92
Test  R2 Score : 80.85
--------------------------------------------------
Xgboost
Train R2 Score : 95.43
Test  R2 Score : 78.84
--------------------------------------------------
CatBoost
Train R2 Score : 92.02
Test  R2 Score : 83.08
--------------------------------------------------
[LightGBM] [Info] Total Bins 1083
[LightGBM] [Info] Number of data points in the train set: 10864, number of used features: 29
[LightGBM] [Info] Start training from score 1074964.928203
LightGBM
Train R2 Score : 89.15
Test  R2 Score : 83.02
--------------------------------------------------


In [21]:
# 1) split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

models = [
    ('Linear Regression', LinearRegression(n_jobs=-1)),
    ('Knn', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42, n_jobs=-1)),
    ('Xgboost', XGBRegressor()),
    ('CatBoost', CatBoostRegressor(verbose=0)),
    ('LightGBM', LGBMRegressor(force_col_wise=True))
]

for name, reg in models:

    model_pipeline = Pipeline(
        steps=[('Preprocessing', preprocessing), ('Model', reg)])
    model_pipeline_scaled_target = TransformedTargetRegressor(
        regressor=model_pipeline, func=np.log1p, inverse_func=np.expm1)

    # 2) train on train set
    model_pipeline.fit(X_train, y_train)

    # 3) predict on train and test
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)

    # 4) compute R2
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(name)
    print('Train R2 Score :', round(train_r2 * 100, 2))
    print('Test  R2 Score :', round(test_r2 * 100, 2))
    print('-' * 50)

Linear Regression
Train R2 Score : 65.99
Test  R2 Score : 67.14
--------------------------------------------------
Knn
Train R2 Score : 66.0
Test  R2 Score : 52.5
--------------------------------------------------
Decision Tree
Train R2 Score : 100.0
Test  R2 Score : 63.76
--------------------------------------------------
Random Forest
Train R2 Score : 96.92
Test  R2 Score : 80.85
--------------------------------------------------
Xgboost
Train R2 Score : 95.43
Test  R2 Score : 78.84
--------------------------------------------------
CatBoost
Train R2 Score : 92.02
Test  R2 Score : 83.08
--------------------------------------------------
[LightGBM] [Info] Total Bins 1083
[LightGBM] [Info] Number of data points in the train set: 10864, number of used features: 29
[LightGBM] [Info] Start training from score 1074964.928203
LightGBM
Train R2 Score : 89.15
Test  R2 Score : 83.02
--------------------------------------------------


In [22]:
# 1) Pipeline: preprocessing + CatBoost
catboost_pipeline = Pipeline(
    steps=[('Preprocessing', preprocessing), ('Model', CatBoostRegressor(verbose=0))])

# 2) Wrap with TransformedTargetRegressor (log1p on y)
catboost_scaled_target = TransformedTargetRegressor(
    regressor=catboost_pipeline, func=np.log1p, inverse_func=np.expm1)

# 3) Train on training set
catboost_scaled_target.fit(X_train, y_train)

# 4) Predict on train and test
y_train_pred = catboost_scaled_target.predict(X_train)
y_test_pred = catboost_scaled_target.predict(X_test)

# 5) R2 scores
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("CatBoost with Target Transformation")
print("Train R2 Score :", round(train_r2 * 100, 2))
print("Test  R2 Score :", round(test_r2 * 100, 2))
print("-" * 50)

CatBoost with Target Transformation
Train R2 Score : 89.21
Test  R2 Score : 83.91
--------------------------------------------------


## **HyperParameter Tuning**

In [24]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'regressor__Model__learning_rate': [0.01, 0.05, 0.1],
    'regressor__Model__depth': [4, 6, 8],
    'regressor__Model__iterations': [100, 200]
}

result = RandomizedSearchCV(catboost_scaled_target, param_grid,
                            cv=5, scoring='r2', return_train_score=True, n_jobs=-1)

result.fit(X, y)

0,1,2
,estimator,TransformedTa...12EE2E530>)]))
,param_distributions,"{'regressor__Model__depth': [4, 6, ...], 'regressor__Model__iterations': [100, 200], 'regressor__Model__learning_rate': [0.01, 0.05, ...]}"
,n_iter,10
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [25]:
print("Best R2:", result.best_score_)
print("Best Params:", result.best_params_)

Best R2: 0.7809495716337154
Best Params: {'regressor__Model__learning_rate': 0.1, 'regressor__Model__iterations': 200, 'regressor__Model__depth': 6}


## **Final Model**

In [26]:
catboost_pipeline = Pipeline(steps=[('Preprocessing', preprocessing), (
    'Model', CatBoostRegressor(learning_rate=0.1, depth=6, l2_leaf_reg=1))])
catboost_model_final = TransformedTargetRegressor(
    regressor=catboost_pipeline, func=np.log1p, inverse_func=np.expm1)
catboost_model_final.fit(X, y)

0:	learn: 0.4936850	total: 79.5ms	remaining: 1m 19s
1:	learn: 0.4627554	total: 97.7ms	remaining: 48.7s
2:	learn: 0.4357561	total: 106ms	remaining: 35.2s
3:	learn: 0.4121867	total: 119ms	remaining: 29.5s
4:	learn: 0.3916321	total: 128ms	remaining: 25.4s
5:	learn: 0.3724179	total: 140ms	remaining: 23.2s
6:	learn: 0.3556287	total: 157ms	remaining: 22.3s
7:	learn: 0.3413477	total: 204ms	remaining: 25.4s
8:	learn: 0.3280115	total: 219ms	remaining: 24.1s
9:	learn: 0.3166429	total: 257ms	remaining: 25.4s
10:	learn: 0.3060858	total: 276ms	remaining: 24.8s
11:	learn: 0.2979133	total: 304ms	remaining: 25s
12:	learn: 0.2888493	total: 338ms	remaining: 25.6s
13:	learn: 0.2816126	total: 350ms	remaining: 24.7s
14:	learn: 0.2752120	total: 385ms	remaining: 25.3s
15:	learn: 0.2694614	total: 420ms	remaining: 25.8s
16:	learn: 0.2639383	total: 438ms	remaining: 25.3s
17:	learn: 0.2595261	total: 459ms	remaining: 25s
18:	learn: 0.2553975	total: 468ms	remaining: 24.2s
19:	learn: 0.2508890	total: 475ms	remainin

0,1,2
,regressor,Pipeline(step...315643FF70>)])
,transformer,
,func,<ufunc 'log1p'>
,inverse_func,<ufunc 'expm1'>
,check_inverse,True

0,1,2
,transformers,"[('num_pipeline1', ...), ('num_pipeline2', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


## **Save Model**

In [27]:
import joblib
joblib.dump(catboost_model_final, 'model.pkl', compress=3)

['model.pkl']

In [28]:
model = joblib.load('model.pkl')

In [29]:
model.predict(X.head(1))

array([1219908.36880475])

In [30]:
print(model.regressor_)

Pipeline(steps=[('Preprocessing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num_pipeline1',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['car', 'yearbuilt']),
                                                 ('num_pipeline2',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['rooms', 'distance',
                                                   'bedroom2', 'bathroom',
                                   