In [32]:
import numpy as np
import pandas as pd

In [33]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

In [34]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [35]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [36]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [37]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished', 1.0:'semifurnished', 2.0:'furnished'})
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [38]:
X = df.drop(columns=['price'])
y = df['price']

In [39]:
y_transformed = np.log1p(y)

### Ordinal Encoding

In [40]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [41]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder = 'passthrough'
)

In [42]:
# Creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [43]:
# K-fold cross-validation
kfold = KFold(n_splits = 10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [44]:
scores.mean(), scores.std()

(np.float64(0.7363096633436828), np.float64(0.03238005754429936))

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [46]:
pipeline.fit(X_train, y_train)

In [47]:
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)

In [48]:
mean_absolute_error(np.expm1(y_test), y_pred)

np.float64(0.9463822160089356)

In [56]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [57]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [58]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees':ExtraTreesRegressor(),
    'gradient boosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [59]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [60]:
model_output

[['linear_reg',
  np.float64(0.7363096633436828),
  np.float64(0.9463822160089356)],
 ['svr', np.float64(0.7642012011196353), np.float64(0.8472636473483922)],
 ['ridge', np.float64(0.7363125343993554), np.float64(0.9463387741853386)],
 ['LASSO', np.float64(0.05943378064493573), np.float64(1.528905986892753)],
 ['decision tree',
  np.float64(0.7753019410466737),
  np.float64(0.72614508674312)],
 ['random forest',
  np.float64(0.8808036251447824),
  np.float64(0.5250543224430154)],
 ['extra trees',
  np.float64(0.8698052287633647),
  np.float64(0.5496017408243923)],
 ['gradient boosting',
  np.float64(0.8726447846133778),
  np.float64(0.5767685293829701)],
 ['adaboost', np.float64(0.7559435852407321), np.float64(0.8509715305410004)],
 ['mlp', np.float64(0.8068668985881079), np.float64(0.7240104920373505)],
 ['xgboost', np.float64(0.8894876835260124), np.float64(0.5040475141482346)]]

In [61]:
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae']).sort_values(by='mae')

In [62]:
model_df

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.880804,0.525054
6,extra trees,0.869805,0.549602
7,gradient boosting,0.872645,0.576769
9,mlp,0.806867,0.72401
4,decision tree,0.775302,0.726145
1,svr,0.764201,0.847264
8,adaboost,0.755944,0.850972
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [63]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first'), ['sector', 'agePossession', 'furnishing_type'])
    ],
    remainder = 'passthrough'
)

In [64]:
# Creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [65]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [66]:
scores.mean()

np.float64(0.8546094810971422)

In [67]:
scores.std()

np.float64(0.015997422908695623)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [69]:
pipeline.fit(X_train, y_train)

In [70]:
y_pred = pipeline.predict(X_test)

In [71]:
y_pred = np.expm1(y_pred)

In [72]:
mean_absolute_error(np.expm1(y_test), y_pred)

np.float64(0.6497514315131458)

In [73]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [74]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees':ExtraTreesRegressor(),
    'gradient boosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [75]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [76]:
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae'])

In [77]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894631,0.481519
10,xgboost,0.89585,0.493456
5,random forest,0.891518,0.503193
9,mlp,0.872503,0.553393
7,gradient boosting,0.876327,0.569807
0,linear_reg,0.854609,0.649751
2,ridge,0.854739,0.652915
4,decision tree,0.805794,0.697753
8,adaboost,0.754213,0.821951
1,svr,0.769741,0.834124


### OneHotEncoding with PCA

In [78]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['sector', 'agePossession'])
    ],
    remainder = 'passthrough'
)

In [79]:
# Creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [80]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [81]:
scores.mean()

np.float64(0.06225201431451135)

In [82]:
scores.std()

np.float64(0.01986059407164016)

### Target Encoding
- used for those column which have high cardinality
- each sector -> value mean
- avoid Data Leakage <> do split beforhand

In [83]:
import category_encoders as ce

columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [84]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [85]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')


In [86]:
scores.mean(), scores.std()

(np.float64(0.8295219182255362), np.float64(0.018384463379122782))

In [87]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [88]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees':ExtraTreesRegressor(),
    'gradient boosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [89]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [90]:
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae'])

In [91]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.900619,0.458418
6,extra trees,0.902475,0.461471
7,gradient boosting,0.889074,0.508437
4,decision tree,0.825731,0.576802
9,mlp,0.853777,0.618373
8,adaboost,0.818296,0.689611
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


### Hyperparameter Tuning of Random Forest

In [54]:
from sklearn.model_selection import GridSearchCV

In [55]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10 , 20, 30],
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [56]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),  # Numerical features
        ('cat', OrdinalEncoder(), ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),  # Ordinal encoding for selected categorical features
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),  # One-hot encoding for 'agePossession'
        ('target_enc', ce.TargetEncoder(), ['sector'])  # Target encoding for 'sector'
    ],
    remainder='passthrough'  # Keep any other columns unchanged
)


In [57]:
# Creating a pipeline
pipeline = Pipeline( steps= [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [58]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)


In [59]:
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [60]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
474 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\DIVYANSH

In [61]:
search.best_estimator_

In [62]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [63]:
search.best_score_

np.float64(0.8903720935787149)

### XGBoost Hyperparameter Tuning 

In [158]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Define the hyperparameter distributions
param_grid = {
    'regressor__max_depth': stats.randint(3, 10),
    'regressor__learning_rate': stats.uniform(0.01, 0.5),
    'regressor__subsample': stats.uniform(0.5, 0.5),
    'regressor__n_estimators':stats.randint(50, 200)
}


In [159]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),  # Numerical features
        ('cat', OrdinalEncoder(), ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),  # Ordinal encoding for selected categorical features
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),  # One-hot encoding for 'agePossession'
        ('target_enc', ce.TargetEncoder(), ['sector'])  # Target encoding for 'sector'
    ],
    remainder='passthrough'  # Keep any other columns unchanged
)


In [160]:
# Creating a pipeline
pipeline = Pipeline( steps= [
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [161]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [162]:
# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=10, cv=kfold, scoring='r2', n_jobs=-1)

In [163]:
random_search.fit(X, y_transformed)

55 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\DIVYANSHU\an

In [164]:
random_search.best_estimator_

In [165]:
random_search.best_params_

{'regressor__learning_rate': np.float64(0.30913351237459663),
 'regressor__max_depth': 3,
 'regressor__n_estimators': 173,
 'regressor__subsample': np.float64(0.9875113953559389)}

In [166]:
random_search.best_score_

np.float64(nan)

### Optimizing XGBoost with Hyperopt

In [152]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [153]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),  # Numerical features
        ('cat', OrdinalEncoder(), ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),  # Ordinal encoding for selected categorical features
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),  # One-hot encoding for 'agePossession'
        ('target_enc', ce.TargetEncoder(), ['sector'])  # Target encoding for 'sector'
    ],
    remainder='passthrough'  # Keep any other columns unchanged
)


In [154]:
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 5000, 50),
    'max_depth': hp.quniform('max_depth', 10, 100, 10),  
    'learning_rate': hp.loguniform('learning_rate', -4, 0),  
    'subsample': hp.uniform('subsample', 0.5, 1),  
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 1),  
    'gamma': hp.loguniform('gamma', 1/10, 1),  
    'min_child_weight': hp.quniform('min_child_weight', 1, 15, 1),  
    'reg_alpha': hp.loguniform('reg_alpha', 1/1000, 1),  
    'reg_lambda': hp.loguniform('reg_lambda', 1/1000, 1)  
}


In [155]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [156]:
def objective(space):
  regressor = XGBRegressor(
    n_estimators = int(space['n_estimators']),
    max_depth = int(space['max_depth']),
    learning_rate = space['learning_rate'],
    subsample = space['subsample'],
    colsample_bytree = space['colsample_bytree'],
    gamma = space['gamma'],
    min_child_weight=space['min_child_weight'],
    reg_alpha=space['reg_alpha'],
    reg_lambda=space['reg_lambda'],
    random_state=42
  )
  pipeline = Pipeline( steps= [
    ('preprocessor', preprocessor),
    ('regressor', regressor)
  ])
  kfold = KFold(n_splits=10, shuffle=True, random_state=42)
  
  scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
  
  r2_score = scores.mean()

  return {'loss': -r2_score, 'status': STATUS_OK}

In [157]:
trials = Trials()
best = fmin(
  fn = objective,
  space = space,
  algo = tpe.suggest,
  max_evals = 100,
  trials = trials
)
print("Best Hyperparameter:", best)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [15:49<00:00,  9.49s/trial, best loss: -0.8649200072500245]
Best Hyperparameter: {'colsample_bytree': np.float64(0.9679391261499604), 'gamma': np.float64(1.1059993083645767), 'learning_rate': np.float64(0.02379808544310402), 'max_depth': np.float64(80.0), 'min_child_weight': np.float64(5.0), 'n_estimators': np.float64(2950.0), 'reg_alpha': np.float64(1.159790498336153), 'reg_lambda': np.float64(2.0262032879914664), 'subsample': np.float64(0.8797426024653933)}


### XGBoost Hyperparameter tuning with Optuna 

In [106]:
import optuna
from xgboost import XGBRegressor

In [107]:
from optuna.samplers import TPESampler, RandomSampler

In [108]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),  # Numerical features
        ('cat', OrdinalEncoder(), ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),  # Ordinal encoding for selected categorical features
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),  # One-hot encoding for 'agePossession'
        ('target_enc', ce.TargetEncoder(), ['sector'])  # Target encoding for 'sector'
    ],
    remainder='passthrough'  # Keep any other columns unchanged
)

In [109]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [110]:
def objective(trial):
  # Suggest hyperparameters
  n_estimators = trial.suggest_int('n_estimators', 500, 4500, step=50)
  max_depth = trial.suggest_int('max_depth', 5, 30)
  learning_rate = trial.suggest_float('learning_rate', 0.004, 0.05)
  subsample = trial.suggest_float('subsample', 0.5, 1.0)
  colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 0.5)
  gamma = trial.suggest_float('gamma', 0.04, 1)
  min_child_weight = trial.suggest_int('min_child_weight', 5, 15)
  reg_alpha = trial.suggest_float('reg_alpha', 1e-6, 10, log=True)
  reg_lambda = trial.suggest_float('reg_lambda', 1e-6, 10, log=True)
  
  # Define the XGBRegressor with hyperparameters from Optuna
  regressor = XGBRegressor(
      n_estimators=n_estimators,
      max_depth=max_depth,
      learning_rate=learning_rate,
      subsample=subsample,
      colsample_bytree=colsample_bytree,
      gamma=gamma,
      min_child_weight=min_child_weight,
      reg_alpha=reg_alpha,
      reg_lambda=reg_lambda,
      random_state=42,
      n_jobs = -1
  )
   # Define the pipeline
  pipeline = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('regressor', regressor)
  ])
  
  # Use 10-fold cross-validation to evaluate the model
  kfold = KFold(n_splits=10, shuffle=True, random_state=42)

  scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
  
  # The loss to minimize is the mean of negative MAE across all folds
  r2score = scores.mean()
  
  return (r2score)

# Create a study and optimize the objective function
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

# Print the best hyperparameters
print("Best Hyperparameters:", study.best_params)
print("Best r2 score :", study.best_value)

[I 2024-10-12 12:48:08,830] A new study created in memory with name: no-name-3c2983cf-834d-4222-bee0-6bed84ba8de2


[I 2024-10-12 12:48:17,277] Trial 0 finished with value: 0.8770857606888335 and parameters: {'n_estimators': 2000, 'max_depth': 29, 'learning_rate': 0.037671721323324636, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.41560186404424365, 'gamma': 0.18975473952275454, 'min_child_weight': 5, 'reg_alpha': 1.156732719914599, 'reg_lambda': 0.016136341713591334}. Best is trial 0 with value: 0.8770857606888335.
[I 2024-10-12 12:48:28,196] Trial 1 finished with value: 0.8812210283729547 and parameters: {'n_estimators': 3350, 'max_depth': 5, 'learning_rate': 0.04861585319945173, 'subsample': 0.9162213204002109, 'colsample_bytree': 0.42123391106782765, 'gamma': 0.21455196851881658, 'min_child_weight': 7, 'reg_alpha': 0.000134801802908908, 'reg_lambda': 0.004712973756110786}. Best is trial 1 with value: 0.8812210283729547.
[I 2024-10-12 12:48:37,159] Trial 2 finished with value: 0.8736869250590822 and parameters: {'n_estimators': 2200, 'max_depth': 12, 'learning_rate': 0.032145233157229454

Best Hyperparameters: {'n_estimators': 4000, 'max_depth': 28, 'learning_rate': 0.024686455563300843, 'subsample': 0.8373188839164015, 'colsample_bytree': 0.4421641000085366, 'gamma': 0.046080840458789324, 'min_child_weight': 7, 'reg_alpha': 0.00168539150423581, 'reg_lambda': 0.007239211681257434}
Best r2 score : 0.8947515421483431


### Final Model Pipeline

In [185]:
import pickle

In [186]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),  # Numerical features
        ('cat', OrdinalEncoder(), ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),  # Ordinal encoding for selected categorical features
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),  # One-hot encoding for 'agePossession'
        ('target_enc', ce.TargetEncoder(), ['sector'])  # Target encoding for 'sector'
    ],
    remainder='passthrough'  # Keep any other columns unchanged
)

In [187]:
final_regressor = XGBRegressor(
  n_estimators=4000,
  max_depth=28,
  learning_rate=0.024686455563300843,
  subsample=0.8373188839164015,
  colsample_bytree=0.4421641000085366,
  gamma=0.046080840458789324,
  min_child_weight=7,
  reg_alpha=0.00168539150423581,
  reg_lambda=0.007239211681257434,
  random_state=42,
  n_jobs=-1
)

In [189]:
final_pipe = Pipeline(steps=[
  ('preprocessor', preprocessor),
  ('regressor', final_regressor)
])

In [190]:
final_pipe.fit(X, y_transformed)

In [191]:
with open('pipeline.pkl', 'wb') as file:
  pickle.dump(final_pipe, file)

In [192]:
with open('df.pkl', 'wb') as file:
  pickle.dump(X, file)

In [201]:
data = [['house', 'sector 102', 3, 3, '3+', 'New Property', 1750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]

columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony', 'agePossession', 'built_up_area', 'servant room', 'store room', 'furnishing_type', 'luxury_category', 'floor_category']

In [202]:
one_df = pd.DataFrame(data, columns=columns)
one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,3,3,3+,New Property,1750,0,0,unfurnished,Low,Low Floor


In [203]:
np.expm1(final_pipe.predict(one_df))

array([2.2802694], dtype=float32)