In [1]:
# importing the necessary library

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')


from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from xgboost import XGBRegressor

In [2]:
# import the dataset

df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [4]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [5]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [6]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [7]:
# split the data in dependent variable and independent variable

X = df.drop(columns=['price'])
y = df['price']

In [8]:
# Applying the log1p transformation to the target variable
# price column is right skewed thats why use log transformation


y_transformed = np.log1p(y)

### Ordinal Encoding

In [9]:
# Encode the attributes

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [10]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [11]:
# Creating a pipeline

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [12]:
# K-fold cross-validation

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [13]:
scores.mean(),scores.std()

(0.7363096633436828, 0.03238005754429926)

In [14]:
# split the data in train_set and test_set

X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [15]:
# Fit the model in training_set

pipeline.fit(X_train,y_train)

In [16]:
y_pred = pipeline.predict(X_test)

In [17]:
y_pred = np.expm1(y_pred)

In [18]:
# MAE

mean_absolute_error(np.expm1(y_test),y_pred)

0.946382216008936

In [19]:
# make a function which will get model_name in input and return the r2 score and mae score 


def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [20]:
# make a dictionary and pass all the model which you want

model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [21]:
# pass all the model list in scorer for getting output

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [22]:
model_output

[['linear_reg', 0.7363096633436828, 0.946382216008936],
 ['svr', 0.7642012011196353, 0.8472636473483934],
 ['ridge', 0.7363125343993555, 0.9463387741853383],
 ['LASSO', 0.05943378064493573, 1.528905986892753],
 ['decision tree', 0.7774712987963288, 0.7491054851996453],
 ['random forest', 0.8808357963975585, 0.5288762599528407],
 ['extra trees', 0.86650638226002, 0.5515279570871251],
 ['gradient boosting', 0.872595751726197, 0.5759394855161694],
 ['adaboost', 0.755687560419161, 0.8600126459933657],
 ['mlp', 0.8109215192123699, 0.7000789135261144],
 ['xgboost', 0.8894876835260124, 0.5040475141482346]]

In [23]:
# make a dataframe to get proper appearance

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [24]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.880836,0.528876
6,extra trees,0.866506,0.551528
7,gradient boosting,0.872596,0.575939
9,mlp,0.810922,0.700079
4,decision tree,0.777471,0.749105
1,svr,0.764201,0.847264
8,adaboost,0.755688,0.860013
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [25]:
# Creating a column transformer for preprocessing


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [26]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [27]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [28]:
scores.mean()

0.8546098138146467

In [29]:
scores.std()

0.016002496624190985

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [31]:
pipeline.fit(X_train,y_train)

In [32]:
y_pred = pipeline.predict(X_test)

In [33]:
y_pred = np.expm1(y_pred)

In [34]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497148092826402

In [35]:
# make a function which will get model_name in input and return the r2 score and mae score 


def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [36]:
# make a dictionary and pass all the model which you want


model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [37]:
# pass all the model list and scorer in loop for getting output


model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [38]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [39]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.895051,0.468815
5,random forest,0.890144,0.492786
10,xgboost,0.89585,0.493456
9,mlp,0.878069,0.542767
7,gradient boosting,0.876652,0.570325
0,linear_reg,0.85461,0.649715
2,ridge,0.854673,0.652981
4,decision tree,0.804216,0.677743
8,adaboost,0.755729,0.820168
1,svr,0.769741,0.834124


### OneHotEncoding With PCA

In [40]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [41]:
# Creating a pipeline

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [42]:
# K-fold cross-validation

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [43]:
scores.mean()

0.06225201431451133

In [44]:
scores.std()

0.019860594071640147

In [45]:
# make a function which will get model_name in input and return the r2 score and mae score 


def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [46]:
# make a dictionary and pass all the model which you want


model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [47]:
# pass all the model list and scorer in loop for getting output

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [48]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [49]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.762863,0.657141
6,extra trees,0.739248,0.701535
4,decision tree,0.696182,0.75729
10,xgboost,0.620664,0.948597
7,gradient boosting,0.610604,0.987906
8,adaboost,0.308408,1.358958
1,svr,0.218073,1.361163
9,mlp,0.210766,1.405561
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707


### Target Encoder

In [50]:
# !pip install category_encoders

In [51]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [52]:
# Creating a pipeline


pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [53]:
# K-fold cross-validation


kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [54]:
scores.mean(),scores.std()

(0.8295219182255359, 0.01838446337912286)

In [55]:
# make a function which will get model_name in input and return the r2 score and mae score 


def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [56]:
# make a dictionary and pass all the model which you want


model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [57]:
# pass all the model list and scorer in loop for getting output

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [58]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [59]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.900897,0.454395
6,extra trees,0.902071,0.462435
7,gradient boosting,0.889066,0.509988
4,decision tree,0.827212,0.541624
9,mlp,0.855811,0.61577
8,adaboost,0.816721,0.697371
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


### Hyperparameter Tuning

In [60]:
from sklearn.model_selection import GridSearchCV

In [61]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [62]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [63]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [64]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [65]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [66]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


In [67]:
final_pipe = search.best_estimator_

In [68]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [69]:
search.best_score_

0.9025395140071465

In [70]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [71]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [72]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [73]:
pipeline.fit(X,y_transformed)

### Trying out the predictions

In [74]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [75]:
X.iloc[0].values

array(['flat', 'sector 36', 3.0, 2.0, '2', 'New Property', 850.0, 0.0,
       0.0, 'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [76]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [77]:
np.expm1(pipeline.predict(one_df))

array([3.13423529])

In [78]:
X.dtypes

property_type       object
sector              object
bedRoom            float64
bathroom           float64
balcony             object
agePossession       object
built_up_area      float64
servant room       float64
store room         float64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object

In [79]:
sorted(X['sector'].unique().tolist())

['dwarka expressway',
 'gwal pahari',
 'manesar',
 'sector 1',
 'sector 10',
 'sector 102',
 'sector 103',
 'sector 104',
 'sector 105',
 'sector 106',
 'sector 107',
 'sector 108',
 'sector 109',
 'sector 11',
 'sector 110',
 'sector 111',
 'sector 112',
 'sector 113',
 'sector 12',
 'sector 13',
 'sector 14',
 'sector 15',
 'sector 17',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 30',
 'sector 31',
 'sector 33',
 'sector 36',
 'sector 37',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 53',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 58',
 'sector 59',
 'sector 6',
 'sector 60',
 'sector 61',
 'sector 62',
 'sector 63',
 'sector 63a',
 'sector 65',
 'sector 66',
 'sector 67',
 'se

# Try XGBoost with Hyperopt

#### 1. Import Libraries:

In [152]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from hyperopt import hp, fmin, tpe
from sklearn.pipeline import Pipeline
import joblib


#### 2. Load Your Data:

In [153]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

#### 3. replacement operation on the 'furnishing_type' column

In [154]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

#### 4. Split the data in to dependent column and independent column

In [155]:
X = df.drop(columns=['price'])
y = df['price']

#### 5. Log-transform the Dependent Variable
###### because dependent column is right skewed when we try to normalize the data we have to convert in normal distribution with the help of log transformation but when we use log transformation it covers the negative range as well thats why i am using log1p.

In [156]:
y_transformed = np.log1p(y)

#### 6. Define Column Transformers and Transformers:

In [157]:
numerical_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
ordinal_cols = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']
onehot_cols = ['agePossession']
target_encode_cols = ['sector']

numeric_transformer = StandardScaler()
ordinal_transformer = OrdinalEncoder()
onehot_transformer = OneHotEncoder()
target_encode_transformer = TargetEncoder()


#### 7. Create Preprocessor using ColumnTransformer:

In [158]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('onehot', onehot_transformer, onehot_cols),
        ('target', target_encode_transformer, target_encode_cols)
    ])


#### 8. Create a Pipeline with XGBoost Regressor:

In [159]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor())
])


#### 9. Define Hyperparameter Space for XGBoost:

In [160]:
space = {
    'model__n_estimators': hp.choice('n_estimators', range(100, 1000)),
    'model__max_depth': hp.choice('max_depth', range(1, 10)),
    'model__learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'model__subsample': hp.uniform('subsample', 0.5, 1.0),
    'model__colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'model__gamma': hp.uniform('gamma', 0, 1),
    'model__reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'model__reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'model__min_child_weight': hp.choice('min_child_weight', range(1, 10))
}

#### 10. Objective Function for Hyperparameter Optimization

In [161]:
def objective(params):
    pipeline.set_params(**params)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    mae = -cross_val_score(pipeline, X, y_transformed, cv=kf, scoring='neg_mean_absolute_error').mean()
    return mae


#### 11. Perform Hyperparameter Optimization

In [162]:
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=60,
            rstate=np.random.seed(42))


100%|███████████████████████████████████████████████| 60/60 [01:58<00:00,  1.98s/trial, best loss: 0.11519330585391441]


#### 12. Extract Best Hyperparameters

In [163]:
best_params = {
    'model__n_estimators': best['n_estimators'] + 100,
    'model__max_depth': best['max_depth'] + 1,
    'model__learning_rate': best['learning_rate'],
    'model__subsample': best['subsample'],
    'model__colsample_bytree': best['colsample_bytree'],
    'model__gamma': best['gamma'],
    'model__reg_alpha': best['reg_alpha'],
    'model__reg_lambda': best['reg_lambda'],
    'model__min_child_weight': best['min_child_weight']
}
best_params

{'model__n_estimators': 591,
 'model__max_depth': 7,
 'model__learning_rate': 0.2722400315824108,
 'model__subsample': 0.7848309690052149,
 'model__colsample_bytree': 0.7768686510543826,
 'model__gamma': 0.002466348183761847,
 'model__reg_alpha': 0.948656261315564,
 'model__reg_lambda': 0.6603642573618689,
 'model__min_child_weight': 8}

#### 13. Calculate mean absolute error

In [164]:
# Calculate and print mean absolute error
kf = KFold(n_splits=20, shuffle=True, random_state=42)
mae_scores = -cross_val_score(pipeline, X, y_transformed, cv=kf, scoring='neg_mean_absolute_error')
mean_mae = mae_scores.mean()

print("Mean Absolute Error:", mean_mae)

Mean Absolute Error: 0.11732251010827885


In [165]:
pipeline

# Export the model

#### Flexible Feature Preprocessing with ColumnTransformer

In [166]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('ord', ordinal_transformer, ordinal_cols),
        ('onehot', onehot_transformer, onehot_cols),
        ('target', target_encode_transformer, target_encode_cols)
    ])


#### Make a scikit learn pipeline

In [167]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(n_estimators=500))
])

#### Fit the model in pipeline

In [168]:
pipeline.fit(X,y_transformed)

####  Export the binary code model in pkl

In [169]:
# Save the trained model to a .pkl file
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

# save the dataframe to pkl file
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

#### Trying out the prediction

In [170]:
data = [['house', 'sector 54', 21, 21, '3+', 'New Property', 1162, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 54,21,21,3+,New Property,1162,0,0,unfurnished,Low,Low Floor


In [171]:
np.expm1(pipeline.predict(one_df))

array([4.19066], dtype=float32)

In [172]:
data = [['house', 'sector 49', 3, 3, '3+', 'New Property', 1750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 49,3,3,3+,New Property,1750,0,0,unfurnished,Low,Low Floor


In [173]:
np.expm1(pipeline.predict(one_df))

array([2.8800159], dtype=float32)