In [35]:
import numpy as np
import pandas as pd

In [36]:
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

In [37]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [38]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [39]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [40]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished', 1.0:'semifurnished', 2.0:'furnished'})
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [41]:
X = df.drop(columns=['price'])
y = df['price']

In [42]:
y_transformed = np.log1p(y)

### Ordinal Encoding

In [43]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [44]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ],
    remainder = 'passthrough'
)

In [45]:
# Creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [46]:
# K-fold cross-validation
kfold = KFold(n_splits = 10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [47]:
scores.mean(), scores.std()

(np.float64(0.7363096633436828), np.float64(0.03238005754429936))

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [49]:
pipeline.fit(X_train, y_train)

In [50]:
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)

In [51]:
mean_absolute_error(np.expm1(y_test), y_pred)

np.float64(0.9463822160089356)

In [52]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [53]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [54]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees':ExtraTreesRegressor(),
    'gradient boosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [21]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [23]:
model_output

[['linear_reg',
  np.float64(0.7363096633436828),
  np.float64(0.9463822160089356)],
 ['svr', np.float64(0.7642012011196353), np.float64(0.8472636473483922)],
 ['ridge', np.float64(0.7363125343993554), np.float64(0.9463387741853386)],
 ['LASSO', np.float64(0.05943378064493573), np.float64(1.528905986892753)],
 ['decision tree',
  np.float64(0.7807613638666707),
  np.float64(0.748127821417852)],
 ['random forest',
  np.float64(0.8801994506665143),
  np.float64(0.524297545751117)],
 ['extra trees',
  np.float64(0.8682448804364702),
  np.float64(0.5454842669650655)],
 ['gradient boosting',
  np.float64(0.8727528001154086),
  np.float64(0.5758500041347059)],
 ['adaboost', np.float64(0.7494444040692857), np.float64(0.8632871406051902)],
 ['mlp', np.float64(0.8104676206390673), np.float64(0.7321535050304743)],
 ['xgboost', np.float64(0.8894876835260124), np.float64(0.5040475141482346)]]

In [24]:
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae']).sort_values(by='mae')

In [25]:
model_df

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.880199,0.524298
6,extra trees,0.868245,0.545484
7,gradient boosting,0.872753,0.57585
9,mlp,0.810468,0.732154
4,decision tree,0.780761,0.748128
1,svr,0.764201,0.847264
8,adaboost,0.749444,0.863287
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [55]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first'), ['sector', 'agePossession', 'furnishing_type'])
    ],
    remainder = 'passthrough'
)

In [56]:
# Creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [57]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [58]:
scores.mean()

np.float64(0.8546094810971422)

In [59]:
scores.std()

np.float64(0.015997422908695623)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [61]:
pipeline.fit(X_train, y_train)

In [62]:
y_pred = pipeline.predict(X_test)

In [63]:
y_pred = np.expm1(y_pred)

In [64]:
mean_absolute_error(np.expm1(y_test), y_pred)

np.float64(0.6497514315131458)

In [65]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [66]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees':ExtraTreesRegressor(),
    'gradient boosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [38]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [39]:
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae'])

In [40]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.895676,0.470743
5,random forest,0.891592,0.492176
10,xgboost,0.89585,0.493456
9,mlp,0.876509,0.524054
7,gradient boosting,0.876771,0.569691
0,linear_reg,0.854609,0.649751
2,ridge,0.854739,0.652915
4,decision tree,0.80264,0.702055
8,adaboost,0.749963,0.826217
1,svr,0.769741,0.834124


### OneHotEncoding with PCA

In [67]:
# Creating a column transformer for preprocessing

preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['sector', 'agePossession'])
    ],
    remainder = 'passthrough'
)

In [68]:
# Creating a Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [69]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [70]:
scores.mean()

np.float64(0.06225201431451135)

In [71]:
scores.std()

np.float64(0.01986059407164016)

### Target Encoding
- used for those column which have high cardinality
- each sector -> value mean
- avoid Data Leakage <> do split beforhand

In [72]:
import category_encoders as ce

columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [73]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [74]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')


In [75]:
scores.mean(), scores.std()

(np.float64(0.8295219182255362), np.float64(0.018384463379122782))

In [76]:
def scorer(model_name, model):
    output = []
    output.append(model_name)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [77]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree':DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees':ExtraTreesRegressor(),
    'gradient boosting':GradientBoostingRegressor(),
    'adaboost':AdaBoostRegressor(),
    'mlp':MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [52]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [53]:
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae'])

In [54]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.90059,0.456
6,extra trees,0.90223,0.462729
7,gradient boosting,0.889019,0.507802
4,decision tree,0.832753,0.554294
8,adaboost,0.816365,0.697681
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
9,mlp,0.85178,0.732885
1,svr,0.782917,0.818851


### Hyperparameter Tuning of Random Forest

In [78]:
from sklearn.model_selection import GridSearchCV

In [79]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10 , 20, 30],
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [80]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),  # Numerical features
        ('cat', OrdinalEncoder(), ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),  # Ordinal encoding for selected categorical features
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),  # One-hot encoding for 'agePossession'
        ('target_enc', ce.TargetEncoder(), ['sector'])  # Target encoding for 'sector'
    ],
    remainder='passthrough'  # Keep any other columns unchanged
)


In [81]:
# Creating a pipeline
pipeline = Pipeline( steps= [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [82]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)


In [83]:
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [76]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
306 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\DIVYANSHU\anaconda3\envs\dsmp\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\DIVYANSH

In [99]:
search.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [78]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [79]:
search.best_score_

np.float64(0.8903594697951759)

### XGBoost Hyperparameter Tuning 

In [90]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Define the hyperparameter distributions
param_grid = {
    'regressor__max_depth': stats.randint(3, 10),
    'regressor__learning_rate': stats.uniform(0.01, 0.1),
    'regressor__subsample': stats.uniform(0.5, 0.5),
    'regressor__n_estimators':stats.randint(50, 200)
}



In [91]:
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),  # Numerical features
        ('cat', OrdinalEncoder(), ['property_type', 'balcony', 'furnishing_type', 'luxury_category', 'floor_category']),  # Ordinal encoding for selected categorical features
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), ['agePossession']),  # One-hot encoding for 'agePossession'
        ('target_enc', ce.TargetEncoder(), ['sector'])  # Target encoding for 'sector'
    ],
    remainder='passthrough'  # Keep any other columns unchanged
)


In [92]:
# Creating a pipeline
pipeline = Pipeline( steps= [
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [93]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [94]:
# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, n_iter=10, cv=kfold, scoring='r2', n_jobs=-1)

In [95]:
random_search.fit(X, y_transformed)

In [96]:
random_search.best_estimator_

In [97]:
random_search.best_params_

{'regressor__learning_rate': np.float64(0.06344774855661249),
 'regressor__max_depth': 6,
 'regressor__n_estimators': 163,
 'regressor__subsample': np.float64(0.8493126642389985)}

In [98]:
random_search.best_score_

np.float64(0.90023834019893)