## Model Selection
### @cipher499
### 28/12/23

In [1]:
import numpy as np
import pandas as pd

In [2]:
# set the option to display all columns
pd.set_option('display.max_columns', None)

In [3]:
# load the dataset
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')
df.head()

Unnamed: 0,property_type,sector,price,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_category,floor_category
0,flat,sector 65,2.5,3.0,3.0,3,Relatively New,1654.0,0.0,0.0,0.0,0.0,0.0,0.0,budget,high floor
1,flat,sector 48,2.65,4.0,4.0,3+,Moderately Old,2134.0,1.0,1.0,0.0,0.0,0.0,1.0,high,high floor
2,flat,sector 85,1.2,2.0,2.0,3,Relatively New,1300.0,0.0,0.0,0.0,0.0,0.0,1.0,high,medium floor
3,flat,sector 107,0.52,3.0,2.0,2,Relatively New,717.0,0.0,0.0,0.0,0.0,1.0,0.0,budget,medium floor
4,flat,sohna road,0.54,2.0,1.0,3,New Property,828.0,0.0,0.0,0.0,0.0,0.0,0.0,medium,low floor


In [4]:
# drop the columns that ranked the lowest in feature importance
df.drop(columns=['others', 'pooja room'], inplace=True)
df.head()

Unnamed: 0,property_type,sector,price,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 65,2.5,3.0,3.0,3,Relatively New,1654.0,0.0,0.0,0.0,0.0,budget,high floor
1,flat,sector 48,2.65,4.0,4.0,3+,Moderately Old,2134.0,1.0,1.0,0.0,1.0,high,high floor
2,flat,sector 85,1.2,2.0,2.0,3,Relatively New,1300.0,0.0,0.0,0.0,1.0,high,medium floor
3,flat,sector 107,0.52,3.0,2.0,2,Relatively New,717.0,0.0,0.0,0.0,0.0,budget,medium floor
4,flat,sohna road,0.54,2.0,1.0,3,New Property,828.0,0.0,0.0,0.0,0.0,medium,low floor


In [5]:
# replace the numerical values in furnishing_type with str categories
df['furnishing_type'].replace({0.0:'unfurnished', 1.0:'semifurnished', 2.0:'furnished'}, inplace=True)
df.head()

Unnamed: 0,property_type,sector,price,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 65,2.5,3.0,3.0,3,Relatively New,1654.0,0.0,0.0,0.0,unfurnished,budget,high floor
1,flat,sector 48,2.65,4.0,4.0,3+,Moderately Old,2134.0,1.0,1.0,0.0,semifurnished,high,high floor
2,flat,sector 85,1.2,2.0,2.0,3,Relatively New,1300.0,0.0,0.0,0.0,semifurnished,high,medium floor
3,flat,sector 107,0.52,3.0,2.0,2,Relatively New,717.0,0.0,0.0,0.0,unfurnished,budget,medium floor
4,flat,sohna road,0.54,2.0,1.0,3,New Property,828.0,0.0,0.0,0.0,unfurnished,medium,low floor


In [16]:
# create the features and the target
X = df.drop(columns='price')
y = df['price']

# log transform the target variable to bring it closer to the normal distribution
y_transformed = np.log1p(y)

### Ordinal Encoding

In [7]:
X.columns

Index(['property_type', 'sector', 'bedrooms', 'bathrooms', 'balconies',
       'age_possession', 'built_up_area', 'study room', 'servant room',
       'store room', 'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [7]:
columns_to_encode = ['property_type', 'sector', 'balconies', 'age_possession', 'furnishing_type', 'luxury_category', 'floor_category']

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# create a column transformer object to transform the numerical and categorical columns
preprocessor = ColumnTransformer(
                   transformers=[
                       ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
                       ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode)
                       ],
                    remainder='passthrough'
                    )

In [57]:
# create a pipeline for preprocessing followed by fitting a linear regressor
pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('regressor', LinearRegression())
            ])

In [30]:
X['sector'].value_counts()

sohna road    163
sector 85     108
sector 102    107
sector 92     100
sector 69      93
             ... 
sector 88b      3
sector 73       3
sector 27       2
sector 37       1
sector 17a      1
Name: sector, Length: 112, dtype: int64

In [10]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# create a kfold object that splits the data into 10 and shuffles before each split
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [21]:
# get the cross validation score using R2_squared 
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [45]:
oe = OrdinalEncoder()
X['sector'] = oe.fit_transform(X[['sector']])                               

In [22]:
scores.mean(), scores.std()

(0.7417582897284817, 0.02499369968112312)

In [23]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [24]:
# fit the pipeline on the training data
pipeline.fit(X_train, y_train)

In [25]:
# get the predictions from the trained model
y_pred = pipeline.predict(X_test)
# transform to the original scale
y_pred = np.expm1(y_pred)

In [26]:
mean_absolute_error(np.expm1(y_test), y_pred)

0.8170658219435963

In [66]:
def scorer(model_name, model):
    """
    takes the model name and model constructor as inputs
    and trains the pipeline on the training set
    returns the model name, cv score, and mae as a list
    """
    output = []  
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed,cv=kfold, scoring='r2')
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)   
    pipeline.fit(X_train, y_train)
    
    y_pred = np.expm1(pipeline.predict(X_test))  
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [12]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

# create a dictionary with regression model names as keys and their constructors as values
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [67]:
# iterate over the dictionary and feed the items to the scorer function 
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [68]:
model_output

[['linear_reg', 0.7417582897284817, 0.8170658219435963],
 ['svr', 0.7610805404179728, 0.8238475641279626],
 ['ridge', 0.7417609242994277, 0.8169963234760353],
 ['LASSO', 0.05787353508809641, 1.4843625217979457],
 ['decision tree', 0.7811952607279748, 0.6695953668212529],
 ['random forest', 0.884890563547143, 0.49003498405554763],
 ['extra trees', 0.8693541617342408, 0.5201525944577925],
 ['gradient boosting', 0.8784615297367251, 0.5474151307090133],
 ['adaboost', 0.7620345521093589, 0.7914242180638115],
 ['mlp', 0.8106241140902265, 0.6921981326595785],
 ['xgboost', 0.894166365949134, 0.4742831668002025]]

In [31]:
# convert the list into a dataframe
model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae'])
model_df.sort_values(by='mae')

Unnamed: 0,name,r2,mae
10,xgboost,0.894166,0.474283
5,random forest,0.88649,0.48782
6,extra trees,0.870337,0.519119
7,gradient boosting,0.878523,0.547047
4,decision tree,0.783493,0.670778
9,mlp,0.80889,0.693459
2,ridge,0.741761,0.816996
0,linear_reg,0.741758,0.817066
8,adaboost,0.760786,0.819403
1,svr,0.761081,0.823848


- Tree based models performed the best as was expected in the case of ordinal encoding.

### One-Hot Encoding

In [32]:
# create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore'),['sector','age_possession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [33]:
# create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [34]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [35]:
scores.mean()

0.8574997354110527

In [36]:
scores.std()

0.024088254922215134

In [37]:
# split the data and fit the pipeline on the training set
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train, y_train)

In [38]:
# get the predictions
y_pred = pipeline.predict(X_test)
# transform back to the original scale
y_pred = np.expm1(y_pred)
mean_absolute_error(np.expm1(y_test), y_pred)

0.590822901280835

In [39]:
# iterate over the dictionary and feed the items to the scorer function 
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [40]:
# convert the list into a dataframe
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.895461,0.462556
6,extra trees,0.896983,0.465529
10,xgboost,0.899448,0.474707
7,gradient boosting,0.879235,0.556788
0,linear_reg,0.8575,0.590823
9,mlp,0.873218,0.595307
2,ridge,0.857614,0.59927
4,decision tree,0.808596,0.614682
8,adaboost,0.759673,0.801978
1,svr,0.764293,0.822628


### One-Hot Encoding with PCA
- OHE increases the dimensionality of data
- PCA can be used for dimensionality reduction

In [46]:
# create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), ['sector','age_possession'])
    ], 
    remainder='passthrough'
)

In [53]:
from sklearn.decomposition import PCA

pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('pca', PCA(n_components=0.95)),
            ('regressor', LinearRegression())
            ])

In [54]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [55]:
scores.mean(), scores.std()

(0.060702623948214116, 0.023508347414413558)

In [69]:
def scorer(model_name, model):
    """
    takes the model name and model constructor as inputs
    and trains the pipeline on the training set
    returns the model name, cv score, and mae as a list
    """
    output = []  
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed,cv=kfold, scoring='r2')
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)   
    pipeline.fit(X_train, y_train)
    
    y_pred = np.expm1(pipeline.predict(X_test))  
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [71]:
model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [72]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.76057,0.743359
6,extra trees,0.730058,0.783154
4,decision tree,0.692961,0.834074
10,xgboost,0.61488,0.942535
7,gradient boosting,0.611893,1.005726
1,svr,0.227876,1.333881
8,adaboost,0.30766,1.342594
9,mlp,0.217113,1.402854
3,LASSO,0.058067,1.484294
2,ridge,0.0607,1.490718


### Target Encoding
- Used to transform features having high cardinality
- Target encoding -> groupby on feature columns; mean of target column
     - can lead to data leakage 
     - always use it on training data after splitting the data
- Gives better results with tree based algorithms than reg. algorithms

In [13]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balconies', 'age_possession', 'furnishing_type', 'luxury_category', 'floor_category']

# create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),['age_possession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [75]:
# create a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [76]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [77]:
scores.mean(),scores.std()

(0.8290253101264102, 0.022486429572588985)

In [78]:
def scorer(model_name, model):
    """
    takes the model name and model constructor as inputs
    and trains the pipeline on the training set
    returns the model name, cv score, and mae as a list
    """
    output = []  
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed,cv=kfold, scoring='r2')
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)   
    pipeline.fit(X_train, y_train)
    
    y_pred = np.expm1(pipeline.predict(X_test))  
    output.append(mean_absolute_error(np.expm1(y_test), y_pred))
    
    return output

In [79]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.907317,0.453302
5,random forest,0.906098,0.455762
6,extra trees,0.902819,0.457894
7,gradient boosting,0.89115,0.534789
4,decision tree,0.83244,0.541481
9,mlp,0.844822,0.599962
2,ridge,0.829045,0.674564
0,linear_reg,0.829025,0.674568
8,adaboost,0.814858,0.675765
1,svr,0.776905,0.803074


### Hyperparameter Tuning

- XGBoost -> tune hyperparameters using bayesian search(hyperop) // kaggle comp hack

In [44]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import train_test_split

space = {
    'max_depth': hp.uniform('max_depth', 3, 18, 1),
    'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 1, 0.5),
    'min_samples_split': hp.uniform('min_samples_split', 0, 0.5, 1),
    'n_estimators': hp.choice('n_estimators', [10, 50, 300, 750, 1000, 1200, 1500])
}

In [32]:
space

{'criterion': <hyperopt.pyll.base.Apply at 0x7f5a736c1f60>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x7f5a7375a440>,
 'max_features': <hyperopt.pyll.base.Apply at 0x7f5a73759780>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x7f5a73713f70>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x7f5a7375ad40>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x7f5a7375a800>}

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)   

In [34]:
def obj_fn(params):
	model = RandomForestRegressor(
				criterion=params['criterion'],
				max_depth=params['max_depth'],
				max_features=params['max_features'],
				min_samples_leaf=params['min_samples_leaf'],
				min_samples_split=params['min_samples_split'],
				n_estimators=params['n_estimators']
				)
				
	accuracy = cross_val_score(model, X_train, y_train, cv=5).mean()
	
	return {'loss': accuracy, 'status':STATUS_OK}

In [None]:
trials = Trials()
best = fmin(fn=obj_fn,
		space=space,
		algo=tpe.suggest,
		max_evals=80,
		trials=trials)
best

In [30]:
X_train.shape, y_train.shape

((2844, 13), (2844,))

In [17]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np

# Assuming you have a feature matrix X and target variable y
# For illustration, let's create a synthetic dataset
# np.random.seed(42)
# X = np.random.rand(100, 5)
# y = 2 * X[:, 0] + 3 * X[:, 1] + 0.5 * X[:, 2] + np.random.randn(100)

# Define the hyperparameter search space
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_depth': hp.choice('max_depth', range(1, 10)),
    'n_estimators': hp.choice('n_estimators', range(50, 200)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'gamma': hp.uniform('gamma', 0, 1)
}

In [18]:
# Define the objective function (minimize negative mean squared error)
def objective(params):
    model = xgb.XGBRegressor(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        gamma=params['gamma']
    )
    
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model and get predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    # Compute mean squared error (the objective to minimize)
    mse = np.mean((y_val - y_pred) ** 2)
    
    # Return the negative mean squared error (to be minimized)
    return {'loss': mse, 'status': STATUS_OK}

In [None]:
# Run hyperparameter optimization
trials = Trials()
best_params = fmin(fn=objective,
                   space=space,
                   algo=tpe.suggest,
                   max_evals=100,
                   trials=trials)

print("Best Hyperparameters:")
print(best_params)

### Using GridSearchCV

In [22]:
from sklearn.model_selection import GridSearchCV

In [43]:
param_grid = {
            'regressor__n_estimators': [50, 100, 200, 300],
            'regressor__max_depth': [None, 10, 20, 30],
            'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
            'regressor__max_features': ['log2', 'sqrt']
            }

In [50]:
columns_to_encode = ['property_type', 'sector', 'balconies', 'age_possession', 'furnishing_type', 'luxury_category', 'floor_category']


preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
                    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
                    ('cat1', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), ['age_possession']),
                    ('target_enc', ce.TargetEncoder(), ['sector'])
                ],
                remainder='passthrough'
                )

In [51]:
pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor())
])

In [52]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=1)
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


In [39]:
X.columns

Index(['property_type', 'sector', 'bedrooms', 'bathrooms', 'balconies',
       'age_possession', 'built_up_area', 'study room', 'servant room',
       'store room', 'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [40]:
X.floor_category.value_counts()

medium floor    1797
low floor        954
high floor       804
Name: floor_category, dtype: int64

In [53]:
final_pipe = search.best_estimator_

In [54]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [55]:
search.best_score_

0.9027938390879473

### Using Hyperopt

In [125]:
space = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1),
    'max_features': hp.choice('max_features', [None, 'sqrt', 'log2']),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 1),
    'min_samples_split': hp.uniform('min_samples_split', 0.01, 1.0),  # Adjust the range as needed
    'n_estimators': hp.choice('n_estimators', [10, 50, 300, 750, 1000, 1200, 1500])
}

In [126]:
columns_to_encode = ['property_type', 'sector', 'balconies', 'age_possession', 'furnishing_type', 'luxury_category', 'floor_category']


preprocessor = ColumnTransformer(
                transformers=[
                    ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area', 'servant room', 'store room']),
                    ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
                    ('cat1', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), ['age_possession']),
                    ('target_enc', ce.TargetEncoder(), ['sector'])
                ],
                remainder='passthrough'
                )

In [129]:
def obj_fn(params):
    model = RandomForestRegressor(
        max_depth=int(params['max_depth']),
        max_features=params['max_features'],
        min_samples_leaf=params['min_samples_leaf'],
        min_samples_split=params['min_samples_split'],
        n_estimators=params['n_estimators']
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    score = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2').mean()
    return {'loss': -score, 'status': STATUS_OK}

In [130]:
trials = Trials()
best_params = fmin(fn=obj_fn,
                   space=space,
                   algo=tpe.suggest,
                   max_evals=100,
                   trials=trials)

100%|██████| 100/100 [13:31<00:00,  8.11s/trial, best loss: -0.8404645252293823]


In [101]:
best_params

{'max_depth': 11.0,
 'max_features': 1,
 'min_samples_leaf': 0.0004460952328575289,
 'min_samples_split': 0.0392402749560397,
 'n_estimators': 3}

In [None]:
# for i, trial in enumerate(trials.trials):
#     print(f"Trial {i + 1}:")
#     print(f"  Status: {trial['result']['status']}")
#     print(f"  Loss: {trial['result']['loss']}")
#     print(f"  Params: {trial['misc']['vals']}")
#     print("\n")

In [144]:
rf = RandomForestRegressor(
    max_depth = int(11.0),
    max_features = int(2),
    min_samples_leaf = 0.003711990899788821,
    min_samples_split = 0.01199136863810941,
    n_estimators = int(1000))

In [145]:
pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', rf)
    ])

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
accuracy = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2').mean()

In [146]:
accuracy

0.8257149876168951

### Exporting the best model

In [147]:
final_pipe.fit(X, y_transformed)

In [148]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(final_pipe, file)

In [149]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [150]:
X

Unnamed: 0,property_type,sector,bedrooms,bathrooms,balconies,age_possession,built_up_area,study room,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 65,3.0,3.0,3,Relatively New,1654.0,0.0,0.0,0.0,unfurnished,budget,high floor
1,flat,sector 48,4.0,4.0,3+,Moderately Old,2134.0,1.0,1.0,0.0,semifurnished,high,high floor
2,flat,sector 85,2.0,2.0,3,Relatively New,1300.0,0.0,0.0,0.0,semifurnished,high,medium floor
3,flat,sector 107,3.0,2.0,2,Relatively New,717.0,0.0,0.0,0.0,unfurnished,budget,medium floor
4,flat,sohna road,2.0,1.0,3,New Property,828.0,0.0,0.0,0.0,unfurnished,medium,low floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550,flat,sector 85,2.0,2.0,3,Relatively New,1484.0,0.0,0.0,0.0,semifurnished,high,high floor
3551,flat,sector 80,3.0,4.0,2,Moderately Old,1854.0,0.0,0.0,1.0,unfurnished,medium,medium floor
3552,flat,sector 67,4.0,4.0,3+,Moderately Old,2127.0,0.0,1.0,0.0,semifurnished,medium,medium floor
3553,house,sector 76,2.0,2.0,2,New Property,745.0,0.0,0.0,0.0,unfurnished,budget,high floor


In [157]:
test = ['flat', 'sector 65', 3.0, 3.0, '3', 'Relatively New', 1654.0, 0.0,
       0.0, 0.0, 'unfurnished', 'budget', 'high floor']
len(test)

13

In [None]:
final_pipe.predict(pd.DataFrame(test))