This notebook exists to mainly find out the best model used for prediction.

In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df_train = pd.read_csv('./data/iProperty_rent_to_be_trained.csv')

In [3]:
df_train.drop(['name','psf','area'], axis=1, inplace=True)

for col in ['price']:
    q_low = df_train[col].quantile(0.01)
    q_hi  = df_train[col].quantile(0.99)
    df_train = df_train[(df_train[col] < q_hi) & (df_train[col] > q_low)]
    
one_hot = pd.get_dummies(df_train[['district','state','type','details']])
df_train = df_train.join(one_hot).drop(['district','state','type','details'], axis=1)

for num in df_train.drop(['price'], axis=1).dtypes[(df_train.dtypes == "int64")].index:
    MMS = MinMaxScaler()
    MMS.fit(df_train[[num]])
    df_train[num] = MMS.transform(df_train[[num]])

In [4]:
x = df_train.drop(['price'],axis=1)
y = df_train['price']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

1. Gradient Boosting Regressor

In [5]:
gb = GradientBoostingRegressor()

In [20]:
gb.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [37]:
gb_hyperparameters = {
    'n_estimators': [100,50],
    'max_depth': [10,5],
    'min_samples_leaf': [2],
    'min_samples_split': [2],
    'loss': ['squared_error'],
    'ccp_alpha': [0.0,0.1],
    'min_weight_fraction_leaf': [0.0,0.1],
    'validation_fraction': [0.2,0.25],
}

In [38]:
model = GridSearchCV(estimator=gb, param_grid=gb_hyperparameters, cv=5, n_jobs=-1, verbose=4)
model.fit(x_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [45]:
pd.DataFrame.from_dict(model.cv_results_).sort_values(['mean_test_score','std_test_score'],ascending=False).T

Unnamed: 0,25,9,24,8,1,0,16,17,2,3,...,13,12,22,7,23,6,15,14,31,30
mean_fit_time,7.31406,8.099631,7.413385,7.769665,13.056433,12.586598,13.411874,13.705577,6.902035,6.451685,...,5.08788,5.077848,3.128745,2.694308,2.989899,2.577378,2.561467,2.597301,2.536283,2.520722
std_fit_time,0.115223,0.366482,0.246799,0.726787,0.721294,0.288218,0.129833,0.667461,0.205297,0.256926,...,0.180411,0.325209,0.405356,0.200284,0.283219,0.030409,0.1415,0.12525,0.098922,0.07821
mean_score_time,0.021884,0.029371,0.019049,0.020379,0.032848,0.028246,0.034236,0.032034,0.02131,0.026193,...,0.019215,0.018875,0.019901,0.022042,0.02408,0.019048,0.019396,0.015636,0.016261,0.015911
std_score_time,0.007649,0.004192,0.006125,0.00101,0.004788,0.002716,0.019526,0.011263,0.008593,0.004634,...,0.006108,0.006193,0.007576,0.007555,0.006995,0.006116,0.007418,0.000012,0.002492,0.000338
param_ccp_alpha,0.1,0.0,0.1,0.0,0.0,0.0,0.1,0.1,0.0,0.0,...,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.1,0.1
param_loss,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,...,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error
param_max_depth,5,5,5,5,10,10,10,10,10,10,...,5,5,10,10,10,10,5,5,5,5
param_min_samples_leaf,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
param_min_samples_split,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
param_min_weight_fraction_leaf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1


In [42]:
gb_best_params = model.best_params_

gb_best_params

{'ccp_alpha': 0.1,
 'loss': 'squared_error',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'validation_fraction': 0.25}

2. Decision Tree Regressor

In [12]:
dt = DecisionTreeRegressor()

In [14]:
DecisionTreeRegressor().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [29]:
dt_hyperparameters = {
    'min_weight_fraction_leaf': [0.0,0.1,0.2,0.3,0.4,0.5],
    'min_samples_leaf': [1,2,3],
    'ccp_alpha': [0.0,0.1,0.2],
    'criterion': ['squared_error','absolute_error','friedman_mse'],
}

In [None]:
model = GridSearchCV(estimator=dt, param_grid=dt_hyperparameters, cv=5, n_jobs=-1, verbose=4)
model.fit(x_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [None]:
pd.DataFrame.from_dict(model.cv_results_).sort_values('rank_test_score')