This notebook exists to mainly find out the best model used for prediction.

In [1]:
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split, GridSearchCV



In [2]:
df_train = pd.read_csv('../data/iProperty_sale_to_be_trained.csv')

In [3]:
df_train.drop(['name','psf','area'], axis=1, inplace=True)

for col in ['price']:
    q_low = df_train[col].quantile(0.01)
    q_hi  = df_train[col].quantile(0.99)
    df_train = df_train[(df_train[col] < q_hi) & (df_train[col] > q_low)]
    
one_hot = pd.get_dummies(df_train[['district','state','type','details']])
df_train = df_train.join(one_hot).drop(['district','state','type','details'], axis=1)

In [4]:
x = df_train.drop(['price'],axis=1)
y = df_train['price']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

1. Gradient Boosting Regressor

In [5]:
gb = GradientBoostingRegressor()

In [6]:
gb.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [7]:
gb_hyperparameters = {
    'n_estimators': [100,50],
    'max_depth': [10,5],
    'min_samples_leaf': [2],
    'min_samples_split': [2],
    'loss': ['squared_error'],
    'ccp_alpha': [0.0,0.1],
    'min_weight_fraction_leaf': [0.0,0.1],
    'validation_fraction': [0.2,0.25],
}

In [8]:
model = GridSearchCV(estimator=gb, param_grid=gb_hyperparameters, cv=5, n_jobs=-1, verbose=4)
model.fit(x_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [9]:
pd.DataFrame.from_dict(model.cv_results_).sort_values(['mean_test_score','std_test_score'],ascending=False).T

Unnamed: 0,1,16,17,0,2,18,3,19,8,25,...,20,4,6,23,7,22,15,14,30,31
mean_fit_time,28.859993,25.837187,25.506232,30.26842,13.553546,14.015599,13.428647,13.803553,14.137822,13.4684,...,10.48982,11.256066,5.857532,5.76925,6.029424,5.917736,5.109815,5.181103,5.577783,4.78551
std_fit_time,1.522008,0.681035,0.896112,0.331598,0.173598,0.911675,0.843682,0.261265,1.128035,0.227907,...,0.378871,0.884303,0.460134,0.455041,0.476078,0.225249,0.290564,0.24696,0.331765,0.454029
mean_score_time,0.061279,0.04232,0.062603,0.067159,0.044651,0.04268,0.032193,0.043722,0.032464,0.032633,...,0.034393,0.034011,0.038351,0.032952,0.03727,0.033395,0.035554,0.027895,0.026759,0.025182
std_score_time,0.025082,0.012942,0.023825,0.013218,0.01164,0.014102,0.000574,0.010602,0.000621,0.0009,...,0.007457,0.002634,0.018199,0.000184,0.006828,0.009898,0.006055,0.01411,0.008643,0.007804
param_ccp_alpha,0.0,0.1,0.1,0.0,0.0,0.1,0.0,0.1,0.0,0.1,...,0.1,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.1,0.1
param_loss,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,...,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error
param_max_depth,10,10,10,10,10,10,10,10,5,5,...,10,10,10,10,10,10,5,5,5,5
param_min_samples_leaf,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
param_min_samples_split,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
param_min_weight_fraction_leaf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1


In [10]:
gb_best_params = model.best_params_

gb_best_params

{'ccp_alpha': 0.0,
 'loss': 'squared_error',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'validation_fraction': 0.25}

In [11]:
model.best_params_

{'ccp_alpha': 0.0,
 'loss': 'squared_error',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'validation_fraction': 0.25}

2. Decision Tree Regressor

In [12]:
dt = DecisionTreeRegressor()

In [13]:
DecisionTreeRegressor().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [14]:
dt_hyperparameters = {
    'min_weight_fraction_leaf': [0.0,0.1,0.2,0.3,0.4,0.5],
    'min_samples_leaf': [1,2,3],
    'ccp_alpha': [0.0,0.1,0.2],
    'criterion': ['squared_error','absolute_error','friedman_mse'],
}

In [None]:
model = GridSearchCV(estimator=dt, param_grid=dt_hyperparameters, cv=5, n_jobs=-1, verbose=4)
model.fit(x_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [None]:
pd.DataFrame.from_dict(model.cv_results_).sort_values('rank_test_score')