This notebook exists to mainly find out the best model used for prediction.

In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split, GridSearchCV



In [2]:
df_train = pd.read_csv('../data/iProperty_sale_to_be_trained.csv')

In [3]:
df_train.drop(['name','psf','area'], axis=1, inplace=True)

for col in ['price']:
    q_low = df_train[col].quantile(0.01)
    q_hi  = df_train[col].quantile(0.99)
    df_train = df_train[(df_train[col] < q_hi) & (df_train[col] > q_low)]
    
one_hot = pd.get_dummies(df_train[['district','state','type','details']])
df_train = df_train.join(one_hot).drop(['district','state','type','details'], axis=1)

for num in df_train.drop(['price'], axis=1).dtypes[(df_train.dtypes == "int64")].index:
    MMS = MinMaxScaler()
    MMS.fit(df_train[[num]])
    df_train[num] = MMS.transform(df_train[[num]])

In [4]:
x = df_train.drop(['price'],axis=1)
y = df_train['price']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

1. Gradient Boosting Regressor

In [5]:
gb = GradientBoostingRegressor()

In [6]:
gb.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [7]:
gb_hyperparameters = {
    'n_estimators': [100,50],
    'max_depth': [10,5],
    'min_samples_leaf': [2],
    'min_samples_split': [2],
    'loss': ['squared_error'],
    'ccp_alpha': [0.0,0.1],
    'min_weight_fraction_leaf': [0.0,0.1],
    'validation_fraction': [0.2,0.25],
}

In [8]:
model = GridSearchCV(estimator=gb, param_grid=gb_hyperparameters, cv=5, n_jobs=-1, verbose=4)
model.fit(x_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [9]:
pd.DataFrame.from_dict(model.cv_results_).sort_values(['mean_test_score','std_test_score'],ascending=False).T

Unnamed: 0,17,1,16,0,18,19,2,3,8,24,...,13,28,23,6,7,22,14,15,30,31
mean_fit_time,25.634209,26.59329,25.72546,25.295355,13.382283,12.714795,12.828508,13.772474,13.312517,13.935005,...,10.55324,10.389012,5.592159,5.671615,5.320617,5.986868,5.875837,5.765004,4.675166,4.35609
std_fit_time,0.893903,1.076439,0.272382,0.636852,0.815932,0.840329,0.217561,0.832767,0.1215,0.819607,...,1.031894,0.419354,0.334857,0.168565,0.207196,0.325319,0.749658,0.294992,0.043939,0.432872
mean_score_time,0.057577,0.085058,0.055404,0.051912,0.050311,0.027367,0.031293,0.042662,0.040246,0.028994,...,0.03571,0.03596,0.033185,0.02783,0.028317,0.029848,0.026442,0.035808,0.028363,0.025905
std_score_time,0.00853,0.027161,0.009289,0.016715,0.010948,0.012562,0.001548,0.020096,0.014325,0.004975,...,0.005189,0.00595,0.010097,0.006465,0.006994,0.017837,0.008097,0.015192,0.006377,0.00714
param_ccp_alpha,0.1,0.0,0.1,0.0,0.1,0.1,0.0,0.0,0.0,0.1,...,0.0,0.1,0.1,0.0,0.0,0.1,0.0,0.0,0.1,0.1
param_loss,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,...,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error,squared_error
param_max_depth,10,10,10,10,10,10,10,10,5,5,...,5,5,10,10,10,10,5,5,5,5
param_min_samples_leaf,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
param_min_samples_split,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
param_min_weight_fraction_leaf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1


In [42]:
gb_best_params = model.best_params_

gb_best_params

{'ccp_alpha': 0.1,
 'loss': 'squared_error',
 'max_depth': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'validation_fraction': 0.25}

In [10]:
model.best_params_

{'ccp_alpha': 0.1,
 'loss': 'squared_error',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'validation_fraction': 0.25}

2. Decision Tree Regressor

In [12]:
dt = DecisionTreeRegressor()

In [14]:
DecisionTreeRegressor().get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [29]:
dt_hyperparameters = {
    'min_weight_fraction_leaf': [0.0,0.1,0.2,0.3,0.4,0.5],
    'min_samples_leaf': [1,2,3],
    'ccp_alpha': [0.0,0.1,0.2],
    'criterion': ['squared_error','absolute_error','friedman_mse'],
}

In [None]:
model = GridSearchCV(estimator=dt, param_grid=dt_hyperparameters, cv=5, n_jobs=-1, verbose=4)
model.fit(x_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [None]:
pd.DataFrame.from_dict(model.cv_results_).sort_values('rank_test_score')