In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='dark')

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
le = LabelEncoder()

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error, r2_score

import os

In [4]:
path = os.getcwd()

In [5]:
train = pd.read_csv(path + "\\Train.csv")
test = pd.read_csv(path+"\\Test.csv")
sample_sub = pd.read_csv(path+"\\sample_submission.csv")

In [6]:
train['TARGET(PRICE_IN_LACS)']=np.log1p(train['TARGET(PRICE_IN_LACS)'])

In [7]:
train

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,TARGET(PRICE_IN_LACS)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960,4.025352
1,Dealer,0,0,2,BHK,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,3.951244
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,3.784190
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500,4.151040
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911,4.119037
...,...,...,...,...,...,...,...,...,...,...,...,...
29446,Owner,0,0,3,BHK,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277,3.828641
29447,Owner,0,0,2,BHK,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721,2.833213
29448,Dealer,0,0,2,BHK,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002,3.335770
29449,Owner,0,0,2,BHK,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910,4.219508


---
# Preprocessing

In [8]:
df = pd.concat([train, test], axis = 0).reset_index(drop=True)
df.shape

(98171, 12)

In [9]:
df['SQUARE_FT'] = np.log(df['SQUARE_FT'])

In [10]:
df['POSTED_BY'].replace( {'Owner' : 0, 'Builder' : 1, 'Dealer' : 2}, inplace = True)
df['BHK_OR_RK'].replace( {'BHK':1, 'RK': 0}, inplace = True)

df['BHK_NO.'].replace( {16:15, 17:15, 18:15, 31:20, 13:11}, inplace= True)

In [11]:
df['POSTED_BY'] = df['POSTED_BY'].astype('uint8')
df['UNDER_CONSTRUCTION'] = df['UNDER_CONSTRUCTION'].astype('bool')
df['RERA'] = df['RERA'].astype('bool')
df['BHK_NO.'] = df['BHK_NO.'].astype('int8')
df['BHK_OR_RK'] = df['BHK_OR_RK'].astype('bool')
df['READY_TO_MOVE'] = df['READY_TO_MOVE'].astype('bool')
df['RESALE'] = df['RESALE'].astype('bool')

---
# Feature Engineering

### 1. Extracting Features

In [12]:
df['City'] = df['ADDRESS'].apply(lambda x: x.split(',')[-1])

In [13]:
df['Address'] = df['ADDRESS'].apply(lambda x: x.split(',')[0])

In [14]:
df.drop('ADDRESS', axis = 1, inplace=True)

### 2. Grouping Features

In [15]:
df['median_sqft_per_bhkno'] = df.groupby('BHK_NO.')['SQUARE_FT'].transform('median')
df['min_sqft_per_bhkno'] = df.groupby('BHK_NO.')['SQUARE_FT'].transform('min')
df['max_sqft_per_bhkno'] = df.groupby('BHK_NO.')['SQUARE_FT'].transform('max')

In [16]:
df['mean_sqft_per_location'] = df.groupby('LONGITUDE')['SQUARE_FT'].transform('median')
df['min_sqft_per_location'] = df.groupby('LONGITUDE')['SQUARE_FT'].transform('min')
df['max_sqft_per_location'] = df.groupby('LONGITUDE')['SQUARE_FT'].transform('max')

In [17]:
df['median_sqft_per_city'] = df.groupby('City')['SQUARE_FT'].transform('median')
df['min_sqft_per_city'] = df.groupby('City')['SQUARE_FT'].transform('min')
df['max_sqft_per_city'] = df.groupby('City')['SQUARE_FT'].transform('max')

In [18]:
df['median_sqft_per_address'] = df.groupby('Address')['SQUARE_FT'].transform('median')
df['min_sqft_per_address'] = df.groupby('Address')['SQUARE_FT'].transform('min')
df['max_sqft_per_address'] = df.groupby('Address')['SQUARE_FT'].transform('max')

---

In [19]:
df['sqft_per_room'] = df.apply(lambda x: x['SQUARE_FT']/x['BHK_NO.'], axis = 1)

---

---
### Price per SQFT

In [20]:
train['City'] = train['ADDRESS'].apply(lambda x: x.split(',')[-1])
test['City'] = test['ADDRESS'].apply(lambda x: x.split(',')[-1])

In [21]:
a = np.array(train['City'].unique())
b = np.array(test['City'].unique())

In [22]:
city_replace = {'Alappuzha' : 'Kochi','Amreli' : 'Rajkot','Azamgarh' : 'Varanasi','Barmer' : 'Jodhpur','Barnala' : 'Ludhiana',
                'Bellary': 'Anantapur','Bhilwara' : 'Ajmer','Bhusawal' : 'Dhule','Birbhum' : 'Bardhaman','Bulandshahr' : 'Hapur',
                'Burhanpur' : 'Jalgaon','Chittorgarh' : 'Udaipur','Contai' : 'Kharagpur','Dewas' : 'Indore','Dhar' : 'Indore',
                'Dhenkanal' : 'Cuttack','Dindigul' : 'Madurai','Firozabad' : 'Agra','Gangtok' : 'Darjeeling','Gorakhpur' : 'Varanasi','Gudivada' : 'Vijayawada',
                'Jaisalmer' : 'Jaipur','Jalpaiguri' : 'Siliguri','Jhajjar' : 'Rohtak','Jorhat' : 'Nagaon','Kaithal' : 'Patiala','Karimnagar' : 'Warangal',
                'Karur' : 'Salem','Katni' : 'Jabalpur','Kaushambi' : 'Satna','Khandwa' : 'Jalgaon','Kolar' : 'Vellore','Malegaon' : 'Dhule',
                'Mancherial' : 'Warangal','Mandsaur' : 'Ujjain','Morena' : 'Gwalior','Namakkal' : 'Salem','Palani' : 'Madurai','Panchmahal' : 'Godhra',
                'Patan' : 'Gandhinagar','Pathankot' : 'Jammu','Pollachi' : 'Coimbatore','Purulia' : 'Dhanbad','Raebareli' : 'Lucknow','Rajsamand' : 'Udaipur',
                'Ramgarh' : 'Nainital','Rampur' : 'Moradabad','Ratlam' : 'Ujjain','Rupnagar' : 'Chandigarh','Shimoga' : 'Mangalore','Sirsa' : 'Patiala',
                'Sivasagar' : 'Dibrugarh','Tezpur' : 'Guwahati','Theni' : 'Madurai','Thiruvarur' : 'Thanjavur','Tiruchengode' : 'Erode',
                'Tumkur' : 'Bangalore','Viramgam' : 'Gandhinagar','Wayanad' : 'Ooty'}

In [23]:
replace_list = list(np.setdiff1d(b,a))

In [24]:
train['price_per_sqft'] = train['TARGET(PRICE_IN_LACS)']/train['SQUARE_FT']

mean_price_per_sqft = train.groupby('City')['price_per_sqft'].mean().to_dict()
median_price_per_sqft = train.groupby('City')['price_per_sqft'].median().to_dict()
min_price_per_sqft = train.groupby('City')['price_per_sqft'].min().to_dict()
max_price_per_sqft = train.groupby('City')['price_per_sqft'].max().to_dict()

In [25]:
def price_per_sqft_imputer(x, dictionary):
    if x in replace_list:
        return -1 #dictionary[city_replace[x]]
    return dictionary[x]

In [26]:
# Per sqft price w.r.t. City

df['mean_price_per_sqft'] = df['City'].apply(lambda x : price_per_sqft_imputer(x, mean_price_per_sqft))
df['median_price_per_sqft'] = df['City'].apply(lambda x : price_per_sqft_imputer(x, median_price_per_sqft))

df['min_price_per_sqft'] = df['City'].apply(lambda x : price_per_sqft_imputer(x, min_price_per_sqft))
df['max_price_per_sqft'] = df['City'].apply(lambda x : price_per_sqft_imputer(x, max_price_per_sqft))

In [27]:
median_price_per_city_dict = train.groupby('City')['TARGET(PRICE_IN_LACS)'].median().to_dict()
min_price_per_city_dict = train.groupby('City')['TARGET(PRICE_IN_LACS)'].min().to_dict()
max_price_per_city_dict = train.groupby('City')['TARGET(PRICE_IN_LACS)'].max().to_dict()

In [28]:
df['med_price_per_city'] = df['City'].apply(lambda x : median_price_per_city_dict[ city_replace[x]] if x in replace_list else median_price_per_city_dict[x])
df['min_price_per_city'] = df['City'].apply(lambda x : min_price_per_city_dict[ city_replace[x]] if x in replace_list else min_price_per_city_dict[x])
df['max_price_per_city'] = df['City'].apply(lambda x : max_price_per_city_dict[ city_replace[x]] if x in replace_list else max_price_per_city_dict[x])

In [29]:
df['exp'] = df['med_price_per_city']/df.groupby('City')['med_price_per_city'].transform('mean')

---
### Preprocessing New Features

In [30]:
col_ls = ['SQUARE_FT', 'LONGITUDE', 'LATITUDE','median_sqft_per_bhkno',
       'min_sqft_per_bhkno', 'max_sqft_per_bhkno', 'mean_sqft_per_location',
       'min_sqft_per_location', 'max_sqft_per_location',
       'median_sqft_per_city', 'min_sqft_per_city', 'max_sqft_per_city',
       'median_sqft_per_address', 'min_sqft_per_address',
       'max_sqft_per_address', 'sqft_per_room', 'mean_price_per_sqft',
       'median_price_per_sqft', 'min_price_per_sqft', 'max_price_per_sqft',
       'med_price_per_city', 'min_price_per_city', 'max_price_per_city']


for col in col_ls:
    df[col] = df[col].astype('float32')

In [31]:
df[[ 'City', 'Address']] = df[[ 'City', 'Address']].apply(le.fit_transform)

In [32]:
train_proc, test_proc = df[:train.shape[0]], df[train.shape[0]:].reset_index(drop = True)

In [33]:
target = 'TARGET(PRICE_IN_LACS)'


features = [col for col in df.columns if col not in ([target])]

In [34]:
len(features)

33

In [290]:
trn, val = train_test_split(train_proc, test_size = 0.2, random_state = 1999)

##### Input for model
X_trn, X_val = trn[features], val[features]

##### Target column
y_trn, y_val = trn[target], val[target]

##### Features for test data that we will be predicting
X_test = test_proc[features]



## Basic Model Building

In [291]:
%%time
lgb = LGBMRegressor(random_state=1999)

lgb.fit(X_trn, y_trn)

preds = lgb.predict(X_val)
preds = np.abs(preds)

error = np.sqrt(mean_squared_error(y_val, preds))
print(f'mean_squared_log_error is : {error}')

mean_squared_log_error is : 0.29811099376619293
Wall time: 307 ms


In [257]:
%%time

xgb = XGBRegressor()

xgb.fit(X_trn, y_trn)
preds = xgb.predict(X_val)
preds = np.abs(preds)

error = np.sqrt(mean_squared_error(y_val, preds))

print(f'mean_squared_log_error is : {error}')

mean_squared_log_error is : 0.29412312577766475
Wall time: 1.28 s


In [258]:
%%time

rf = RandomForestRegressor(random_state = 1999, n_jobs = -1)

rf.fit(X_trn, y_trn)
preds = rf.predict(X_val)
preds = np.abs(preds)

error = np.sqrt(mean_squared_error(y_val, preds))

print(f'mean_squared_log_error is : {error}')

mean_squared_log_error is : 0.28773957037857306
Wall time: 3.03 s


---
## Cross Validation Strategy

In [35]:
# Cross Validation for Boosting
def cross_val(regressor, train, test, features, name):
    N_splits = 5
    
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    stratified_target = pd.qcut( train[target], 10, labels=False, duplicates='drop')
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n================================Fold{index + 1}===================================')
        
        #### Train Set
        X_trn, y_trn = train[features].iloc[trn_idx], train[target].iloc[trn_idx]
        
        #### Validation Set
        X_val, y_val = train[features].iloc[val_idx], train[target].iloc[val_idx]
        
        #### Test Set
        X_test = test[features]
        
        if name != 'cat':
            #### Scaling Data ####
            scaler = StandardScaler()
            _ = scaler.fit(X_trn)
            X_trn = scaler.transform(X_trn)
            X_val = scaler.transform(X_val)
            X_test = scaler.transform(X_test)
        
        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
        
        ############ Predicting #############
        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))
        
        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Log Mean Squared Error for Validation set is : {error}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
        
    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    print(f'\n\Root Log Mean Squared Error for oofs is {total_error}')
    
    return oofs, preds

In [36]:
def normal_cross_val(regressor, train, test, features):
    N_splits = 5
    
    oofs = np.zeros(len(train))
    preds = np.zeros(len(test))
    
    target_col = train[target]
    
    folds = StratifiedKFold(n_splits = N_splits, shuffle = True,random_state = 1999)
    stratified_target = pd.qcut( train[target], 10, labels=False, duplicates='drop')
    for index, (trn_idx, val_idx) in enumerate(folds.split(train, stratified_target)):
        print(f'\n================================Fold{index + 1}===================================')
        
        #### Train Set
        X_trn, y_trn = train[features].iloc[trn_idx], train[target].iloc[trn_idx]
        
        #### Validation Set
        X_val, y_val = train[features].iloc[val_idx], train[target].iloc[val_idx]
        
        #### Test Set
        X_test = test[features]
        
        #### Scaling Data ####
        scaler = StandardScaler()
        _ = scaler.fit(X_trn)
        
        X_trn = scaler.transform(X_trn)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
        
        ############ Fitting #############
        _ = regressor.fit(X_trn, y_trn)
        
        ############ Predicting #############
        val_preds = np.abs(regressor.predict(X_val))
        test_preds = np.abs(regressor.predict(X_test))
        
        error = np.sqrt(mean_squared_error(y_val, val_preds))
        print(f'\n Root Log Mean Squared Error for Validation set is : {error}')
        
        oofs[val_idx] = val_preds
        preds += test_preds / N_splits
        
    total_error = np.sqrt(mean_squared_error(target_col, oofs))
    print(f'\n\Root Log Mean Squared Error for oofs is {total_error}')
    
    return oofs, preds

---
# Predicting With Tuned Models

In [1647]:
%%time
rf_oofs, rf_preds = normal_cross_val(rf, train_proc, test_proc, features)



 Root Log Mean Squared Error for Validation set is : 0.2763880933616468


 Root Log Mean Squared Error for Validation set is : 0.27755309309921083


 Root Log Mean Squared Error for Validation set is : 0.29167454288372224


 Root Log Mean Squared Error for Validation set is : 0.28359457259551735


 Root Log Mean Squared Error for Validation set is : 0.2922550468744347

\Root Log Mean Squared Error for oofs is 0.2843723863134213
Wall time: 3min 44s


In [1648]:
%%time
lgb_oofs, lgb_preds = cross_val(lgb, train_proc, test_proc, features, 'lgb')



 Root Log Mean Squared Error for Validation set is : 0.26629946870958443


 Root Log Mean Squared Error for Validation set is : 0.2693908562460813


 Root Log Mean Squared Error for Validation set is : 0.28204591075507646


 Root Log Mean Squared Error for Validation set is : 0.2669893360007582


 Root Log Mean Squared Error for Validation set is : 0.2807970014732888

\Root Log Mean Squared Error for oofs is 0.273190910022824
Wall time: 22.9 s


In [1649]:
%%time

xgb_oofs, xgb_preds = cross_val(xgb, train_proc, test_proc, features, 'xgb')



 Root Log Mean Squared Error for Validation set is : 0.2584373087548216


 Root Log Mean Squared Error for Validation set is : 0.26075218361136987


 Root Log Mean Squared Error for Validation set is : 0.27776801953128816


 Root Log Mean Squared Error for Validation set is : 0.26666927488703795


 Root Log Mean Squared Error for Validation set is : 0.27077540048959825

\Root Log Mean Squared Error for oofs is 0.2669710377287559
Wall time: 57.5 s


---
# Hyperparameter Tuning

In [46]:
import optuna
from optuna.samplers import TPESampler

# 1. Random Forest

In [58]:
%%time

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 35)
    n_estimators = trial.suggest_int("n_estimators", 700, 1500)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 5)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 100,10000)
    max_features = trial.suggest_uniform('max_features', 0.1, 0.9)
    model = RandomForestRegressor( 
        max_depth=max_depth,
        n_estimators = n_estimators,
        min_samples_split = min_samples_split,
        max_leaf_nodes = max_leaf_nodes,
        max_features = max_features,
        random_state=1999,
        bootstrap = True,
        n_jobs = -1
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_trn, y_trn)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=40)

rf_params = study.best_params
rf_params['random_state'] = 1999
rf = RandomForestRegressor(**rf_params)
rf.fit(X_trn, y_trn)
preds = rf.predict(X_val)
print('Optimized RF RMSLE', np.sqrt(mean_squared_error(y_val, preds)))

[I 2020-10-24 12:10:10,308] A new study created in memory with name: no-name-ee35fa3a-9a05-4ba2-88a5-c89b769aadab
[I 2020-10-24 12:10:14,611] Trial 0 finished with value: 0.5857039572050123 and parameters: {'max_depth': 2, 'n_estimators': 1463, 'min_samples_split': 5, 'max_leaf_nodes': 9325, 'max_features': 0.5988509574287779}. Best is trial 0 with value: 0.5857039572050123.
[I 2020-10-24 12:10:24,106] Trial 1 finished with value: 0.2835654202084174 and parameters: {'max_depth': 25, 'n_estimators': 770, 'min_samples_split': 2, 'max_leaf_nodes': 6844, 'max_features': 0.31812503566409056}. Best is trial 1 with value: 0.2835654202084174.
[I 2020-10-24 12:10:29,859] Trial 2 finished with value: 0.4949274215231798 and parameters: {'max_depth': 3, 'n_estimators': 1186, 'min_samples_split': 5, 'max_leaf_nodes': 2322, 'max_features': 0.5544356488751458}. Best is trial 1 with value: 0.2835654202084174.
[I 2020-10-24 12:10:34,843] Trial 3 finished with value: 0.2958470716175937 and parameters: {

Optimized RF RMSLE 0.28227506010457565
Wall time: 20min 13s


## 2. LGBM

In [63]:
%%time

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 2, 40)
    n_estimators = trial.suggest_int("n_estimators", 700, 2000)
    learning_rate = trial.suggest_uniform('learning_rate', 0.1, 1)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 0.9)
    num_leaves = trial.suggest_int("num_leaves", 2, 500)
    #min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_alpha = trial.suggest_uniform("reg_alpha", 0.1, 0.9)
    reg_lambda = trial.suggest_uniform("reg_lambda", 0.1, 0.9)
    model = LGBMRegressor(
        max_depth=max_depth,
        n_estimators = n_estimators,
        learning_rate=learning_rate, 
        colsample_bytree = colsample_bytree,
        num_leaves=num_leaves, 
        reg_alpha = reg_alpha,
        reg_lambda = reg_lambda,
        #min_child_samples=min_child_samples,
        random_state=1999,
        n_jobs = -3
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=60)

lgb_params = study.best_params
lgb_params['random_state'] = 1999
lgb = LGBMRegressor(**lgb_params)
lgb.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
preds = lgb.predict(X_val)
print('Optimized LGBM RMSLE', np.sqrt(mean_squared_error(y_val, preds)))

[I 2020-10-24 12:32:40,194] A new study created in memory with name: no-name-defb7de4-c882-42c0-b173-524036bd1cda
[I 2020-10-24 12:32:41,391] Trial 0 finished with value: 0.2954044505998161 and parameters: {'max_depth': 2, 'n_estimators': 1535, 'learning_rate': 0.8721510558604811, 'colsample_bytree': 0.7778013910273004, 'num_leaves': 213, 'reg_alpha': 0.6167152904533248, 'reg_lambda': 0.45006976901015405}. Best is trial 0 with value: 0.2954044505998161.
[I 2020-10-24 12:32:42,322] Trial 1 finished with value: 0.2840497715875627 and parameters: {'max_depth': 8, 'n_estimators': 1300, 'learning_rate': 0.3453906651221019, 'colsample_bytree': 0.4821320938570799, 'num_leaves': 41, 'reg_alpha': 0.5231159358023236, 'reg_lambda': 0.5544356488751458}. Best is trial 1 with value: 0.2840497715875627.
[I 2020-10-24 12:32:45,209] Trial 2 finished with value: 0.276234921574609 and parameters: {'max_depth': 39, 'n_estimators': 1237, 'learning_rate': 0.16393245237809825, 'colsample_bytree': 0.169703439

[I 2020-10-24 12:35:37,116] Trial 46 finished with value: 0.28694790255484437 and parameters: {'max_depth': 13, 'n_estimators': 1351, 'learning_rate': 0.33751367699090135, 'colsample_bytree': 0.2401594308102682, 'num_leaves': 383, 'reg_alpha': 0.7568753719260112, 'reg_lambda': 0.7742938514867925}. Best is trial 14 with value: 0.2730527748346632.
[I 2020-10-24 12:35:43,784] Trial 47 finished with value: 0.2718213326536507 and parameters: {'max_depth': 30, 'n_estimators': 1054, 'learning_rate': 0.10417981242620984, 'colsample_bytree': 0.29385095448022197, 'num_leaves': 337, 'reg_alpha': 0.7991573833877426, 'reg_lambda': 0.6551007927065132}. Best is trial 47 with value: 0.2718213326536507.
[I 2020-10-24 12:35:49,051] Trial 48 finished with value: 0.2712419066362349 and parameters: {'max_depth': 31, 'n_estimators': 768, 'learning_rate': 0.10395358602462655, 'colsample_bytree': 0.3004582369227073, 'num_leaves': 332, 'reg_alpha': 0.7969567856974819, 'reg_lambda': 0.6432689443285323}. Best is

Optimized LGBM RMSLE 0.2711500634901012
Wall time: 4min 9s


## 3. XGBoost

In [64]:
%%time

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 7, 15)
    n_estimators = trial.suggest_int("n_estimators", 500, 1500)
    learning_rate = trial.suggest_uniform('learning_rate', 0.1, 1)
    subsample = trial.suggest_uniform('subsample', 0.1, 0.99)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 0.9)
    colsample_bylevel = trial.suggest_uniform('colsample_bylevel', 0.1, 0.9)
    #num_leaves = trial.suggest_int("num_leaves", 2, 5000)
    #min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_alpha = trial.suggest_int("reg_alpha", 1, 10)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 10)
    model = XGBRegressor(
        max_depth = max_depth,
        n_estimators = n_estimators,
        learning_rate=learning_rate, 
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        colsample_bylevel = colsample_bylevel,
        #num_leaves=num_leaves, 
        #min_child_samples=min_child_samples,
        random_state=0,
        n_jobs = -3
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_trn, y_trn, eval_set = [ (X_val, y_val)], early_stopping_rounds = 50, verbose = False)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)

xgb_params = study.best_params
xgb_params['random_state'] = 0
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_trn, y_trn, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)
preds = xgb.predict(X_val)
print('Optimized XGB RMSLE', np.sqrt(mean_squared_error(y_val, preds)))

[I 2020-10-24 12:36:49,216] A new study created in memory with name: no-name-c2309e32-8278-43a2-85e9-073a8d0018b8
[I 2020-10-24 12:36:51,558] Trial 0 finished with value: 0.3484254148843745 and parameters: {'max_depth': 12, 'n_estimators': 692, 'learning_rate': 0.6424870384644795, 'subsample': 0.5849460328672382, 'colsample_bytree': 0.4389238394711238, 'colsample_bylevel': 0.6167152904533248, 'reg_alpha': 5, 'reg_lambda': 8}. Best is trial 0 with value: 0.3484254148843745.
[I 2020-10-24 12:36:53,231] Trial 1 finished with value: 0.39014295097313234 and parameters: {'max_depth': 13, 'n_estimators': 972, 'learning_rate': 0.9672964844509263, 'subsample': 0.4412629517549421, 'colsample_bytree': 0.7333800304661316, 'colsample_bylevel': 0.5231159358023236, 'reg_alpha': 9, 'reg_lambda': 2}. Best is trial 0 with value: 0.3484254148843745.
[I 2020-10-24 12:36:59,233] Trial 2 finished with value: 0.3117214643345071 and parameters: {'max_depth': 12, 'n_estimators': 1037, 'learning_rate': 0.163932

[I 2020-10-24 12:40:52,985] Trial 44 finished with value: 0.27951169656618036 and parameters: {'max_depth': 10, 'n_estimators': 1219, 'learning_rate': 0.2465995758376201, 'subsample': 0.9408529214823024, 'colsample_bytree': 0.42031961409098134, 'colsample_bylevel': 0.7819678687376859, 'reg_alpha': 5, 'reg_lambda': 1}. Best is trial 21 with value: 0.2681921717707495.
[I 2020-10-24 12:41:01,103] Trial 45 finished with value: 0.26939303241529095 and parameters: {'max_depth': 10, 'n_estimators': 1399, 'learning_rate': 0.10084507800598488, 'subsample': 0.8473951465943335, 'colsample_bytree': 0.5912419067918427, 'colsample_bylevel': 0.8931271251942052, 'reg_alpha': 3, 'reg_lambda': 3}. Best is trial 21 with value: 0.2681921717707495.
[I 2020-10-24 12:41:05,669] Trial 46 finished with value: 0.2782779708034813 and parameters: {'max_depth': 10, 'n_estimators': 1500, 'learning_rate': 0.1974721828618271, 'subsample': 0.8503415148024464, 'colsample_bytree': 0.6078716081527894, 'colsample_bylevel'

Optimized XGB RMSLE 0.27543876908577986
Wall time: 4min 44s


## 4. Catboost

In [65]:
%%time

def create_model(trial):
    max_depth = trial.suggest_int("max_depth", 7, 13)
    n_estimators = trial.suggest_int("n_estimators", 1000, 2000)
    learning_rate = trial.suggest_uniform('learning_rate', 0.1, 1)
    rsm = trial.suggest_uniform('rsm', 0.1, 0.99)
    #num_leaves = trial.suggest_int("num_leaves", 2, 5000)
    #min_child_samples = trial.suggest_int('min_child_samples', 3, 200)
    reg_lambda = trial.suggest_int("reg_lambda", 1, 10)
    model = CatBoostRegressor(
        max_depth = max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        rsm = rsm,
        reg_lambda=reg_lambda,
        cat_features=cat_features,
        #num_leaves=num_leaves, 
        #min_child_samples=min_child_samples,
        random_state=1999,
        verbose=False
    )
    return model

sampler = TPESampler(seed=0)
def objective(trial):
    model = create_model(trial)
    model.fit(X_trn, y_trn, eval_set = [ (X_val, y_val)], early_stopping_rounds = 50, verbose = False)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val,preds))
    return score

study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=20)

cat_params = study.best_params
cat_params['random_state'] = 1999
cat = CatBoostRegressor(**cat_params)
cat.fit(X_trn, y_trn, eval_set = [ (X_val, y_val)], early_stopping_rounds = 50, verbose = False)
preds = cat.predict(X_val)
print('Optimized CAT RMLSE',np.sqrt(mean_squared_error(y_val, preds)))

[I 2020-10-24 12:41:33,482] A new study created in memory with name: no-name-25fe3206-0aab-4ffb-be3b-9d5ecbe6d9a3
[I 2020-10-24 12:41:45,454] Trial 0 finished with value: 0.3019479905736397 and parameters: {'max_depth': 11, 'n_estimators': 1559, 'learning_rate': 0.7436704297351775, 'rsm': 0.636459404703763, 'reg_lambda': 4}. Best is trial 0 with value: 0.3019479905736397.
[I 2020-10-24 12:41:57,073] Trial 1 finished with value: 0.2934580845103464 and parameters: {'max_depth': 8, 'n_estimators': 1723, 'learning_rate': 0.6813047017599905, 'rsm': 0.48945261802379636, 'reg_lambda': 7}. Best is trial 1 with value: 0.2934580845103464.
[I 2020-10-24 12:42:16,268] Trial 2 finished with value: 0.2748916311581521 and parameters: {'max_depth': 7, 'n_estimators': 1600, 'learning_rate': 0.3453906651221019, 'rsm': 0.5251219544160014, 'reg_lambda': 8}. Best is trial 2 with value: 0.2748916311581521.
[I 2020-10-24 12:43:45,235] Trial 3 finished with value: 0.2882239729912589 and parameters: {'max_dept

Optimized CAT RMLSE 0.27653013958429956
Wall time: 9min 58s


In [72]:
%%time

models = {'lgb' : lgb, 'rf' : rf, 'xgb' : xgb, 'cat' : cat}

for name,model in models.items():
    error = predict(model, name)
    print(f'Error for {name} is {error}')

Error for lgb is 0.2711500634901012
Error for rf is 0.28227506010457565
Error for xgb is 0.27543876908577986
Error for cat is 0.27653013958429956
Wall time: 3min 34s


In [None]:
Error for lgb is 0.2715683553005204
Error for rf is 0.2821737840690106
Error for xgb is 0.2703749902764792
Error for cat is 0.2730301229971128

---
# All Tuned Models

### 1. LGBM Models :

In [370]:
# 1. Manual Tuned
lgb_1 = LGBMRegressor(random_state=1999,n_estimators=1000, learning_rate=0.13,num_leaves=70,max_depth=31,
               reg_lambda=0.3, reg_alpha = 0.7)

In [371]:
# 2. Optuna Tuned
params = {'max_depth': 29, 'n_estimators': 868, 'learning_rate': 0.10130592168165514, 'colsample_bytree': 0.29840872430993026,
          'num_leaves': 338, 'reg_alpha': 0.7919788672424208, 'reg_lambda': 0.5736739628502263}

lgb_2 = LGBMRegressor(**params)

In [372]:
params = {'max_depth': 31, 'n_estimators': 768, 'learning_rate': 0.10395358602462655, 'colsample_bytree': 0.3004582369227073,
          'num_leaves': 332, 'reg_alpha': 0.7969567856974819, 'reg_lambda': 0.6432689443285323}

lgb_3 = LGBMRegressor(**params)

---
### 2. XGBoost Models

In [373]:
# XGBoost
params = {'max_depth': 9, 'n_estimators': 500,'learning_rate': 0.1, 'booster' : 'gbtree', 'n_jobs' : -1,
         'subsample' : 0.9, 'colsample_bytree' : 0.8, 'colsample_bylevel' : 0.6, 'random_state' : 0}
xgb_1 = XGBRegressor(**params)

In [374]:
params = {'max_depth': 8, 'n_estimators': 1156, 'learning_rate': 0.1282423144462752,
          'subsample': 0.8583044649709827,'colsample_bytree': 0.39430648031413884,
          'colsample_bylevel': 0.439284444843544,'random_state' : 0}

xgb_2 = XGBRegressor(**params)

In [375]:
params = {'max_depth': 9, 'n_estimators': 1329, 'learning_rate': 0.10067225176673156, 
          'subsample': 0.9010792397620144, 'colsample_bytree': 0.4501213056757911, 
          'colsample_bylevel': 0.75993128190449555, 'random_state' : 0}
xgb_3 = XGBRegressor(**params)

In [376]:
# XGBoost
params = {'max_depth': 9, 'n_estimators': 2000,'learning_rate': 0.1, 'booster' : 'gbtree', 'n_jobs' : -1,
         'subsample' : 0.9, 'colsample_bytree' : 0.8, 'colsample_bylevel' : 0.6, 'random_state' : 0}
xgb_4 = XGBRegressor(**params)

### 3. RF Models

In [377]:
params = {'max_depth': 30, 'n_estimators': 2000, 'min_samples_split' : 2,'max_features' : 'sqrt', 'max_leaf_nodes' : 8000,
          'bootstrap' : True,'random_state' : 1999, 'n_jobs' : -1}

rf_1 = RandomForestRegressor(**params)

In [378]:
params = {'max_depth': 31, 'n_estimators': 1317, 'min_samples_split': 2, 'max_leaf_nodes': 6653, 
          'max_features': 0.6297197869507615, 'bootstrap' : True,'random_state' : 1999, 'n_jobs' : -1}

rf_2 = RandomForestRegressor(**params)

### 4. Catboost Models

In [379]:
cat_features = ['UNDER_CONSTRUCTION', 'RERA', 'BHK_NO.', 'BHK_OR_RK',
                'READY_TO_MOVE', 'RESALE', 'City','Address']

In [380]:
params = {
    'learning_rate' : 0.1,'max_depth' : 10,'n_estimators' : 2000,
    'random_state' : 1999,'reg_lambda' : 6
}
cat_1 = CatBoostRegressor( **params,cat_features=cat_features,)

In [381]:
params = {'max_depth': 9, 'n_estimators': 1233, 'learning_rate': 0.11644235072217447, 
          'rsm': 0.25205222374124897, 'reg_lambda': 3}
cat_2 = CatBoostRegressor( **params,cat_features=cat_features,)

### 5. Bagging Models

In [382]:
bag_1 = BaggingRegressor(base_estimator=lgb_1,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

In [383]:
bag_2 = BaggingRegressor(base_estimator=lgb_2,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

In [384]:
bag_3 = BaggingRegressor(base_estimator=xgb_1,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

In [385]:
bag_4 = BaggingRegressor(base_estimator=xgb_3,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

In [386]:
bag_5 = BaggingRegressor(base_estimator=xgb_4,n_estimators=20,max_samples=0.99,max_features=0.99,
                       bootstrap=True,n_jobs=-1,random_state=0,verbose=0,)

---
# Predicting With All Models

#### For Storing All OOFs and Preds

In [388]:
train_new = train_proc[[target, 'RERA']].copy()
test_new = test_proc[[target, 'RERA']].copy()

#### 1. LGBM

In [None]:
%%time
lgb_1_oofs, lgb_1_preds = cross_val(lgb_1, train_proc, test_proc, features, 'lgb')

In [390]:
%%time
lgb_2_oofs, lgb_2_preds = cross_val(lgb_2, train_proc, test_proc, features, 'lgb')

In [391]:
%%time
lgb_3_oofs, lgb_3_preds = cross_val(lgb_3, train_proc, test_proc, features, 'lgb')

In [392]:
train_new['lgb_1'] = lgb_1_oofs
test_new['lgb_1'] = lgb_1_preds

train_new['lgb_2'] = lgb_2_oofs
test_new['lgb_2'] = lgb_2_preds

train_new['lgb_3'] = lgb_3_oofs
test_new['lgb_3'] = lgb_3_preds

#### 2. XGBoost

In [None]:
%%time
xgb_1_oofs, xgb_1_preds = cross_val(xgb_1, train_proc, test_proc, features, 'xgb')

In [None]:
%%time
xgb_2_oofs, xgb_2_preds = cross_val(xgb_2, train_proc, test_proc, features, 'xgb')

In [None]:
%%time
xgb_3_oofs, xgb_3_preds = cross_val(xgb_3, train_proc, test_proc, features, 'xgb')

In [None]:
%%time
xgb_4_oofs, xgb_4_preds = cross_val(xgb_4, train_proc, test_proc, features, 'xgb')

In [None]:
train_new['xgb_1'] = xgb_1_oofs
test_new['xgb_1'] = xgb_1_preds

train_new['xgb_2'] = xgb_2_oofs
test_new['xgb_2'] = xgb_2_preds

train_new['xgb_3'] = xgb_3_oofs
test_new['xgb_3'] = xgb_3_preds

train_new['xgb_4'] = xgb_4_oofs
test_new['xgb_4'] = xgb_4_preds

#### 3. Random Forest

In [398]:
%%time
rf_1_oofs, rf_1_preds = normal_cross_val(rf_1, train_proc, test_proc, features)

In [None]:
%%time
rf_2_oofs, rf_2_preds = normal_cross_val(rf_2, train_proc, test_proc, features)

In [400]:
train_new['rf_1'] = rf_1_oofs
test_new['rf_1'] = rf_1_preds

train_new['rf_2'] = rf_2_oofs
test_new['rf_2'] = rf_2_preds


#### 4. CATBoost

In [401]:
%%time
cat_1_oofs, cat_1_preds = cross_val(cat_1, train_proc, test_proc, features, 'cat')

In [402]:
%%time
cat_2_oofs, cat_2_preds = cross_val(cat_2, train_proc, test_proc, features, 'cat')

In [403]:
train_new['cat_1'] = cat_1_oofs
test_new['cat_1'] = cat_1_preds

train_new['cat_2'] = cat_2_oofs
test_new['cat_2'] = cat_2_preds


#### 5. Bagging

In [None]:
%%time
bag_1_oofs, bag_1_preds = normal_cross_val(bag_1, train_proc, test_proc, features)

In [None]:
%%time
bag_2_oofs, bag_2_preds = normal_cross_val(bag_2, train_proc, test_proc, features)

In [None]:
%%time
bag_3_oofs, bag_3_preds = normal_cross_val(bag_3, train_proc, test_proc, features)

In [407]:
%%time
bag_4_oofs, bag_4_preds = normal_cross_val(bag_4, train_proc, test_proc, features)

In [408]:
%%time
bag_5_oofs, bag_5_preds = normal_cross_val(bag_5, train_proc, test_proc, features)

In [409]:
train_new['bag_1'] = bag_1_oofs
test_new['bag_1'] = bag_1_preds

train_new['bag_2'] = bag_2_oofs
test_new['bag_2'] = bag_2_preds

train_new['bag_3'] = bag_3_oofs
test_new['bag_3'] = bag_3_preds

train_new['bag_4'] = bag_4_oofs
test_new['bag_4'] = bag_4_preds

train_new['bag_5'] = bag_5_oofs
test_new['bag_5'] = bag_5_preds

---
# Level 1 Stacking

In [410]:
ens_features = [c for c in train_new.columns if c not in [target, 'RERA']]

In [None]:
%%time
level_1_lgb_oofs, level_1_lgb_preds = cross_val(LGBMRegressor(), train_new, test_new, ens_features, 'lgb')

In [None]:
%%time
level_1_xgb_oofs, level_1_xgb_preds = cross_val(xgb_1, train_new, test_new, ens_features, 'xgb')

In [None]:
%%time
level_1_rf_1_oofs, level_1_rf_1_preds = normal_cross_val(rf_1, train_new, test_new, ens_features)

In [None]:
cat_ens = CatBoostRegressor(**params, verbose = False)

level_1_cat_oofs, level_1_cat_preds = cross_val(cat_ens, train_new, test_new, ens_features, 'cat')

In [None]:
%%time
level_1_lgb_bag_oofs, level_1_lgb_bag_preds = normal_cross_val(BaggingRegressor(base_estimator = LGBMRegressor()), train_new, test_new, ens_features)

In [None]:
%%time
level_1_xgb_bag_oofs, level_1_xgb_bag_preds = normal_cross_val(BaggingRegressor(base_estimator = xgb_1), train_new, test_new, ens_features)

In [None]:
%%time
params = {'max_depth': 30, 'n_estimators': 1000, 'min_samples_split' : 2,'max_features' : 'sqrt', 'max_leaf_nodes' : 8000,
          'bootstrap' : True,'random_state' : 1999, 'n_jobs' : -1}

rf = RandomForestRegressor(**params)

level_1_rf_bag_oofs, level_1_rf_bag_preds = normal_cross_val(BaggingRegressor(base_estimator = rf), train_new, test_new, ens_features)

In [90]:
ens_train_new = train_proc[[target, 'RERA']].copy()
ens_test_new = test_proc[[target, 'RERA']].copy()

ens_train_new['lgb'] = level_1_lgb_oofs
ens_test_new['lgb'] = level_1_lgb_preds

ens_train_new['xgb'] = level_1_xgb_oofs
ens_test_new['xgb'] = level_1_xgb_preds

ens_train_new['cat'] = level_1_cat_oofs
ens_test_new['cat'] = level_1_cat_preds

ens_train_new['rf'] = level_1_rf_1_oofs
ens_test_new['rf'] = level_1_rf_1_preds

ens_train_new['lgb_bag'] = level_1_lgb_bag_oofs
ens_test_new['lgb_bag'] = level_1_lgb_bag_preds

ens_train_new['rf1_bag'] = level_1_rf_bag_oofs
ens_test_new['rf1_bag'] = level_1_rf_bag_preds

ens_train_new['xgb_bag'] = level_1_xgb_bag_oofs
ens_test_new['xgb_bag'] = level_1_xgb_bag_preds

#ens_train_new['cat_bag'] = level_1_cat_bag_oofs
#ens_test_new['cat_bag'] = level_1_cat_bag_preds

---
# Level 2 Stacking

In [91]:
ens_lvl_2_features = [c for c in ens_train_new.columns if c not in [target, 'RERA']]

In [431]:
from sklearn.linear_model import LinearRegression,Ridge,ARDRegression,SGDRegressor
clf = LinearRegression()

ens_linear_oofs, ens_linear_preds = normal_cross_val(clf, ens_train_new, ens_test_new, ens_lvl_2_features)



 Root Log Mean Squared Error for Validation set is : 0.25179921498193075


 Root Log Mean Squared Error for Validation set is : 0.2535219504056735


 Root Log Mean Squared Error for Validation set is : 0.2739281827903636


 Root Log Mean Squared Error for Validation set is : 0.2558337825788035


 Root Log Mean Squared Error for Validation set is : 0.26326715456355954

\Root Log Mean Squared Error for oofs is 0.2597971122339046


In [432]:
clf = Ridge()

ens_ridge_oofs, ens_ridge_preds = normal_cross_val(clf, ens_train_new, ens_test_new, ens_lvl_2_features)



 Root Log Mean Squared Error for Validation set is : 0.25175475850941487


 Root Log Mean Squared Error for Validation set is : 0.2535203039104475


 Root Log Mean Squared Error for Validation set is : 0.2739348631855066


 Root Log Mean Squared Error for Validation set is : 0.25583654710508374


 Root Log Mean Squared Error for Validation set is : 0.2632423637618381

\Root Log Mean Squared Error for oofs is 0.25978510183717146


In [433]:
clf = ARDRegression(normalize = True)

ens_ARD_oofs, ens_ARD_preds = normal_cross_val(clf, ens_train_new, ens_test_new, ens_lvl_2_features)



 Root Log Mean Squared Error for Validation set is : 0.2517881995349767


 Root Log Mean Squared Error for Validation set is : 0.2535303430636621


 Root Log Mean Squared Error for Validation set is : 0.27391173837391397


 Root Log Mean Squared Error for Validation set is : 0.255989246938876


 Root Log Mean Squared Error for Validation set is : 0.26326071963875886

\Root Log Mean Squared Error for oofs is 0.2598224685770336


In [434]:
preds = ens_linear_preds*0.60 + ens_ridge_preds*0.20 + ens_ARD_preds*0.20

sample_sub['TARGET(PRICE_IN_LACS)']=np.abs((np.exp(preds)-1))
sample_sub.to_csv(path + '\\Stacking.csv', index = False)