## CAT BOOST REGRESSOR

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import zipfile
import re
from scipy import stats
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt

<font color = blue> Once we have imported the necessary libraries, we are going to define our data path and then, load the data: </font>

In [2]:
data_path = './data/'

In [3]:
print('Loading Properties ...')
properties2016 = pd.read_csv(data_path + 'properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv(data_path + 'properties_2017.csv', low_memory = False)

Loading Properties ...


In [4]:
properties2016.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [5]:
properties2016.shape

(2985217, 58)

In [6]:
properties2017.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


In [7]:
properties2017.shape

(2985217, 58)

<font color = blue> We are going to specify the parameter 'parse_dates' to make the column 'transactiondate' in train files date formatted: </font>

In [8]:
print('Loading Train ...')
train2016 = pd.read_csv(data_path + 'train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train2017 = pd.read_csv(data_path + 'train_2017.csv', parse_dates=['transactiondate'], low_memory=False)

Loading Train ...


In [9]:
train2016.shape

(90275, 3)

In [10]:
train2017.shape

(77613, 3)

In [11]:
train2016.head()

Unnamed: 0,parcelid,logerror,transactiondate
0,11016594,0.0276,2016-01-01
1,14366692,-0.1684,2016-01-01
2,12098116,-0.004,2016-01-01
3,12643413,0.0218,2016-01-02
4,14432541,-0.005,2016-01-02


<font color = blue> Now we are going to parse dates to get one column for year, one for month, one for day and one for quarter: </font>

In [12]:
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 + df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

In [13]:
train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)

In [14]:
train2016.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter
0,11016594,0.0276,2016,1,1,1
1,14366692,-0.1684,2016,1,1,1
2,12098116,-0.004,2016,1,1,1
3,12643413,0.0218,2016,1,2,1
4,14432541,-0.005,2016,1,2,1


In [15]:
print('Loading Sample ...')
sample_submission = pd.read_csv(data_path + 'sample_submission.csv', low_memory = False)

Loading Sample ...


<font color = blue> Our sample submission will be a file with the parcel id for each property and six prediction columns for different dates (last 2016 quarter and last 2017 quarter): </font>

In [16]:
sample_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


<font color = blue> As we are going to apply the first model, cat boost, to the column for 201612 predictions, we remove the other columns: </font>

In [17]:
sample_submission_model_5 = sample_submission.drop(columns = ['201610', '201611', '201710', '201711', '201712'])
sample_submission_model_5.head()

Unnamed: 0,ParcelId,201612
0,10754147,0
1,10759547,0
2,10843547,0
3,10859147,0
4,10879947,0


<font color = blue> We will merge properties and train files to get all data grouped: </font>

In [18]:
print('Merging Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')

Merging Train with Properties ...


In [19]:
train2016.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,,,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016,1,1,1,1.0,,,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,,,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016,1,2,1,,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [20]:
train2016.shape

(90275, 63)

In [21]:
train2017.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,14297519,0.025595,2017,13,1,5,,,,3.5,...,,,485713.0,1023282.0,2016.0,537569.0,11013.72,,,60590630000000.0
1,17052889,0.055619,2017,13,1,5,,,,1.0,...,1.0,,88000.0,464000.0,2016.0,376000.0,5672.48,,,61110010000000.0
2,14186244,0.005383,2017,13,1,5,,,,2.0,...,1.0,,85289.0,564778.0,2016.0,479489.0,6488.3,,,60590220000000.0
3,12177905,-0.10341,2017,13,1,5,,,,3.0,...,,,108918.0,145143.0,2016.0,36225.0,1777.51,,,60373000000000.0
4,10887214,0.00694,2017,13,1,5,1.0,,,3.0,...,,,73681.0,119407.0,2016.0,45726.0,1533.89,,,60371240000000.0


In [22]:
train2017.shape

(77613, 63)

<font color = blue> As we should not use the 2016 tax values when predicting log errors against the 2016 log errors, we will omit this information for the 2016 predicitons columns: </font>

In [23]:
print('Tax Features 2017  ...')
train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan
train2017.iloc[:, train2017.columns.str.endswith('taxvaluedollarcnt')] = np.nan

Tax Features 2017  ...


In [24]:
train2017.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,14297519,0.025595,2017,13,1,5,,,,3.5,...,,,,,2016.0,,,,,60590630000000.0
1,17052889,0.055619,2017,13,1,5,,,,1.0,...,1.0,,,,2016.0,,,,,61110010000000.0
2,14186244,0.005383,2017,13,1,5,,,,2.0,...,1.0,,,,2016.0,,,,,60590220000000.0
3,12177905,-0.10341,2017,13,1,5,,,,3.0,...,,,,,2016.0,,,,,60373000000000.0
4,10887214,0.00694,2017,13,1,5,1.0,,,3.0,...,,,,,2016.0,,,,,60371240000000.0


<font color = blue> Now we will generate our training and test files to run the model, the training file will be all properties and train from 2016 and 2017, while for the test, we will merge the sample submission with the unique properties file: </font>

In [25]:
print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(sample_submission_model_5[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')


Concat Train 2016 & 2017 ...


In [26]:
train_df.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,,,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016,1,1,1,1.0,,,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,,,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016,1,2,1,,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [27]:
train_df.shape

(167888, 63)

In [28]:
test_df.head()

Unnamed: 0,ParcelId,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [29]:
test_df.shape

(2985217, 58)

<font color = blue> To optimize memory management, we are using garbage collections algorithms (GC), which solve reference cycles (when one or more objects are referencing each other) that reference counting cannot detect: </font>

In [30]:
del properties2016, properties2017, train2016, train2017
gc.collect();

<font color = blue> Let's do some feature engineering. We have to deal with missing values, where we will establish a threshold of 98% missing values to remove those fields, and also with features with one unique value and others that we do not want to use in our training: </font>

In [31]:
print('Missing data fields to remove ...')
missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print(exclude_missing)
print("We exclude: %s" % len(exclude_missing))

Missing data fields to remove ...
['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag', 'taxdelinquencyflag', 'taxdelinquencyyear']
We exclude: 15


In [32]:
print ("Remove features with one unique value ...")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print(exclude_unique)
print("We exclude: %s" % len(exclude_unique))

Remove features with one unique value ...
['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']
We exclude: 9


In [33]:
del num_rows, missing_perc_thresh
gc.collect();

In [34]:
print ("Define training features ...")
exclude_other = ['parcelid', 'logerror', 'propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other:
        train_features.append(c)
print(train_features)
print("We use these for training: %s" % len(train_features))

Define training features ...
['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'poolcnt', 'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock']
We use these for training: 45


<font color = blue> Now we have to deal with categorical features: </font>

In [35]:
print ("Define categorical features ...")
cat_feature_inds = []
cat_unique_thresh = 100
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'number' in c:
        cat_feature_inds.append(i)
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Define categorical features ...
Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'hashottuborspa', 'heatingorsystemtypeid', 'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcounty', 'assessmentyear']


<font color = blue> As we are going to use a tree based model, we will replace NaN values by '-999' so they do not interfere with proper data: </font>

In [36]:
print ("Replacing NaN values by -999 ...")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

Replacing NaN values by -999 ...


In [37]:
print ("Training model 5: Cat boost ...")
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

test_df['transactiondate'] = pd.Timestamp('2016-12-01') 
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

Training model 5: Cat boost ...
(167888, 45) (167888,)
(2985217, 45)


In [38]:
y_pred = 0.0
model = CatBoostRegressor(
    iterations = 690, learning_rate = 0.01,
    depth = 7, l2_leaf_reg = 5,
    loss_function = 'MAE',
    eval_metric = 'MAE',
    random_seed = i)
model.fit(
    X_train, y_train,
    cat_features = cat_feature_inds)
y_pred += model.predict(X_test)


0:	learn: 0.0690998	total: 679ms	remaining: 7m 48s
1:	learn: 0.0689989	total: 1.15s	remaining: 6m 34s
2:	learn: 0.0689224	total: 1.6s	remaining: 6m 5s
3:	learn: 0.0688491	total: 2.15s	remaining: 6m 8s
4:	learn: 0.0687887	total: 2.68s	remaining: 6m 6s
5:	learn: 0.0687335	total: 3.25s	remaining: 6m 10s
6:	learn: 0.0686937	total: 3.69s	remaining: 5m 59s
7:	learn: 0.0686615	total: 4.32s	remaining: 6m 8s
8:	learn: 0.0686262	total: 4.95s	remaining: 6m 14s
9:	learn: 0.0685969	total: 5.67s	remaining: 6m 25s
10:	learn: 0.0685699	total: 6.21s	remaining: 6m 23s
11:	learn: 0.0685479	total: 6.96s	remaining: 6m 33s
12:	learn: 0.0685209	total: 7.82s	remaining: 6m 47s
13:	learn: 0.0685034	total: 8.38s	remaining: 6m 44s
14:	learn: 0.0684822	total: 8.98s	remaining: 6m 44s
15:	learn: 0.0684666	total: 9.6s	remaining: 6m 44s
16:	learn: 0.0684525	total: 10.2s	remaining: 6m 42s
17:	learn: 0.0684339	total: 11.7s	remaining: 7m 15s
18:	learn: 0.0684207	total: 12.2s	remaining: 7m 10s
19:	learn: 0.0684067	total: 

157:	learn: 0.0678230	total: 1m 41s	remaining: 5m 42s
158:	learn: 0.0678228	total: 1m 42s	remaining: 5m 42s
159:	learn: 0.0678211	total: 1m 43s	remaining: 5m 42s
160:	learn: 0.0678201	total: 1m 44s	remaining: 5m 41s
161:	learn: 0.0678167	total: 1m 44s	remaining: 5m 40s
162:	learn: 0.0678140	total: 1m 45s	remaining: 5m 39s
163:	learn: 0.0678133	total: 1m 45s	remaining: 5m 38s
164:	learn: 0.0678099	total: 1m 45s	remaining: 5m 36s
165:	learn: 0.0678060	total: 1m 46s	remaining: 5m 35s
166:	learn: 0.0678044	total: 1m 46s	remaining: 5m 34s
167:	learn: 0.0678042	total: 1m 46s	remaining: 5m 32s
168:	learn: 0.0678018	total: 1m 47s	remaining: 5m 30s
169:	learn: 0.0678006	total: 1m 47s	remaining: 5m 29s
170:	learn: 0.0677999	total: 1m 48s	remaining: 5m 28s
171:	learn: 0.0677980	total: 1m 48s	remaining: 5m 26s
172:	learn: 0.0677945	total: 1m 48s	remaining: 5m 25s
173:	learn: 0.0677909	total: 1m 49s	remaining: 5m 24s
174:	learn: 0.0677886	total: 1m 49s	remaining: 5m 22s
175:	learn: 0.0677870	total:

311:	learn: 0.0675659	total: 3m 24s	remaining: 4m 7s
312:	learn: 0.0675647	total: 3m 24s	remaining: 4m 6s
313:	learn: 0.0675643	total: 3m 25s	remaining: 4m 5s
314:	learn: 0.0675618	total: 3m 25s	remaining: 4m 4s
315:	learn: 0.0675603	total: 3m 26s	remaining: 4m 4s
316:	learn: 0.0675591	total: 3m 26s	remaining: 4m 3s
317:	learn: 0.0675561	total: 3m 27s	remaining: 4m 2s
318:	learn: 0.0675553	total: 3m 28s	remaining: 4m 1s
319:	learn: 0.0675527	total: 3m 29s	remaining: 4m 2s
320:	learn: 0.0675523	total: 3m 30s	remaining: 4m 1s
321:	learn: 0.0675503	total: 3m 31s	remaining: 4m 1s
322:	learn: 0.0675474	total: 3m 31s	remaining: 4m
323:	learn: 0.0675466	total: 3m 32s	remaining: 3m 59s
324:	learn: 0.0675457	total: 3m 32s	remaining: 3m 58s
325:	learn: 0.0675440	total: 3m 33s	remaining: 3m 57s
326:	learn: 0.0675431	total: 3m 33s	remaining: 3m 56s
327:	learn: 0.0675420	total: 3m 34s	remaining: 3m 56s
328:	learn: 0.0675388	total: 3m 34s	remaining: 3m 55s
329:	learn: 0.0675359	total: 3m 35s	remaini

465:	learn: 0.0673782	total: 5m 32s	remaining: 2m 39s
466:	learn: 0.0673768	total: 5m 33s	remaining: 2m 39s
467:	learn: 0.0673766	total: 5m 34s	remaining: 2m 38s
468:	learn: 0.0673761	total: 5m 37s	remaining: 2m 39s
469:	learn: 0.0673751	total: 5m 39s	remaining: 2m 38s
470:	learn: 0.0673734	total: 5m 40s	remaining: 2m 38s
471:	learn: 0.0673730	total: 5m 41s	remaining: 2m 37s
472:	learn: 0.0673719	total: 5m 41s	remaining: 2m 36s
473:	learn: 0.0673709	total: 5m 42s	remaining: 2m 36s
474:	learn: 0.0673705	total: 5m 43s	remaining: 2m 35s
475:	learn: 0.0673698	total: 5m 44s	remaining: 2m 34s
476:	learn: 0.0673698	total: 5m 44s	remaining: 2m 33s
477:	learn: 0.0673692	total: 5m 45s	remaining: 2m 33s
478:	learn: 0.0673682	total: 5m 46s	remaining: 2m 32s
479:	learn: 0.0673678	total: 5m 47s	remaining: 2m 32s
480:	learn: 0.0673673	total: 5m 48s	remaining: 2m 31s
481:	learn: 0.0673666	total: 5m 49s	remaining: 2m 30s
482:	learn: 0.0673654	total: 5m 49s	remaining: 2m 29s
483:	learn: 0.0673651	total:

619:	learn: 0.0672158	total: 7m 37s	remaining: 51.6s
620:	learn: 0.0672153	total: 7m 37s	remaining: 50.9s
621:	learn: 0.0672139	total: 7m 38s	remaining: 50.1s
622:	learn: 0.0672129	total: 7m 38s	remaining: 49.4s
623:	learn: 0.0672116	total: 7m 39s	remaining: 48.6s
624:	learn: 0.0672097	total: 7m 40s	remaining: 47.9s
625:	learn: 0.0672090	total: 7m 40s	remaining: 47.1s
626:	learn: 0.0672070	total: 7m 41s	remaining: 46.4s
627:	learn: 0.0672063	total: 7m 42s	remaining: 45.6s
628:	learn: 0.0672058	total: 7m 43s	remaining: 44.9s
629:	learn: 0.0672042	total: 7m 43s	remaining: 44.2s
630:	learn: 0.0672024	total: 7m 44s	remaining: 43.4s
631:	learn: 0.0671995	total: 7m 45s	remaining: 42.8s
632:	learn: 0.0671995	total: 7m 45s	remaining: 42s
633:	learn: 0.0671994	total: 7m 46s	remaining: 41.2s
634:	learn: 0.0671988	total: 7m 47s	remaining: 40.5s
635:	learn: 0.0671979	total: 7m 47s	remaining: 39.7s
636:	learn: 0.0671975	total: 7m 48s	remaining: 39s
637:	learn: 0.0671957	total: 7m 49s	remaining: 38.

In [39]:
submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})
test_dates = {
    '201612': pd.Timestamp('2016-11-30')
}
for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred
    
submission.to_csv(data_path + 'Model_5_CatBoost.csv', float_format='%.6f',index=False)

Predicting for: 201612 ... 


In [40]:
predictions_model_5 = pd.read_csv(data_path + 'Model_5_CatBoost.csv')
predictions_model_5.head()

Unnamed: 0,ParcelId,201612
0,10754147,0.017949
1,10759547,0.012833
2,10843547,0.034269
3,10859147,0.032328
4,10879947,0.01511
