## CAT BOOST REGRESSOR

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import zipfile
import re
from scipy import stats
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt

<font color = blue> Once we have imported the necessary libraries, we are going to define our data path and then, load the data: </font>

In [2]:
data_path = './data/'

In [3]:
print('Loading Properties ...')
properties2016 = pd.read_csv(data_path + 'properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv(data_path + 'properties_2017.csv', low_memory = False)

Loading Properties ...


In [4]:
properties2016.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [5]:
properties2016.shape

(2985217, 58)

In [6]:
properties2017.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


In [7]:
properties2017.shape

(2985217, 58)

<font color = blue> We are going to specify the parameter 'parse_dates' to make the column 'transactiondate' in train files date formatted: </font>

In [8]:
print('Loading Train ...')
train2016 = pd.read_csv(data_path + 'train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train2017 = pd.read_csv(data_path + 'train_2017.csv', parse_dates=['transactiondate'], low_memory=False)

Loading Train ...


In [9]:
train2016.shape

(90275, 3)

In [10]:
train2017.shape

(77613, 3)

In [11]:
train2016.head()

Unnamed: 0,parcelid,logerror,transactiondate
0,11016594,0.0276,2016-01-01
1,14366692,-0.1684,2016-01-01
2,12098116,-0.004,2016-01-01
3,12643413,0.0218,2016-01-02
4,14432541,-0.005,2016-01-02


<font color = blue> Now we are going to parse dates to get one column for year, one for month, one for day and one for quarter: </font>

In [12]:
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 + df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

In [13]:
train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)

In [14]:
train2016.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter
0,11016594,0.0276,2016,1,1,1
1,14366692,-0.1684,2016,1,1,1
2,12098116,-0.004,2016,1,1,1
3,12643413,0.0218,2016,1,2,1
4,14432541,-0.005,2016,1,2,1


In [15]:
print('Loading Sample ...')
sample_submission = pd.read_csv(data_path + 'sample_submission.csv', low_memory = False)

Loading Sample ...


<font color = blue> Our sample submission will be a file with the parcel id for each property and six prediction columns for different dates (last 2016 quarter and last 2017 quarter): </font>

In [16]:
sample_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


<font color = blue> As we are going to apply the first model, cat boost, to the column for 201711 predictions, we remove the other columns: </font>

In [17]:
sample_submission_model_4 = sample_submission.drop(columns = ['201610', '201611', '201612', '201710', '201712'])
sample_submission_model_4.head()

Unnamed: 0,ParcelId,201711
0,10754147,0
1,10759547,0
2,10843547,0
3,10859147,0
4,10879947,0


<font color = blue> We will merge properties and train files to get all data grouped: </font>

In [18]:
print('Merging Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')

Merging Train with Properties ...


In [19]:
train2016.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,,,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016,1,1,1,1.0,,,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,,,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016,1,2,1,,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [20]:
train2016.shape

(90275, 63)

In [21]:
train2017.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,14297519,0.025595,2017,13,1,5,,,,3.5,...,,,485713.0,1023282.0,2016.0,537569.0,11013.72,,,60590630000000.0
1,17052889,0.055619,2017,13,1,5,,,,1.0,...,1.0,,88000.0,464000.0,2016.0,376000.0,5672.48,,,61110010000000.0
2,14186244,0.005383,2017,13,1,5,,,,2.0,...,1.0,,85289.0,564778.0,2016.0,479489.0,6488.3,,,60590220000000.0
3,12177905,-0.10341,2017,13,1,5,,,,3.0,...,,,108918.0,145143.0,2016.0,36225.0,1777.51,,,60373000000000.0
4,10887214,0.00694,2017,13,1,5,1.0,,,3.0,...,,,73681.0,119407.0,2016.0,45726.0,1533.89,,,60371240000000.0


In [22]:
train2017.shape

(77613, 63)

<font color = blue> Now we will generate our training and test files to run the model, the training file will be all properties and train from 2016 and 2017, while for the test, we will merge the sample submission with the unique properties file: </font>

In [23]:
print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(sample_submission_model_4[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')


Concat Train 2016 & 2017 ...


In [24]:
train_df.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,,,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016,1,1,1,1.0,,,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,,,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016,1,2,1,,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [25]:
train_df.shape

(167888, 63)

In [26]:
test_df.head()

Unnamed: 0,ParcelId,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [27]:
test_df.shape

(2985217, 58)

<font color = blue> To optimize memory management, we are using garbage collections algorithms (GC), which solve reference cycles (when one or more objects are referencing each other) that reference counting cannot detect: </font>

In [28]:
del properties2016, properties2017, train2016, train2017
gc.collect();

<font color = blue> Let's do some feature engineering. We have to deal with missing values, where we will establish a threshold of 98% missing values to remove those fields, and also with features with one unique value and others that we do not want to use in our training: </font>

In [29]:
print('Missing data fields to remove ...')
missing_perc_thresh = 0.98
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print(exclude_missing)
print("We exclude: %s" % len(exclude_missing))

Missing data fields to remove ...
['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet6', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft26', 'fireplaceflag']
We exclude: 13


In [30]:
print ("Remove features with one unique value ...")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print(exclude_unique)
print("We exclude: %s" % len(exclude_unique))

Remove features with one unique value ...
['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']
We exclude: 9


In [31]:
del num_rows, missing_perc_thresh
gc.collect();

In [32]:
print ("Define training features ...")
exclude_other = ['parcelid', 'logerror', 'propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other:
        train_features.append(c)
print(train_features)
print("We use these for training: %s" % len(train_features))

Define training features ...
['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet15', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'hashottuborspa', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'poolcnt', 'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yardbuildingsqft17', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'taxdelinquencyflag', 'taxdelinquencyyear', 'censustractandblock']
We use these for training: 47


<font color = blue> Now we have to deal with categorical features: </font>

In [33]:
print ("Define categorical features ...")
cat_feature_inds = []
cat_unique_thresh = 100
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'number' in c:
        cat_feature_inds.append(i)
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Define categorical features ...
Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'hashottuborspa', 'heatingorsystemtypeid', 'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcounty', 'assessmentyear', 'taxdelinquencyflag', 'taxdelinquencyyear']


<font color = blue> As we are going to use a tree based model, we will replace NaN values by '-999' so they do not interfere with proper data: </font>

In [34]:
print ("Replacing NaN values by -999 ...")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

Replacing NaN values by -999 ...


In [35]:
print ("Training model 4: Cat boost ...")
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

test_df['transactiondate'] = pd.Timestamp('2016-12-01') 
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

Training model 4: Cat boost ...
(167888, 47) (167888,)
(2985217, 47)


In [36]:
y_pred = 0.0
model = CatBoostRegressor(
    iterations = 720, learning_rate = 0.03,
    depth = 8, l2_leaf_reg = 5,
    loss_function = 'MAE',
    eval_metric = 'MAE',
    random_seed = i)
model.fit(
    X_train, y_train,
    cat_features = cat_feature_inds)
y_pred += model.predict(X_test)


0:	learn: 0.0688926	total: 828ms	remaining: 9m 55s
1:	learn: 0.0687281	total: 1.64s	remaining: 9m 47s
2:	learn: 0.0685900	total: 2.38s	remaining: 9m 29s
3:	learn: 0.0685118	total: 3.04s	remaining: 9m 4s
4:	learn: 0.0684522	total: 3.88s	remaining: 9m 15s
5:	learn: 0.0683974	total: 4.64s	remaining: 9m 12s
6:	learn: 0.0683481	total: 5.28s	remaining: 8m 57s
7:	learn: 0.0683024	total: 6.02s	remaining: 8m 56s
8:	learn: 0.0682738	total: 6.7s	remaining: 8m 49s
9:	learn: 0.0682541	total: 7.39s	remaining: 8m 44s
10:	learn: 0.0682011	total: 8.21s	remaining: 8m 49s
11:	learn: 0.0681781	total: 9.77s	remaining: 9m 36s
12:	learn: 0.0681424	total: 10.7s	remaining: 9m 42s
13:	learn: 0.0681191	total: 11.5s	remaining: 9m 40s
14:	learn: 0.0680968	total: 12.3s	remaining: 9m 37s
15:	learn: 0.0680778	total: 13s	remaining: 9m 31s
16:	learn: 0.0680502	total: 13.7s	remaining: 9m 25s
17:	learn: 0.0680246	total: 14.4s	remaining: 9m 22s
18:	learn: 0.0679996	total: 15.1s	remaining: 9m 15s
19:	learn: 0.0679821	total

157:	learn: 0.0669578	total: 2m 9s	remaining: 7m 38s
158:	learn: 0.0669551	total: 2m 9s	remaining: 7m 36s
159:	learn: 0.0669499	total: 2m 9s	remaining: 7m 34s
160:	learn: 0.0669421	total: 2m 10s	remaining: 7m 32s
161:	learn: 0.0669383	total: 2m 10s	remaining: 7m 30s
162:	learn: 0.0669353	total: 2m 11s	remaining: 7m 28s
163:	learn: 0.0669215	total: 2m 11s	remaining: 7m 26s
164:	learn: 0.0669189	total: 2m 12s	remaining: 7m 24s
165:	learn: 0.0669126	total: 2m 12s	remaining: 7m 22s
166:	learn: 0.0669060	total: 2m 13s	remaining: 7m 20s
167:	learn: 0.0669054	total: 2m 13s	remaining: 7m 18s
168:	learn: 0.0668979	total: 2m 14s	remaining: 7m 17s
169:	learn: 0.0668819	total: 2m 14s	remaining: 7m 15s
170:	learn: 0.0668775	total: 2m 15s	remaining: 7m 16s
171:	learn: 0.0668760	total: 2m 16s	remaining: 7m 14s
172:	learn: 0.0668704	total: 2m 16s	remaining: 7m 13s
173:	learn: 0.0668609	total: 2m 17s	remaining: 7m 11s
174:	learn: 0.0668585	total: 2m 17s	remaining: 7m 9s
175:	learn: 0.0668535	total: 2m 

311:	learn: 0.0663563	total: 3m 44s	remaining: 4m 53s
312:	learn: 0.0663420	total: 3m 45s	remaining: 4m 52s
313:	learn: 0.0663415	total: 3m 45s	remaining: 4m 51s
314:	learn: 0.0663351	total: 3m 45s	remaining: 4m 50s
315:	learn: 0.0663281	total: 3m 46s	remaining: 4m 49s
316:	learn: 0.0663267	total: 3m 46s	remaining: 4m 48s
317:	learn: 0.0663218	total: 3m 47s	remaining: 4m 47s
318:	learn: 0.0663144	total: 3m 47s	remaining: 4m 46s
319:	learn: 0.0663117	total: 3m 48s	remaining: 4m 45s
320:	learn: 0.0663081	total: 3m 48s	remaining: 4m 44s
321:	learn: 0.0663075	total: 3m 50s	remaining: 4m 44s
322:	learn: 0.0663055	total: 3m 50s	remaining: 4m 43s
323:	learn: 0.0663047	total: 3m 51s	remaining: 4m 42s
324:	learn: 0.0662922	total: 3m 51s	remaining: 4m 41s
325:	learn: 0.0662798	total: 3m 52s	remaining: 4m 40s
326:	learn: 0.0662784	total: 3m 52s	remaining: 4m 39s
327:	learn: 0.0662755	total: 3m 53s	remaining: 4m 38s
328:	learn: 0.0662706	total: 3m 53s	remaining: 4m 37s
329:	learn: 0.0662655	total:

464:	learn: 0.0656717	total: 4m 59s	remaining: 2m 44s
465:	learn: 0.0656678	total: 5m	remaining: 2m 43s
466:	learn: 0.0656574	total: 5m	remaining: 2m 42s
467:	learn: 0.0656522	total: 5m	remaining: 2m 42s
468:	learn: 0.0656449	total: 5m 1s	remaining: 2m 41s
469:	learn: 0.0656434	total: 5m 2s	remaining: 2m 40s
470:	learn: 0.0656389	total: 5m 2s	remaining: 2m 40s
471:	learn: 0.0656306	total: 5m 3s	remaining: 2m 39s
472:	learn: 0.0656285	total: 5m 4s	remaining: 2m 38s
473:	learn: 0.0656255	total: 5m 4s	remaining: 2m 38s
474:	learn: 0.0656226	total: 5m 5s	remaining: 2m 37s
475:	learn: 0.0656136	total: 5m 5s	remaining: 2m 36s
476:	learn: 0.0656111	total: 5m 5s	remaining: 2m 35s
477:	learn: 0.0656056	total: 5m 6s	remaining: 2m 35s
478:	learn: 0.0656009	total: 5m 6s	remaining: 2m 34s
479:	learn: 0.0655899	total: 5m 7s	remaining: 2m 33s
480:	learn: 0.0655852	total: 5m 7s	remaining: 2m 32s
481:	learn: 0.0655775	total: 5m 8s	remaining: 2m 32s
482:	learn: 0.0655707	total: 5m 8s	remaining: 2m 31s
4

618:	learn: 0.0647778	total: 6m 12s	remaining: 1m
619:	learn: 0.0647758	total: 6m 12s	remaining: 1m
620:	learn: 0.0647668	total: 6m 13s	remaining: 59.5s
621:	learn: 0.0647614	total: 6m 13s	remaining: 58.8s
622:	learn: 0.0647540	total: 6m 13s	remaining: 58.2s
623:	learn: 0.0647475	total: 6m 14s	remaining: 57.6s
624:	learn: 0.0647469	total: 6m 14s	remaining: 56.9s
625:	learn: 0.0647378	total: 6m 14s	remaining: 56.3s
626:	learn: 0.0647300	total: 6m 15s	remaining: 55.7s
627:	learn: 0.0647256	total: 6m 15s	remaining: 55.1s
628:	learn: 0.0647134	total: 6m 16s	remaining: 54.5s
629:	learn: 0.0647100	total: 6m 17s	remaining: 53.9s
630:	learn: 0.0647068	total: 6m 18s	remaining: 53.3s
631:	learn: 0.0647038	total: 6m 18s	remaining: 52.7s
632:	learn: 0.0647010	total: 6m 19s	remaining: 52.1s
633:	learn: 0.0646950	total: 6m 19s	remaining: 51.5s
634:	learn: 0.0646909	total: 6m 20s	remaining: 50.9s
635:	learn: 0.0646903	total: 6m 20s	remaining: 50.2s
636:	learn: 0.0646853	total: 6m 20s	remaining: 49.6s

In [39]:
submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})
test_dates = {
    '201711': pd.Timestamp('2017-10-31')
}
for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred
    
submission.to_csv(data_path + 'Model_4_CatBoost.csv', float_format='%.6f',index=False)

Predicting for: 201711 ... 


In [40]:
predictions_model_4 = pd.read_csv(data_path + 'Model_4_CatBoost.csv')
predictions_model_4.head()

Unnamed: 0,ParcelId,201711
0,10754147,0.033367
1,10759547,0.008797
2,10843547,0.006214
3,10859147,0.03223
4,10879947,-0.005727
