## CAT BOOST REGRESSOR

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
import zipfile
import re
from scipy import stats
from catboost import CatBoostRegressor
from tqdm import tqdm
import gc
import datetime as dt

<font color = blue> Once we have imported the necessary libraries, we are going to define our data path and then, load the data: </font>

In [2]:
data_path = './data/'

In [3]:
print('Loading Properties ...')
properties2016 = pd.read_csv(data_path + 'properties_2016.csv', low_memory = False)
properties2017 = pd.read_csv(data_path + 'properties_2017.csv', low_memory = False)

Loading Properties ...


In [4]:
properties2016.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [5]:
properties2016.shape

(2985217, 58)

In [6]:
properties2017.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


In [7]:
properties2017.shape

(2985217, 58)

<font color = blue> We are going to specify the parameter 'parse_dates' to make the column 'transactiondate' in train files date formatted: </font>

In [8]:
print('Loading Train ...')
train2016 = pd.read_csv(data_path + 'train_2016_v2.csv', parse_dates=['transactiondate'], low_memory=False)
train2017 = pd.read_csv(data_path + 'train_2017.csv', parse_dates=['transactiondate'], low_memory=False)

Loading Train ...


In [9]:
train2016.shape

(90275, 3)

In [10]:
train2017.shape

(77613, 3)

In [11]:
train2016.head()

Unnamed: 0,parcelid,logerror,transactiondate
0,11016594,0.0276,2016-01-01
1,14366692,-0.1684,2016-01-01
2,12098116,-0.004,2016-01-01
3,12643413,0.0218,2016-01-02
4,14432541,-0.005,2016-01-02


<font color = blue> Now we are going to parse dates to get one column for year, one for month, one for day and one for quarter: </font>

In [12]:
def add_date_features(df):
    df["transaction_year"] = df["transactiondate"].dt.year
    df["transaction_month"] = (df["transactiondate"].dt.year - 2016)*12 + df["transactiondate"].dt.month
    df["transaction_day"] = df["transactiondate"].dt.day
    df["transaction_quarter"] = (df["transactiondate"].dt.year - 2016)*4 + df["transactiondate"].dt.quarter
    df.drop(["transactiondate"], inplace=True, axis=1)
    return df

In [13]:
train2016 = add_date_features(train2016)
train2017 = add_date_features(train2017)

In [14]:
train2016.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter
0,11016594,0.0276,2016,1,1,1
1,14366692,-0.1684,2016,1,1,1
2,12098116,-0.004,2016,1,1,1
3,12643413,0.0218,2016,1,2,1
4,14432541,-0.005,2016,1,2,1


In [15]:
print('Loading Sample ...')
sample_submission = pd.read_csv(data_path + 'sample_submission.csv', low_memory = False)

Loading Sample ...


<font color = blue> Our sample submission will be a file with the parcel id for each property and six prediction columns for different dates (last 2016 quarter and last 2017 quarter): </font>

In [16]:
sample_submission.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0,0,0,0,0,0
1,10759547,0,0,0,0,0,0
2,10843547,0,0,0,0,0,0
3,10859147,0,0,0,0,0,0
4,10879947,0,0,0,0,0,0


<font color = blue> As we are going to apply the first model, cat boost, to the column for 201611 predictions, we remove the other columns: </font>

In [17]:
sample_submission_model_3 = sample_submission.drop(columns = ['201610', '201612', '201710', '201711', '201712'])
sample_submission_model_3.head()

Unnamed: 0,ParcelId,201611
0,10754147,0
1,10759547,0
2,10843547,0
3,10859147,0
4,10879947,0


<font color = blue> We will merge properties and train files to get all data grouped: </font>

In [18]:
print('Merging Train with Properties ...')
train2016 = pd.merge(train2016, properties2016, how = 'left', on = 'parcelid')
train2017 = pd.merge(train2017, properties2017, how = 'left', on = 'parcelid')

Merging Train with Properties ...


In [19]:
train2016.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,,,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016,1,1,1,1.0,,,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,,,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016,1,2,1,,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [20]:
train2016.shape

(90275, 63)

In [21]:
train2017.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,14297519,0.025595,2017,13,1,5,,,,3.5,...,,,485713.0,1023282.0,2016.0,537569.0,11013.72,,,60590630000000.0
1,17052889,0.055619,2017,13,1,5,,,,1.0,...,1.0,,88000.0,464000.0,2016.0,376000.0,5672.48,,,61110010000000.0
2,14186244,0.005383,2017,13,1,5,,,,2.0,...,1.0,,85289.0,564778.0,2016.0,479489.0,6488.3,,,60590220000000.0
3,12177905,-0.10341,2017,13,1,5,,,,3.0,...,,,108918.0,145143.0,2016.0,36225.0,1777.51,,,60373000000000.0
4,10887214,0.00694,2017,13,1,5,1.0,,,3.0,...,,,73681.0,119407.0,2016.0,45726.0,1533.89,,,60371240000000.0


In [22]:
train2017.shape

(77613, 63)

<font color = blue> As we should not use the 2016 tax values when predicting log errors against the 2016 log errors, we will omit this information for the 2016 predicitons columns: </font>

In [23]:
print('Tax Features 2017  ...')
train2017.iloc[:, train2017.columns.str.startswith('tax')] = np.nan
train2017.iloc[:, train2017.columns.str.endswith('taxvaluedollarcnt')] = np.nan

Tax Features 2017  ...


In [24]:
train2017.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,14297519,0.025595,2017,13,1,5,,,,3.5,...,,,,,2016.0,,,,,60590630000000.0
1,17052889,0.055619,2017,13,1,5,,,,1.0,...,1.0,,,,2016.0,,,,,61110010000000.0
2,14186244,0.005383,2017,13,1,5,,,,2.0,...,1.0,,,,2016.0,,,,,60590220000000.0
3,12177905,-0.10341,2017,13,1,5,,,,3.0,...,,,,,2016.0,,,,,60373000000000.0
4,10887214,0.00694,2017,13,1,5,1.0,,,3.0,...,,,,,2016.0,,,,,60371240000000.0


<font color = blue> Now we will generate our training and test files to run the model, the training file will be all properties and train from 2016 and 2017, while for the test, we will merge the sample submission with the unique properties file: </font>

In [25]:
print('Concat Train 2016 & 2017 ...')
train_df = pd.concat([train2016, train2017], axis = 0)
test_df = pd.merge(sample_submission_model_3[['ParcelId']], properties2016.rename(columns = {'parcelid': 'ParcelId'}), how = 'left', on = 'ParcelId')


Concat Train 2016 & 2017 ...


In [26]:
train_df.head()

Unnamed: 0,parcelid,logerror,transaction_year,transaction_month,transaction_day,transaction_quarter,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,11016594,0.0276,2016,1,1,1,1.0,,,2.0,...,,,122754.0,360170.0,2015.0,237416.0,6735.88,,,60371070000000.0
1,14366692,-0.1684,2016,1,1,1,,,,3.5,...,,,346458.0,585529.0,2015.0,239071.0,10153.02,,,
2,12098116,-0.004,2016,1,1,1,1.0,,,3.0,...,,,61994.0,119906.0,2015.0,57912.0,11484.48,,,60374640000000.0
3,12643413,0.0218,2016,1,2,1,1.0,,,2.0,...,,,171518.0,244880.0,2015.0,73362.0,3048.74,,,60372960000000.0
4,14432541,-0.005,2016,1,2,1,,,,2.5,...,2.0,,169574.0,434551.0,2015.0,264977.0,5488.96,,,60590420000000.0


In [27]:
train_df.shape

(167888, 63)

In [28]:
test_df.head()

Unnamed: 0,ParcelId,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [29]:
test_df.shape

(2985217, 58)

<font color = blue> To optimize memory management, we are using garbage collections algorithms (GC), which solve reference cycles (when one or more objects are referencing each other) that reference counting cannot detect: </font>

In [30]:
del properties2016, properties2017, train2016, train2017
gc.collect();

<font color = blue> Let's do some feature engineering. We have to deal with missing values, where we will establish a threshold of 98% missing values to remove those fields, and also with features with one unique value and others that we do not want to use in our training: </font>

In [31]:
print('Missing data fields to remove ...')
missing_perc_thresh = 0.95
exclude_missing = []
num_rows = train_df.shape[0]
for c in train_df.columns:
    num_missing = train_df[c].isnull().sum()
    if num_missing == 0:
        continue
    missing_frac = num_missing / float(num_rows)
    if missing_frac > missing_perc_thresh:
        exclude_missing.append(c)
print(exclude_missing)
print("We exclude: %s" % len(exclude_missing))

Missing data fields to remove ...
['architecturalstyletypeid', 'basementsqft', 'buildingclasstypeid', 'decktypeid', 'finishedsquarefeet13', 'finishedsquarefeet15', 'finishedsquarefeet6', 'hashottuborspa', 'poolsizesum', 'pooltypeid10', 'pooltypeid2', 'storytypeid', 'typeconstructiontypeid', 'yardbuildingsqft17', 'yardbuildingsqft26', 'fireplaceflag', 'taxdelinquencyflag', 'taxdelinquencyyear']
We exclude: 18


In [32]:
print ("Remove features with one unique value ...")
exclude_unique = []
for c in train_df.columns:
    num_uniques = len(train_df[c].unique())
    if train_df[c].isnull().sum() != 0:
        num_uniques -= 1
    if num_uniques == 1:
        exclude_unique.append(c)
print(exclude_unique)
print("We exclude: %s" % len(exclude_unique))

Remove features with one unique value ...
['decktypeid', 'hashottuborspa', 'poolcnt', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7', 'storytypeid', 'fireplaceflag', 'taxdelinquencyflag']
We exclude: 9


In [33]:
del num_rows, missing_perc_thresh
gc.collect();

In [34]:
print ("Define training features ...")
exclude_other = ['parcelid', 'logerror', 'propertyzoningdesc']
train_features = []
for c in train_df.columns:
    if c not in exclude_missing \
       and c not in exclude_other:
        train_features.append(c)
print(train_features)
print("We use these for training: %s" % len(train_features))

Define training features ...
['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'bathroomcnt', 'bedroomcnt', 'buildingqualitytypeid', 'calculatedbathnbr', 'finishedfloor1squarefeet', 'calculatedfinishedsquarefeet', 'finishedsquarefeet12', 'finishedsquarefeet50', 'fips', 'fireplacecnt', 'fullbathcnt', 'garagecarcnt', 'garagetotalsqft', 'heatingorsystemtypeid', 'latitude', 'longitude', 'lotsizesquarefeet', 'poolcnt', 'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'rawcensustractandblock', 'regionidcity', 'regionidcounty', 'regionidneighborhood', 'regionidzip', 'roomcnt', 'threequarterbathnbr', 'unitcnt', 'yearbuilt', 'numberofstories', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt', 'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount', 'censustractandblock']
We use these for training: 42


<font color = blue> Now we have to deal with categorical features: </font>

In [35]:
print ("Define categorical features ...")
cat_feature_inds = []
cat_unique_thresh = 100
for i, c in enumerate(train_features):
    num_uniques = len(train_df[c].unique())
    if num_uniques < cat_unique_thresh \
       and not 'sqft' in c \
       and not 'cnt' in c \
       and not 'nbr' in c \
       and not 'number' in c:
        cat_feature_inds.append(i)
print("Cat features are: %s" % [train_features[ind] for ind in cat_feature_inds])

Define categorical features ...
Cat features are: ['transaction_year', 'transaction_month', 'transaction_day', 'transaction_quarter', 'airconditioningtypeid', 'buildingqualitytypeid', 'fips', 'heatingorsystemtypeid', 'pooltypeid7', 'propertycountylandusecode', 'propertylandusetypeid', 'regionidcounty', 'assessmentyear']


<font color = blue> As we are going to use a tree based model, we will replace NaN values by '-999' so they do not interfere with proper data: </font>

In [36]:
print ("Replacing NaN values by -999 ...")
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

Replacing NaN values by -999 ...


In [37]:
print ("Training model 3: Cat boost ...")
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

test_df['transactiondate'] = pd.Timestamp('2016-12-01') 
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

Training model 3: Cat boost ...
(167888, 42) (167888,)
(2985217, 42)


In [38]:
y_pred = 0.0
model = CatBoostRegressor(
    iterations = 720, learning_rate = 0.03,
    depth = 8, l2_leaf_reg = 5,
    loss_function = 'MAE',
    eval_metric = 'MAE',
    random_seed = i)
model.fit(
    X_train, y_train,
    cat_features = cat_feature_inds)
y_pred += model.predict(X_test)


0:	learn: 0.0688886	total: 575ms	remaining: 6m 53s
1:	learn: 0.0686979	total: 1.1s	remaining: 6m 35s
2:	learn: 0.0685757	total: 1.67s	remaining: 6m 38s
3:	learn: 0.0684958	total: 2.34s	remaining: 6m 58s
4:	learn: 0.0684539	total: 2.78s	remaining: 6m 37s
5:	learn: 0.0684085	total: 4.21s	remaining: 8m 20s
6:	learn: 0.0683441	total: 4.77s	remaining: 8m 5s
7:	learn: 0.0683032	total: 5.24s	remaining: 7m 46s
8:	learn: 0.0682586	total: 5.68s	remaining: 7m 28s
9:	learn: 0.0682175	total: 6.22s	remaining: 7m 21s
10:	learn: 0.0681764	total: 6.66s	remaining: 7m 9s
11:	learn: 0.0681526	total: 7.06s	remaining: 6m 56s
12:	learn: 0.0681346	total: 7.47s	remaining: 6m 46s
13:	learn: 0.0681071	total: 7.89s	remaining: 6m 37s
14:	learn: 0.0680927	total: 8.34s	remaining: 6m 31s
15:	learn: 0.0680698	total: 8.8s	remaining: 6m 27s
16:	learn: 0.0680362	total: 9.3s	remaining: 6m 24s
17:	learn: 0.0680272	total: 9.71s	remaining: 6m 18s
18:	learn: 0.0679979	total: 10.2s	remaining: 6m 15s
19:	learn: 0.0679702	total:

158:	learn: 0.0670785	total: 1m 10s	remaining: 4m 8s
159:	learn: 0.0670749	total: 1m 10s	remaining: 4m 7s
160:	learn: 0.0670699	total: 1m 11s	remaining: 4m 7s
161:	learn: 0.0670617	total: 1m 11s	remaining: 4m 6s
162:	learn: 0.0670602	total: 1m 11s	remaining: 4m 5s
163:	learn: 0.0670601	total: 1m 11s	remaining: 4m 4s
164:	learn: 0.0670555	total: 1m 12s	remaining: 4m 3s
165:	learn: 0.0670546	total: 1m 12s	remaining: 4m 2s
166:	learn: 0.0670499	total: 1m 13s	remaining: 4m 1s
167:	learn: 0.0670475	total: 1m 13s	remaining: 4m 1s
168:	learn: 0.0670429	total: 1m 13s	remaining: 4m
169:	learn: 0.0670376	total: 1m 14s	remaining: 3m 59s
170:	learn: 0.0670322	total: 1m 14s	remaining: 3m 59s
171:	learn: 0.0670312	total: 1m 14s	remaining: 3m 58s
172:	learn: 0.0670307	total: 1m 15s	remaining: 3m 57s
173:	learn: 0.0670258	total: 1m 15s	remaining: 3m 57s
174:	learn: 0.0670200	total: 1m 16s	remaining: 3m 56s
175:	learn: 0.0670188	total: 1m 16s	remaining: 3m 55s
176:	learn: 0.0670163	total: 1m 17s	remain

311:	learn: 0.0665643	total: 2m 33s	remaining: 3m 20s
312:	learn: 0.0665626	total: 2m 34s	remaining: 3m 21s
313:	learn: 0.0665578	total: 2m 35s	remaining: 3m 21s
314:	learn: 0.0665578	total: 2m 35s	remaining: 3m 20s
315:	learn: 0.0665562	total: 2m 36s	remaining: 3m 20s
316:	learn: 0.0665562	total: 2m 36s	remaining: 3m 19s
317:	learn: 0.0665554	total: 2m 37s	remaining: 3m 19s
318:	learn: 0.0665544	total: 2m 38s	remaining: 3m 19s
319:	learn: 0.0665452	total: 2m 40s	remaining: 3m 20s
320:	learn: 0.0665417	total: 2m 41s	remaining: 3m 20s
321:	learn: 0.0665351	total: 2m 42s	remaining: 3m 20s
322:	learn: 0.0665330	total: 2m 43s	remaining: 3m 20s
323:	learn: 0.0665257	total: 2m 43s	remaining: 3m 20s
324:	learn: 0.0665204	total: 2m 45s	remaining: 3m 20s
325:	learn: 0.0665190	total: 2m 45s	remaining: 3m 20s
326:	learn: 0.0665161	total: 2m 46s	remaining: 3m 20s
327:	learn: 0.0665105	total: 2m 47s	remaining: 3m 19s
328:	learn: 0.0665101	total: 2m 47s	remaining: 3m 19s
329:	learn: 0.0665082	total:

465:	learn: 0.0660034	total: 4m 7s	remaining: 2m 14s
466:	learn: 0.0659992	total: 4m 7s	remaining: 2m 14s
467:	learn: 0.0659967	total: 4m 8s	remaining: 2m 13s
468:	learn: 0.0659934	total: 4m 8s	remaining: 2m 13s
469:	learn: 0.0659912	total: 4m 9s	remaining: 2m 12s
470:	learn: 0.0659840	total: 4m 11s	remaining: 2m 12s
471:	learn: 0.0659819	total: 4m 11s	remaining: 2m 12s
472:	learn: 0.0659793	total: 4m 12s	remaining: 2m 11s
473:	learn: 0.0659737	total: 4m 12s	remaining: 2m 11s
474:	learn: 0.0659713	total: 4m 13s	remaining: 2m 10s
475:	learn: 0.0659675	total: 4m 13s	remaining: 2m 9s
476:	learn: 0.0659662	total: 4m 13s	remaining: 2m 9s
477:	learn: 0.0659612	total: 4m 14s	remaining: 2m 8s
478:	learn: 0.0659574	total: 4m 15s	remaining: 2m 8s
479:	learn: 0.0659565	total: 4m 15s	remaining: 2m 7s
480:	learn: 0.0659539	total: 4m 16s	remaining: 2m 7s
481:	learn: 0.0659515	total: 4m 17s	remaining: 2m 6s
482:	learn: 0.0659476	total: 4m 19s	remaining: 2m 7s
483:	learn: 0.0659469	total: 4m 20s	remai

619:	learn: 0.0652200	total: 5m 50s	remaining: 56.6s
620:	learn: 0.0652151	total: 5m 52s	remaining: 56.1s
621:	learn: 0.0652113	total: 5m 53s	remaining: 55.6s
622:	learn: 0.0652091	total: 5m 53s	remaining: 55.1s
623:	learn: 0.0652064	total: 5m 54s	remaining: 54.6s
624:	learn: 0.0651998	total: 5m 55s	remaining: 54s
625:	learn: 0.0651985	total: 5m 56s	remaining: 53.5s
626:	learn: 0.0651953	total: 5m 56s	remaining: 52.9s
627:	learn: 0.0651935	total: 5m 57s	remaining: 52.4s
628:	learn: 0.0651875	total: 5m 58s	remaining: 51.9s
629:	learn: 0.0651801	total: 5m 59s	remaining: 51.4s
630:	learn: 0.0651732	total: 6m	remaining: 50.9s
631:	learn: 0.0651716	total: 6m 1s	remaining: 50.3s
632:	learn: 0.0651631	total: 6m 2s	remaining: 49.9s
633:	learn: 0.0651605	total: 6m 3s	remaining: 49.3s
634:	learn: 0.0651596	total: 6m 4s	remaining: 48.8s
635:	learn: 0.0651570	total: 6m 5s	remaining: 48.2s
636:	learn: 0.0651537	total: 6m 6s	remaining: 47.7s
637:	learn: 0.0651466	total: 6m 6s	remaining: 47.2s
638:	l

In [39]:
submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})
test_dates = {
    '201611': pd.Timestamp('2016-10-31')
}
for label, test_date in test_dates.items():
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred
    
submission.to_csv(data_path + 'Model_3_CatBoost.csv', float_format='%.6f',index=False)

Predicting for: 201611 ... 


In [40]:
predictions_model_3 = pd.read_csv(data_path + 'Model_3_CatBoost.csv')
predictions_model_3.head()

Unnamed: 0,ParcelId,201611
0,10754147,0.030659
1,10759547,-0.003619
2,10843547,0.07579
3,10859147,0.105809
4,10879947,0.020252
