In [1]:
import gc
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb



In [19]:
properties2016 = pd.read_csv('../input/properties_2016.csv', low_memory=False)
properties2017 = pd.read_csv('../input/properties_2017.csv', low_memory=False)
train2016 = pd.read_csv('../input/train_2016_v2.csv')
train2017 = pd.read_csv('../input/train_2017.csv')

properties=pd.concat([properties2016,properties2017])
train=pd.concat([train2016,train2017])

In [21]:
del train2016, train2017,properties2016,properties2017
gc.collect()

584

In [3]:
for c, dtype in zip(properties.columns, properties.dtypes):
    if dtype == np.float64:
        properties[c] = properties[c].astype(np.float32)

In [4]:
print('Creating training set ...')

df_train = train.merge(properties, how='left', on='parcelid')

Creating training set ...


In [5]:
x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'],
                        axis=1)  # XGboost is good at dealing with numbers but definitely not good when dealing with string. So the solution is neither we drop them or transform them.
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:  # The columns which are "object" types have NaN values and True values. I he's converting the NaNs to False for easier processing.
    x_train[c] = (x_train[c] == True)

del df_train;
gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]



(335776, 55) (335776,)


In [6]:
print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid;
gc.collect()



Building DMatrix...


14

In [7]:
print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid



Training ...
[0]	train-mae:0.489636	valid-mae:0.48535
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 100 rounds.
[10]	train-mae:0.403782	valid-mae:0.399788
[20]	train-mae:0.334292	valid-mae:0.330464
[30]	train-mae:0.278177	valid-mae:0.274569
[40]	train-mae:0.23302	valid-mae:0.229623
[50]	train-mae:0.196785	valid-mae:0.193646
[60]	train-mae:0.167901	valid-mae:0.164963
[70]	train-mae:0.144949	valid-mae:0.14222
[80]	train-mae:0.126862	valid-mae:0.124312
[90]	train-mae:0.112762	valid-mae:0.110356
[100]	train-mae:0.101856	valid-mae:0.09955
[110]	train-mae:0.09355	valid-mae:0.091308
[120]	train-mae:0.08729	valid-mae:0.085123
[130]	train-mae:0.082641	valid-mae:0.080559
[140]	train-mae:0.079217	valid-mae:0.077232
[150]	train-mae:0.07672	valid-mae:0.074842
[160]	train-mae:0.074897	valid-mae:0.073138
[170]	train-mae:0.073581	valid-mae:0.071936
[180]	train-mae:0.07263	valid-mae:0.071073
[190]	train-mae:0.071928	v

In [None]:
print('Building test set ...')
sample = pd.read_csv('../input/sample_submission.csv', low_memory=False)
sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(properties, on='parcelid', how='left')

del properties;
gc.collect()

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample;
gc.collect()

d_test = xgb.DMatrix(x_test)

del x_test;
gc.collect()



Building test set ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [16]:
d_test.shape()

NameError: name 'd_test' is not defined

In [9]:
print('Predicting on test ...')

p_test = clf.predict(d_test)

del d_test;
gc.collect()

Predicting on test ...


0

In [15]:
p_test.shape

(5970434,)

In [11]:
sub = pd.read_csv('../input/sample_submission.csv')

In [12]:
sub.shape

(2985217, 7)

In [None]:
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

from datetime import datetime

print('Writing csv ...')
sub.to_csv('../submission/xgb_starter{}.csv.gz'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False, float_format='%.4g', compression='gzip')  # Thanks to @inversion
