In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

In [None]:
print('Loading data ...')

# Replacement column names applied, in file order, to the properties CSV.
# NOTE(review): 'tax_assessed_land_alue' looks like a typo for '..._value';
# it is kept as spelled because other cells may reference it — confirm.
names = [
    'parcelid',
    'air_conditioning_type', 'architectural_style', 'area_basement',
    'num_bathroom', 'num_bedroom', 'framing_type', 'building_quality',
    'num_bathroom_calc', 'deck_type', 'area_firstfloor_finished',
    'area_total_calc', 'area_living_finished', 'perimeter_living',
    'area_total', 'area_firstfloor_unfinished', 'area_base', 'fips',
    'num_fireplace', 'num_fullbath', 'num_garagecar', 'area_garage',
    'hashottuborspa', 'heating_type', 'latitude', 'longitude', 'area_lot',
    'num_pool', 'area_pools', 'pooltypeid10', 'pooltypeid2', 'pooltypeid7',
    'property_land_use_code', 'property_land_use_type',
    'property_zoning_desc', 'census_raw_tract_block', 'region_city',
    'region_county', 'region_neighborhood', 'region_zipcode', 'num_room',
    'story_type', 'num_34_bath', 'material_type', 'num_unit', 'area_patio',
    'area_shed', 'build_year', 'num_stories', 'flag_fireplace',
    'tax_assessed_structure_value', 'tax_assessed_parcel_value',
    'tax_assessment_year', 'tax_assessed_land_alue', 'tax_property',
    'tax_delinquency_flag', 'tax_delinquency_year', 'census_tract_block',
]

train = pd.read_csv('../input/train_2016.csv')
# header=0 discards the file's own header row so that `names` replaces it.
prop = pd.read_csv('../input/properties_2016.csv', names=names, header=0)
sample = pd.read_csv('../input/sample_submission.csv')

print('Binding to float32')

# Down-cast every float64 column of the properties table to float32.
float64_columns = prop.dtypes[prop.dtypes == np.float64].index
for column in float64_columns:
    prop[column] = prop[column].astype(np.float32)

for frame in (train, prop, sample):
    print(frame.shape)

train.head()

In [None]:
print('Creating training set ...')

# Attach the property attributes to each training row, then separate the
# target (`logerror`) from the features.
df_train = train.merge(prop, how='left', on='parcelid')
y_train = df_train['logerror'].values
df_train = df_train.drop(['logerror', 'transactiondate'], axis=1)

print(df_train.shape)

print('Creating testing set ...')

# Build the id frame as a new object instead of renaming/dropping columns on
# `sample` itself: the cell stays re-runnable and `sample` keeps its original
# columns (including 'ParcelId') for any later cell that reads them.
sample_ids = sample[['ParcelId']].rename(columns={'ParcelId': 'parcelid'})
df_sample = pd.merge(sample_ids, prop, on='parcelid', how='left')

print(df_sample.shape)

In [None]:
print('Merging training and testing for preprocessing...')

# Both frames must expose the same columns, in the same order, before they
# are stacked.
columns_match = np.array_equal(df_train.columns.values, df_sample.columns.values)
assert columns_match

# Number of leading rows of the stacked frame that belong to the training set.
rows_train = df_train.shape[0]
df_all = pd.concat([df_train, df_sample])

print(df_all.shape)

In [None]:
# Columns replaced by indicator columns named '<column>_<level>', including a
# '<column>_nan' indicator for missing values (dummy_na=True).
one_hot_columns = [
    'air_conditioning_type',
    'architectural_style',
    'framing_type',
    'deck_type',
    'fips',
    'hashottuborspa',
    'heating_type',
    'pooltypeid10',
    'pooltypeid2',
    'pooltypeid7',
    'region_county',
    'story_type',
    'flag_fireplace',
    'tax_assessment_year',
    'tax_delinquency_flag',
    'tax_delinquency_year',
    'material_type',
]

# Columns removed outright, without any encoding.
discarded_columns = [
    'census_raw_tract_block',
    'census_tract_block',
    'region_city',
    'region_neighborhood',
    'region_zipcode',
]

# A single get_dummies call removes the encoded source columns and appends
# their indicators in list order. The resulting column layout matches doing
# one concat + drop per column, without copying the full frame each time.
df_all = pd.get_dummies(
    df_all.drop(discarded_columns, axis=1),
    columns=one_hot_columns,
    dummy_na=True,
)

print(df_all.shape)

In [None]:
# Undo the stacking: the first `rows_train` rows are the training rows, the
# remainder are the rows to predict.
df_train, df_sample = df_all[:rows_train], df_all[rows_train:]

for frame in (df_train, df_sample):
    print(frame.shape)

In [None]:
# Feature matrix: every column except the parcel identifier.
x_train = df_train.drop(['parcelid'], axis=1)
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

# Object-typed columns are replaced by a boolean column that is True only
# where the stored value compares equal to True.
object_columns = x_train.dtypes[x_train.dtypes == object].index.values
for column in object_columns:
    x_train[column] = x_train[column] == True

#del df_train; gc.collect()

# Positional hold-out: rows before `split` train the model, the rest validate.
split = 80000
x_valid, y_valid = x_train[split:], y_train[split:]
x_train, y_train = x_train[:split], y_train[:split]

In [None]:
print('Building DMatrix...')

# Wrap the training and validation feature/label pairs in DMatrix objects.
d_train, d_valid = (
    xgb.DMatrix(features, label=target)
    for features, target in ((x_train, y_train), (x_valid, y_valid))
)

#del x_train, x_valid; gc.collect()

In [None]:
print('Training ...')

# Booster configuration.
# NOTE(review): recent XGBoost releases document 'reg:squarederror' and
# 'verbosity' in place of 'reg:linear' and 'silent' — confirm against the
# installed version before changing.
params = {
    'eta': 0.1,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'max_depth': 3,
    'silent': 1,
}

# The last watchlist entry ('valid') is the one early stopping monitors.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(
    params,
    d_train,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=10,
)

del d_train, d_valid

In [None]:
print('Building test set ...')

# `sample` may no longer carry 'ParcelId' if it was already renamed to
# 'parcelid' upstream; only derive the key when the source column is present
# so this cell does not raise KeyError.
if 'ParcelId' in sample.columns:
    sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

# DataFrame.shape gives the same tuple as `.values.shape` without first
# materialising the whole frame as a single array.
print(df_test.shape)

In [None]:
# Levels seen in fewer than this many rows are merged into one catch-all level.
min_level_count = 1000

# (column, catch-all label) pairs, in the order their indicators are appended.
rare_level_labels = [
    ('property_zoning_desc', 'zone_other'),
    ('property_land_use_type', 'landtype_other'),
    ('property_land_use_code', 'landuse_other'),
]

for column, other_label in rare_level_labels:
    # Work on the value_counts Series directly: its index holds the levels and
    # its values hold the counts. This avoids reset_index(), whose output
    # column names differ between pandas versions.
    level_counts = df_all[column].value_counts()
    rare_levels = level_counts[level_counts < min_level_count].index
    # A plain boolean array sidesteps index alignment; missing values are
    # never in `rare_levels`, so they stay missing.
    is_rare = df_all[column].isin(rare_levels).values
    df_all[column] = df_all[column].mask(is_rare, other_label)

# Indicator columns are named after the level itself (no prefix).
bucketed_columns = [column for column, _ in rare_level_labels]
indicator_frames = [pd.get_dummies(df_all[column]) for column in bucketed_columns]
df_all = pd.concat([df_all.drop(bucketed_columns, axis=1)] + indicator_frames, axis=1)

print(df_all.shape)