In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
plt.style.use('fivethirtyeight')

In [4]:
prop_2016 = pd.read_csv('data/properties_2016.csv')

In [5]:
prop_2016.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2015.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,,,,,...,,,650756.0,1413387.0,2015.0,762631.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,7.0,,,...,1.0,,571346.0,1156834.0,2015.0,585488.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,,,193796.0,433491.0,2015.0,239695.0,5725.17,,,


In [6]:
prop_2017 = pd.read_csv('data/properties_2017.csv')

In [7]:
prop_2017.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,10754147,,,,0.0,0.0,,,,,...,,,,9.0,2016.0,9.0,,,,
1,10759547,,,,0.0,0.0,,,,,...,,,,27516.0,2015.0,27516.0,,,,
2,10843547,,,,0.0,0.0,5.0,,,,...,1.0,,660680.0,1434941.0,2016.0,774261.0,20800.37,,,
3,10859147,,,,0.0,0.0,3.0,6.0,,,...,1.0,,580059.0,1174475.0,2016.0,594416.0,14557.57,,,
4,10879947,,,,0.0,0.0,4.0,,,,...,1.0,,196751.0,440101.0,2016.0,243350.0,5725.17,,,


In [8]:
len(prop_2016), len(prop_2017)

(2985217, 2985217)

## Join with target variable

In [9]:
target_2016 = pd.read_csv('data/train_2016_v2.csv')

In [10]:
target_2017 = pd.read_csv('data/train_2017.csv')

In [11]:
target_2017.head()

Unnamed: 0,parcelid,logerror,transactiondate
0,14297519,0.025595,2017-01-01
1,17052889,0.055619,2017-01-01
2,14186244,0.005383,2017-01-01
3,12177905,-0.10341,2017-01-01
4,10887214,0.00694,2017-01-01


In [12]:
joined_data_2016 = pd.merge(target_2016,prop_2016,on="parcelid",how="left")

In [13]:
joined_data_2017 = pd.merge(target_2017, prop_2017,on='parcelid',how='left')

In [14]:
joined_data_2017.head()

Unnamed: 0,parcelid,logerror,transactiondate,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock
0,14297519,0.025595,2017-01-01,,,,3.5,4.0,,,...,,,485713.0,1023282.0,2016.0,537569.0,11013.72,,,60590630000000.0
1,17052889,0.055619,2017-01-01,,,,1.0,2.0,,,...,1.0,,88000.0,464000.0,2016.0,376000.0,5672.48,,,61110010000000.0
2,14186244,0.005383,2017-01-01,,,,2.0,3.0,,,...,1.0,,85289.0,564778.0,2016.0,479489.0,6488.3,,,60590220000000.0
3,12177905,-0.10341,2017-01-01,,,,3.0,4.0,,8.0,...,,,108918.0,145143.0,2016.0,36225.0,1777.51,,,60373000000000.0
4,10887214,0.00694,2017-01-01,1.0,,,3.0,3.0,,8.0,...,,,73681.0,119407.0,2016.0,45726.0,1533.89,,,60371240000000.0


In [250]:
# save memory
# del target_2016
# del target_2017
# del prop_2016
# del prop_2017

In [15]:
len(joined_data_2017), len(joined_data_2016)

(77613, 90275)

In [16]:
joined_data_2016.transactiondate = pd.to_datetime(joined_data_2016.transactiondate,format="%Y-%m-%d")

In [17]:
joined_data_2017.transactiondate = pd.to_datetime(joined_data_2017.transactiondate,format="%Y-%m-%d")

## Columns to drop

In [18]:
drop_nulls_cols=['garagetotalsqft',	
'garagecarcnt',	
'numberofstories',	
'poolcnt',	
'threequarterbathnbr',	
'fireplacecnt',	
'finishedfloor1squarefeet',
'finishedsquarefeet50',
'finishedsquarefeet15',
'finishedsquarefeet12',              
'yardbuildingsqft17',	
'poolsizesum',	
'finishedsquarefeet6',	
'yardbuildingsqft26',	
'basementsqft',	
'finishedsquarefeet13',
'assessmentyear',
'calculatedbathnbr',
'parcelid',
]	
joined_data_2016 = joined_data_2016.drop(drop_nulls_cols,axis=1)
joined_data_2017 = joined_data_2017.drop(drop_nulls_cols,axis=1)

In [19]:
joined_data = pd.concat([joined_data_2016,joined_data_2017])

In [20]:
len(joined_data)

167888

### Did things change?

In [21]:
joined_data=joined_data.drop_duplicates()

In [22]:
len(joined_data)

167884

In [23]:
joined_data['transaction_mth'] = joined_data.transactiondate.apply(lambda x:x.month)

In [24]:
joined_data['transaction_yr'] = joined_data.transactiondate.apply(lambda x: x.year)

In [25]:
joined_data['transaction_day_of_wk'] = joined_data.transactiondate.apply(lambda x: x.dayofweek)

In [26]:
joined_data=joined_data.drop('transactiondate',axis=1)

In [27]:
joined_data.head()

Unnamed: 0,logerror,airconditioningtypeid,architecturalstyletypeid,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,decktypeid,calculatedfinishedsquarefeet,fips,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transaction_mth,transaction_yr,transaction_day_of_wk
0,0.0276,1.0,,2.0,3.0,,4.0,,1684.0,6037.0,...,122754.0,360170.0,237416.0,6735.88,,,60371070000000.0,1,2016,4
1,-0.1684,,,3.5,4.0,,,,2263.0,6059.0,...,346458.0,585529.0,239071.0,10153.02,,,,1,2016,4
2,-0.004,1.0,,3.0,2.0,,4.0,,2217.0,6037.0,...,61994.0,119906.0,57912.0,11484.48,,,60374640000000.0,1,2016,4
3,0.0218,1.0,,2.0,2.0,,4.0,,839.0,6037.0,...,171518.0,244880.0,73362.0,3048.74,,,60372960000000.0,1,2016,5
4,-0.005,,,2.5,4.0,,,,2283.0,6059.0,...,169574.0,434551.0,264977.0,5488.96,,,60590420000000.0,1,2016,5


In [28]:
from sklearn.model_selection import train_test_split

In [29]:
train, test = train_test_split(joined_data, test_size=0.2,random_state=4)

In [30]:
len(train),len(test)

(134307, 33577)

In [31]:
train.head()

Unnamed: 0,logerror,airconditioningtypeid,architecturalstyletypeid,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,decktypeid,calculatedfinishedsquarefeet,fips,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,transaction_mth,transaction_yr,transaction_day_of_wk
11695,0.0257,,,2.5,3.0,,,,1470.0,6059.0,...,178209.0,512265.0,334056.0,7332.6,,,60590320000000.0,2,2016,3
90140,-0.0545,1.0,,2.0,3.0,,4.0,,1096.0,6037.0,...,110000.0,485000.0,375000.0,5756.66,,,60377030000000.0,12,2016,3
31009,0.061324,,,1.0,4.0,,4.0,,1184.0,6037.0,...,108330.0,299238.0,190908.0,4348.8,,,60375410000000.0,4,2017,4
13347,-0.034105,,,3.0,4.0,,8.0,,2155.0,6037.0,...,209293.0,302897.0,93604.0,4218.2,,,60375430000000.0,2,2017,1
73221,-0.009,,,1.0,2.0,,7.0,,1171.0,6037.0,...,59324.0,296828.0,237504.0,4103.85,,,60375020000000.0,9,2016,4


## Numeric Columns

In [32]:
numeric_cols = ['logerror', 'basementsqft',	'bathroomcnt',	'bedroomcnt',	'calculatedbathnbr', 
                'calculatedfinishedsquarefeet',	'finishedfloor1squarefeet',	'finishedsquarefeet12',
                'finishedsquarefeet13',	'finishedsquarefeet15',	'finishedsquarefeet50',	'finishedsquarefeet6',
                'fireplacecnt',	'fullbathcnt',	'garagecarcnt',	'garagetotalsqft',	'landtaxvaluedollarcnt',
                'lotsizesquarefeet',	'numberofstories',	'poolcnt',	'poolsizesum',	'roomcnt',
                'structuretaxvaluedollarcnt',	'taxamount',	'taxvaluedollarcnt',	'threequarterbathnbr',
                'unitcnt',	'yardbuildingsqft17',	'yardbuildingsqft26','transaction_day_of_wk','transaction_mth','transaction_yr',
               'taxdelinquencyyear','yearbuilt','latitude','longitude']

num_cols = list(set(numeric_cols) - set(drop_nulls_cols))

## Fill in NAs with median

In [33]:
for col in num_cols:
    train[col] = train[col].fillna(train[col].median())

In [438]:
categorical_cols = list(set(train.columns) - set(numeric_cols))

## Sample

In [34]:
train = train.sample(frac=0.01)

## Feature importance on Numeric Columns

In [35]:
X_train = train[num_cols].drop('logerror', axis=1)

In [36]:
train.shape

(1343, 43)

In [37]:
Y_train = train['logerror']

In [38]:
from sklearn.ensemble import RandomForestRegressor

In [42]:
rf_feat = RandomForestRegressor(n_jobs=-1,criterion='mae', random_state=4)

In [43]:
import time

In [44]:
start = time.time()

rf_feat.fit(X_train,Y_train)

stop = time.time()


In [45]:
print("Elapsed_time: {}".format(stop - start))

Elapsed_time: 2.6917481422424316


In [46]:
# When we use 1,343 observations
for numeric_feature, score in sorted(zip(rf_feat.feature_importances_,X_train.columns),reverse=True):
    print(numeric_feature, score)

0.121184768123 calculatedfinishedsquarefeet
0.11459088064 yearbuilt
0.109009929748 latitude
0.0869595053764 lotsizesquarefeet
0.0776987705731 landtaxvaluedollarcnt
0.077505588674 taxamount
0.0772508093351 longitude
0.0701180762111 structuretaxvaluedollarcnt
0.0662174961851 taxvaluedollarcnt
0.053085645884 transaction_mth
0.0354851995409 bedroomcnt
0.0331595336364 transaction_day_of_wk
0.0248484607795 bathroomcnt
0.0142210323165 fullbathcnt
0.0137335767679 unitcnt
0.0109381348019 roomcnt
0.00938145300279 transaction_yr
0.00461113840477 taxdelinquencyyear


In [453]:
# When we use 13,431 observations
for numeric_feature, score in sorted(zip(rf_feat.feature_importances_,X_train.columns),reverse=True):
    print(numeric_feature, score)

0.105275693118 calculatedfinishedsquarefeet
0.0945696537625 structuretaxvaluedollarcnt
0.09385762192 latitude
0.0910812204267 longitude
0.0906990473811 lotsizesquarefeet
0.0889718714763 yearbuilt
0.0830311672715 taxamount
0.0826697856521 landtaxvaluedollarcnt
0.0767326731044 taxvaluedollarcnt
0.0479610924268 transaction_mth
0.031974283538 transaction_day_of_wk
0.0296311045449 bedroomcnt
0.0203548183099 bathroomcnt
0.0180914909617 roomcnt
0.0138421131186 taxdelinquencyyear
0.0129543126702 fullbathcnt
0.0108114136248 transaction_yr
0.00749063669243 unitcnt
