In [1]:
import pandas as pd
import numpy as np
import gc 

import logging

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer

In [2]:
def load_data(path_to_data='data', sample_size = None):
	logging.info('Reading Properties 2016...')
	prop_2016 = pd.read_csv('{}/properties_2016.csv'.format(path_to_data))
	
	logging.info('Reading Properties 2017...')
	prop_2017 = pd.read_csv('{}/properties_2017.csv'.format(path_to_data))
	
	logging.info('Reading Train 2016...')
	target_2016 = pd.read_csv('{}/train_2016_v2.csv'.format(path_to_data))
	
	logging.info('Reading Train 2017..')
	target_2017 = pd.read_csv('{}/train_2017.csv'.format(path_to_data))
	
	logging.info('Performing merge')
	joined_data_2016 = pd.merge(target_2016,prop_2016,on="parcelid",how="left")
	joined_data_2017 = pd.merge(target_2017, prop_2017,on='parcelid',how='left')

	joined_data = pd.concat([joined_data_2016,joined_data_2017])

	# convert dates:
	joined_data.transactiondate = pd.to_datetime(joined_data.transactiondate,format="%Y-%m-%d")

	joined_data['transaction_mth'] = joined_data.transactiondate.apply(lambda x:x.month)
	joined_data['transaction_yr'] = joined_data.transactiondate.apply(lambda x: x.year)
	joined_data['transaction_day_of_wk'] = joined_data.transactiondate.apply(lambda x: x.dayofweek)
	joined_data=joined_data.drop('transactiondate',axis=1)


	# save memory
	for c, dtype in zip(joined_data.columns, joined_data.dtypes):
		if dtype == np.float64:
			joined_data[c] = joined_data[c].astype(np.float32)
    
	del target_2016
	del target_2017
	del prop_2016
	del prop_2017

	gc.collect()
	

	if sample_size is not None:
		logging.info('Sampling: {} of data'.format(sample_size))
		joined_data = joined_data.sample(frac=sample_size)

	return joined_data, joined_data['logerror'].values


In [3]:
def drop_columns(data, drop_cols):
	# mostly null
	data = data.drop(drop_cols, axis=1)
	data.drop_duplicates(inplace = True)

	return data

In [4]:
def columns_after_drop(numeric, categorical, drop_columns):
	numeric = list(set(numeric) - (set(numeric) & set(drop_columns)))
	categorical = list(set(categorical) - (set(categorical) & set(drop_columns)))

	return numeric, categorical

In [5]:
def impute_numerical_var(joined_data, numerical_cols):
	logging.info('Filling numeric NAs')

	# numerical vars
	numerical_data = joined_data.copy().reset_index()
	numerical_data = numerical_data[numerical_cols]
	numerical_data_cols = numerical_data.columns

	numeric_imp  = Imputer(strategy='median', axis=0)     
	numerical_data = pd.DataFrame(numeric_imp.fit_transform(numerical_data.values), columns=numerical_data_cols)

	return numerical_data,  {key:val for key,val in  zip(numerical_data_cols, numeric_imp.statistics_)}

In [6]:
def impute_categorical_var(joined_data, categorical_cols):
	# categorical vars
	categorical_data = joined_data.copy().reset_index()
	categorical_data  = categorical_data[categorical_cols]

	if 'hashottuborspa' in categorical_cols:
		categorical_data['hashottuborspa']=categorical_data['hashottuborspa'].apply(lambda x: 1 if x == 'True' else 0)

	if 'taxdelinquencyflag' in categorical_cols:
		categorical_data['taxdelinquencyflag']=categorical_data['hashottuborspa'].apply(lambda x: 1 if str(x).strip().lower() == 'y' else 0)

	for c, dtype in zip(categorical_data.columns, categorical_data.dtypes):
		categorical_data[c] = categorical_data[c].apply(lambda x: x if pd.isnull(x) else str(x))

	categorical_data_cols = categorical_data.columns

	most_frequent_lst = []
    
	logging.info('Using most frequent...')
    
	for col in categorical_data_cols:
		logging.info("Filling NA: {}".format(col))
		# logging.info("Filling NA: {}".format(col))
		mk=categorical_data[col].notnull()
		value_counts = categorical_data[mk][col].value_counts()
		most_frequent_lst.append(value_counts.index[0])
		categorical_data[col].fillna(most_frequent_lst[-1], inplace=True)

	return categorical_data, {key:val for key,val in zip(categorical_data_cols, most_frequent_lst)}

In [7]:
numeric_cols = ['assessmentyear','basementsqft',	'bathroomcnt',	'bedroomcnt',	'calculatedbathnbr', 
                'calculatedfinishedsquarefeet',	'finishedfloor1squarefeet',	'finishedsquarefeet12',
                'finishedsquarefeet13',	'finishedsquarefeet15',	'finishedsquarefeet50',	'finishedsquarefeet6',
                'fireplacecnt',	'fullbathcnt',	'garagecarcnt',	'garagetotalsqft',	'landtaxvaluedollarcnt',
                'lotsizesquarefeet',	'numberofstories',	'poolcnt',	'poolsizesum',	'roomcnt',
                'structuretaxvaluedollarcnt',	'taxamount',	'taxvaluedollarcnt',	'threequarterbathnbr',
                'unitcnt',	'yardbuildingsqft17',	'yardbuildingsqft26','transaction_day_of_wk','transaction_mth','transaction_yr',
               'taxdelinquencyyear','yearbuilt','latitude','longitude']

categorical_cols = ['airconditioningtypeid','architecturalstyletypeid','buildingclasstypeid','buildingqualitytypeid',
'censustractandblock','decktypeid','fips','fireplaceflag','hashottuborspa',
 'heatingorsystemtypeid','parcelid','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode',
 'propertylandusetypeid','propertyzoningdesc', 'rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip',
 'storytypeid','taxdelinquencyflag','typeconstructiontypeid']

drop_cols = ['buildingclasstypeid','propertyzoningdesc','garagetotalsqft',	'garagecarcnt',	'numberofstories',	'poolcnt',	'threequarterbathnbr',	
	'fireplacecnt',	'finishedfloor1squarefeet','finishedsquarefeet50','finishedsquarefeet15',
	'finishedsquarefeet12', 'yardbuildingsqft17',	'poolsizesum',	'finishedsquarefeet6',	'yardbuildingsqft26',	
	'basementsqft',	'finishedsquarefeet13','assessmentyear','calculatedbathnbr','parcelid',
	'rawcensustractandblock', 'censustractandblock','regionidzip','regionidcounty','regionidcity','regionidneighborhood',
	'regionidneighborhood','taxvaluedollarcnt','buildingclasstypeid','fireplaceflag','storytypeid']	

In [8]:
joined_data, logerror_var = load_data(path_to_data='data')


2017-10-10 15:32:11,427 Reading Properties 2016...
  if self.run_code(code, result):
2017-10-10 15:32:34,804 Reading Properties 2017...
  if self.run_code(code, result):
2017-10-10 15:32:57,430 Reading Train 2016...
2017-10-10 15:32:57,504 Reading Train 2017..
2017-10-10 15:32:57,573 Performing merge


In [9]:
len(joined_data),len(logerror_var)

(167888, 167888)

In [10]:
joined_data = drop_columns(joined_data, drop_cols)

In [11]:
# if you need to drop a column, add it to drop_cols
numeric_cols, categorical_cols = columns_after_drop(numeric_cols, categorical_cols, drop_cols)

In [12]:
numeric_data, imputations_numeric = impute_numerical_var(joined_data, numeric_cols)

2017-10-10 15:33:22,631 Filling numeric NAs


In [13]:
numeric_data.head()

Unnamed: 0,taxdelinquencyyear,transaction_mth,lotsizesquarefeet,bedroomcnt,taxamount,bathroomcnt,transaction_day_of_wk,latitude,calculatedfinishedsquarefeet,unitcnt,structuretaxvaluedollarcnt,transaction_yr,roomcnt,longitude,landtaxvaluedollarcnt,fullbathcnt,yearbuilt
0,14.0,1.0,7528.0,3.0,6735.879883,2.0,4.0,34280992.0,1684.0,1.0,122754.0,2016.0,0.0,-118488536.0,237416.0,2.0,1959.0
1,14.0,1.0,3643.0,4.0,10153.019531,3.5,4.0,33668120.0,2263.0,1.0,346458.0,2016.0,0.0,-117677552.0,239071.0,3.0,2014.0
2,14.0,1.0,11423.0,2.0,11484.480469,3.0,4.0,34136312.0,2217.0,1.0,61994.0,2016.0,0.0,-118175032.0,57912.0,3.0,1940.0
3,14.0,1.0,70859.0,2.0,3048.73999,2.0,5.0,33755800.0,839.0,1.0,171518.0,2016.0,0.0,-118309000.0,73362.0,2.0,1987.0
4,14.0,1.0,6000.0,4.0,5488.959961,2.5,5.0,33485644.0,2283.0,1.0,169574.0,2016.0,8.0,-117700232.0,264977.0,2.0,1981.0


In [14]:
imputations_numeric

{'bathroomcnt': 2.0,
 'bedroomcnt': 3.0,
 'calculatedfinishedsquarefeet': 1541.0,
 'fullbathcnt': 2.0,
 'landtaxvaluedollarcnt': 197783.0,
 'latitude': 34021700.0,
 'longitude': -118176848.0,
 'lotsizesquarefeet': 7200.0,
 'roomcnt': 0.0,
 'structuretaxvaluedollarcnt': 134048.0,
 'taxamount': 4501.080078125,
 'taxdelinquencyyear': 14.0,
 'transaction_day_of_wk': 2.0,
 'transaction_mth': 6.0,
 'transaction_yr': 2016.0,
 'unitcnt': 1.0,
 'yearbuilt': 1970.0}

In [15]:
categorical_data, imputations_categorical = impute_categorical_var(joined_data, categorical_cols)

2017-10-10 15:33:26,307 Using most frequent...
2017-10-10 15:33:26,308 Filling NA: pooltypeid10
2017-10-10 15:33:26,429 Filling NA: decktypeid
2017-10-10 15:33:26,448 Filling NA: taxdelinquencyflag
2017-10-10 15:33:26,563 Filling NA: fips
2017-10-10 15:33:26,671 Filling NA: propertylandusetypeid
2017-10-10 15:33:26,767 Filling NA: architecturalstyletypeid
2017-10-10 15:33:26,785 Filling NA: typeconstructiontypeid
2017-10-10 15:33:26,804 Filling NA: pooltypeid7
2017-10-10 15:33:26,849 Filling NA: airconditioningtypeid
2017-10-10 15:33:26,904 Filling NA: heatingorsystemtypeid
2017-10-10 15:33:26,978 Filling NA: buildingqualitytypeid
2017-10-10 15:33:27,053 Filling NA: propertycountylandusecode
2017-10-10 15:33:27,144 Filling NA: pooltypeid2
2017-10-10 15:33:27,171 Filling NA: hashottuborspa


In [16]:
categorical_data.head()

Unnamed: 0,pooltypeid10,decktypeid,taxdelinquencyflag,fips,propertylandusetypeid,architecturalstyletypeid,typeconstructiontypeid,pooltypeid7,airconditioningtypeid,heatingorsystemtypeid,buildingqualitytypeid,propertycountylandusecode,pooltypeid2,hashottuborspa
0,1.0,66.0,0,6037.0,261.0,7.0,6.0,1.0,1.0,2.0,4.0,0100,1.0,0
1,1.0,66.0,0,6059.0,261.0,7.0,6.0,1.0,1.0,2.0,7.0,1,1.0,0
2,1.0,66.0,0,6037.0,261.0,7.0,6.0,1.0,1.0,2.0,4.0,0100,1.0,0
3,1.0,66.0,0,6037.0,266.0,7.0,6.0,1.0,1.0,2.0,4.0,010C,1.0,0
4,1.0,66.0,0,6059.0,261.0,7.0,6.0,1.0,1.0,2.0,7.0,122,1.0,0


In [17]:
imputations_categorical

{'airconditioningtypeid': '1.0',
 'architecturalstyletypeid': '7.0',
 'buildingqualitytypeid': '7.0',
 'decktypeid': '66.0',
 'fips': '6037.0',
 'hashottuborspa': '0',
 'heatingorsystemtypeid': '2.0',
 'pooltypeid10': '1.0',
 'pooltypeid2': '1.0',
 'pooltypeid7': '1.0',
 'propertycountylandusecode': '0100',
 'propertylandusetypeid': '261.0',
 'taxdelinquencyflag': '0',
 'typeconstructiontypeid': '6.0'}

In [18]:
cat_dummies = pd.get_dummies(categorical_data, drop_first=True)

In [19]:
joined_after_imputation = pd.concat([numeric_data,cat_dummies], axis=1)

In [20]:
len(joined_after_imputation)

167882

In [21]:
joined_after_imputation['logerror'] = pd.Series(logerror_var)

In [22]:
joined_after_imputation.head()

Unnamed: 0,taxdelinquencyyear,transaction_mth,lotsizesquarefeet,bedroomcnt,taxamount,bathroomcnt,transaction_day_of_wk,latitude,calculatedfinishedsquarefeet,unitcnt,...,propertycountylandusecode_1720,propertycountylandusecode_1722,propertycountylandusecode_200,propertycountylandusecode_34,propertycountylandusecode_38,propertycountylandusecode_6050,propertycountylandusecode_73,propertycountylandusecode_8800,propertycountylandusecode_96,logerror
0,14.0,1.0,7528.0,3.0,6735.879883,2.0,4.0,34280992.0,1684.0,1.0,...,0,0,0,0,0,0,0,0,0,0.0276
1,14.0,1.0,3643.0,4.0,10153.019531,3.5,4.0,33668120.0,2263.0,1.0,...,0,0,0,0,0,0,0,0,0,-0.1684
2,14.0,1.0,11423.0,2.0,11484.480469,3.0,4.0,34136312.0,2217.0,1.0,...,0,0,0,0,0,0,0,0,0,-0.004
3,14.0,1.0,70859.0,2.0,3048.73999,2.0,5.0,33755800.0,839.0,1.0,...,0,0,0,0,0,0,0,0,0,0.0218
4,14.0,1.0,6000.0,4.0,5488.959961,2.5,5.0,33485644.0,2283.0,1.0,...,0,0,0,0,0,0,0,0,0,-0.005


## Feature importance

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(joined_after_imputation.drop('logerror', axis=1).values,
                                                    joined_after_imputation['logerror'].values,
                                                    test_size=0.2, random_state=4)

In [25]:
len(X_train), len(X_test)

(134305, 33577)

In [26]:
X_train.shape

(134305, 156)

In [27]:
from sklearn.linear_model import Lasso

In [28]:
all_vars = [col for col in joined_after_imputation.drop('logerror', axis=1).columns]
len(all_vars)

156

In [29]:
lasso_reg = Lasso(alpha=0.0001)

In [30]:
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [31]:
all_var_dict = {var:coeff for var,coeff in zip(all_vars, lasso_reg.coef_) if coeff != 0}
all_var_dict=sorted(all_var_dict.items(), key=lambda x: np.abs(x[1]))

In [32]:
all_var_dict

[('longitude', 1.3933486993596964e-09),
 ('lotsizesquarefeet', -3.031127353872585e-09),
 ('landtaxvaluedollarcnt', -3.4115444162655875e-09),
 ('latitude', 3.9809732374193372e-09),
 ('structuretaxvaluedollarcnt', 5.7544000356620697e-09),
 ('taxamount', 5.3131712147494121e-08),
 ('calculatedfinishedsquarefeet', -4.1176225065170676e-07),
 ('yearbuilt', -5.1304358852944771e-05),
 ('roomcnt', 0.0001127179225120754),
 ('buildingqualitytypeid_8.0', 0.00015320774245032365),
 ('propertycountylandusecode_0100', -0.00017652291615917259),
 ('unitcnt', 0.00026809945905595703),
 ('transaction_mth', 0.00039801918321146244),
 ('propertycountylandusecode_122', 0.0004347999300523306),
 ('transaction_day_of_wk', -0.00044042598119878892),
 ('bedroomcnt', 0.00070204102442653906),
 ('buildingqualitytypeid_6.0', -0.0013475501172625234),
 ('heatingorsystemtypeid_7.0', 0.0017550960504351357),
 ('propertycountylandusecode_010C', 0.00218121903654036),
 ('buildingqualitytypeid_4.0', -0.0024976555175130568),
 ('tr

In [34]:
all_var = [var for var, _ in all_var_dict]
all_var.insert(len(all_var), 'logerror')
all_var

['longitude',
 'lotsizesquarefeet',
 'landtaxvaluedollarcnt',
 'latitude',
 'structuretaxvaluedollarcnt',
 'taxamount',
 'calculatedfinishedsquarefeet',
 'yearbuilt',
 'roomcnt',
 'buildingqualitytypeid_8.0',
 'propertycountylandusecode_0100',
 'unitcnt',
 'transaction_mth',
 'propertycountylandusecode_122',
 'transaction_day_of_wk',
 'bedroomcnt',
 'buildingqualitytypeid_6.0',
 'heatingorsystemtypeid_7.0',
 'propertycountylandusecode_010C',
 'buildingqualitytypeid_4.0',
 'transaction_yr',
 'logerror']

In [35]:
joined_after_imputation[all_var].to_csv('data/train-ml_v2.csv', index=False)