In [1]:
import pandas as pd
import numpy as np
import gc 

import logging

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer

In [214]:
def load_data(path_to_data='data', sample_size = None):
	logging.info('Reading Properties 2016...')
	prop_2016 = pd.read_csv('{}/properties_2016.csv'.format(path_to_data))
	
	logging.info('Reading Properties 2017...')
	prop_2017 = pd.read_csv('{}/properties_2017.csv'.format(path_to_data))
	
	logging.info('Reading Train 2016...')
	target_2016 = pd.read_csv('{}/train_2016_v2.csv'.format(path_to_data))
	
	logging.info('Reading Train 2017..')
	target_2017 = pd.read_csv('{}/train_2017.csv'.format(path_to_data))
	
	logging.info('Performing merge')
	joined_data_2016 = pd.merge(target_2016,prop_2016,on="parcelid",how="left")
	joined_data_2017 = pd.merge(target_2017, prop_2017,on='parcelid',how='left')

	joined_data = pd.concat([joined_data_2016,joined_data_2017])

	# convert dates:
	joined_data.transactiondate = pd.to_datetime(joined_data.transactiondate,format="%Y-%m-%d")

	joined_data['transaction_mth'] = joined_data.transactiondate.apply(lambda x:x.month)
	joined_data['transaction_yr'] = joined_data.transactiondate.apply(lambda x: x.year)
	joined_data['transaction_day_of_wk'] = joined_data.transactiondate.apply(lambda x: x.dayofweek)
	joined_data=joined_data.drop('transactiondate',axis=1)


	# save memory
	for c, dtype in zip(joined_data.columns, joined_data.dtypes):
		if dtype == np.float64:
			joined_data[c] = joined_data[c].astype(np.float32)
    
	del target_2016
	del target_2017
	del prop_2016
	del prop_2017

	gc.collect()
	

	if sample_size is not None:
		logging.info('Sampling: {} of data'.format(sample_size))
		joined_data = joined_data.sample(frac=sample_size)

	return joined_data, joined_data['logerror'].values


In [215]:
def drop_columns(data, drop_cols):
	# mostly null
	data = data.drop(drop_cols, axis=1)
	data.drop_duplicates(inplace = True)

	return data

In [216]:
def columns_after_drop(numeric, categorical, drop_columns):
	numeric = list(set(numeric) - (set(numeric) & set(drop_columns)))
	categorical = list(set(categorical) - (set(categorical) & set(drop_columns)))

	return numeric, categorical

In [217]:
def impute_numerical_var(joined_data, numerical_cols):
	logging.info('Filling numeric NAs')

	# numerical vars
	numerical_data = joined_data.copy().reset_index()
	numerical_data = numerical_data[numerical_cols]
	numerical_data_cols = numerical_data.columns

	numeric_imp  = Imputer(strategy='median', axis=0)     
	numerical_data = pd.DataFrame(numeric_imp.fit_transform(numerical_data.values), columns=numerical_data_cols)

	return numerical_data,  {key:val for key,val in  zip(numerical_data_cols, numeric_imp.statistics_)}

In [218]:
def impute_categorical_var(joined_data, categorical_cols):
	# categorical vars
	categorical_data = joined_data.copy().reset_index()
	categorical_data  = categorical_data[categorical_cols]

	if 'hashottuborspa' in categorical_cols:
		categorical_data['hashottuborspa']=categorical_data['hashottuborspa'].apply(lambda x: 1 if x == 'True' else 0)

	if 'taxdelinquencyflag' in categorical_cols:
		categorical_data['taxdelinquencyflag']=categorical_data['hashottuborspa'].apply(lambda x: 1 if str(x).strip().lower() == 'y' else 0)

	for c, dtype in zip(categorical_data.columns, categorical_data.dtypes):
		categorical_data[c] = categorical_data[c].apply(lambda x: x if pd.isnull(x) else str(x))

	categorical_data_cols = categorical_data.columns

	most_frequent_lst = []
    
	logging.info('Using most frequent...')
    
	for col in categorical_data_cols:
		logging.info("Filling NA: {}".format(col))
		# logging.info("Filling NA: {}".format(col))
		mk=categorical_data[col].notnull()
		value_counts = categorical_data[mk][col].value_counts()
		most_frequent_lst.append(value_counts.index[0])
		categorical_data[col].fillna(most_frequent_lst[-1], inplace=True)

	return categorical_data, {key:val for key,val in zip(categorical_data_cols, most_frequent_lst)}

In [219]:
numeric_cols = ['assessmentyear','basementsqft',	'bathroomcnt',	'bedroomcnt',	'calculatedbathnbr', 
                'calculatedfinishedsquarefeet',	'finishedfloor1squarefeet',	'finishedsquarefeet12',
                'finishedsquarefeet13',	'finishedsquarefeet15',	'finishedsquarefeet50',	'finishedsquarefeet6',
                'fireplacecnt',	'fullbathcnt',	'garagecarcnt',	'garagetotalsqft',	'landtaxvaluedollarcnt',
                'lotsizesquarefeet',	'numberofstories',	'poolcnt',	'poolsizesum',	'roomcnt',
                'structuretaxvaluedollarcnt',	'taxamount',	'taxvaluedollarcnt',	'threequarterbathnbr',
                'unitcnt',	'yardbuildingsqft17',	'yardbuildingsqft26','transaction_day_of_wk','transaction_mth','transaction_yr',
               'taxdelinquencyyear','yearbuilt','latitude','longitude']

categorical_cols = ['airconditioningtypeid','architecturalstyletypeid','buildingclasstypeid','buildingqualitytypeid',
'censustractandblock','decktypeid','fips','fireplaceflag','hashottuborspa',
 'heatingorsystemtypeid','parcelid','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode',
 'propertylandusetypeid','propertyzoningdesc', 'rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip',
 'storytypeid','taxdelinquencyflag','typeconstructiontypeid']

drop_cols = ['buildingclasstypeid','propertyzoningdesc','garagetotalsqft',	'garagecarcnt',	'numberofstories',	'poolcnt',	'threequarterbathnbr',	
	'fireplacecnt',	'finishedfloor1squarefeet','finishedsquarefeet50','finishedsquarefeet15',
	'finishedsquarefeet12', 'yardbuildingsqft17',	'poolsizesum',	'finishedsquarefeet6',	'yardbuildingsqft26',	
	'basementsqft',	'finishedsquarefeet13','assessmentyear','calculatedbathnbr','parcelid',
	'rawcensustractandblock', 'censustractandblock','regionidzip','regionidcounty','regionidcity','regionidneighborhood',
	'regionidneighborhood','taxvaluedollarcnt','buildingclasstypeid','fireplaceflag','storytypeid']	

In [220]:
joined_data, logerror_var = load_data(path_to_data='data', sample_size=0.1)


2017-10-10 13:26:26,964 Reading Properties 2016...
  if self.run_code(code, result):
2017-10-10 13:26:48,288 Reading Properties 2017...
  if self.run_code(code, result):
2017-10-10 13:27:09,753 Reading Train 2016...
2017-10-10 13:27:09,834 Reading Train 2017..
2017-10-10 13:27:09,889 Performing merge
2017-10-10 13:27:20,170 Sampling: 0.1 of data


In [221]:
len(joined_data),len(logerror_var)

(16789, 16789)

In [222]:
joined_data = drop_columns(joined_data, drop_cols)

In [223]:
# if you need to drop a column, add it to drop_cols
numeric_cols, categorical_cols = columns_after_drop(numeric_cols, categorical_cols, drop_cols)

In [224]:
numeric_data, imputations_numeric = impute_numerical_var(joined_data, numeric_cols)

2017-10-10 13:27:24,491 Filling numeric NAs


In [225]:
numeric_data.head()

Unnamed: 0,transaction_mth,yearbuilt,roomcnt,taxamount,bedroomcnt,fullbathcnt,calculatedfinishedsquarefeet,bathroomcnt,structuretaxvaluedollarcnt,taxdelinquencyyear,longitude,transaction_day_of_wk,lotsizesquarefeet,landtaxvaluedollarcnt,unitcnt,latitude,transaction_yr
0,6.0,1971.0,6.0,5074.060059,3.0,1.0,1121.0,1.5,69258.0,14.0,-117850552.0,3.0,1453.0,297742.0,1.0,33905404.0,2016.0
1,8.0,1989.0,0.0,2276.600098,2.0,2.0,975.0,2.0,125044.0,14.0,-117744560.0,1.0,2392.0,95410.0,1.0,33599748.0,2016.0
2,6.0,1979.0,0.0,5003.439941,2.0,3.0,1417.0,3.0,159885.0,14.0,-118182632.0,4.0,88080.0,254327.0,1.0,34099472.0,2017.0
3,2.0,1969.0,0.0,3504.340088,3.0,2.0,1221.0,2.0,73519.0,14.0,-118075000.0,0.0,7214.0,202350.0,1.0,33785200.0,2017.0
4,9.0,1987.0,0.0,3023.610107,3.0,2.0,1140.0,2.0,155940.0,15.0,-118256568.0,4.0,5582.0,86083.0,1.0,34070128.0,2016.0


In [226]:
imputations_numeric

{'bathroomcnt': 2.0,
 'bedroomcnt': 3.0,
 'calculatedfinishedsquarefeet': 1534.0,
 'fullbathcnt': 2.0,
 'landtaxvaluedollarcnt': 198152.0,
 'latitude': 34024556.0,
 'longitude': -118178908.0,
 'lotsizesquarefeet': 7214.0,
 'roomcnt': 0.0,
 'structuretaxvaluedollarcnt': 134433.0,
 'taxamount': 4491.1201171875,
 'taxdelinquencyyear': 14.0,
 'transaction_day_of_wk': 2.0,
 'transaction_mth': 6.0,
 'transaction_yr': 2016.0,
 'unitcnt': 1.0,
 'yearbuilt': 1970.0}

In [227]:
categorical_data, imputations_categorical = impute_categorical_var(joined_data, categorical_cols)

2017-10-10 13:27:26,155 Using most frequent...
2017-10-10 13:27:26,156 Filling NA: hashottuborspa
2017-10-10 13:27:26,184 Filling NA: typeconstructiontypeid
2017-10-10 13:27:26,189 Filling NA: airconditioningtypeid
2017-10-10 13:27:26,200 Filling NA: heatingorsystemtypeid
2017-10-10 13:27:26,212 Filling NA: pooltypeid7
2017-10-10 13:27:26,221 Filling NA: decktypeid
2017-10-10 13:27:26,225 Filling NA: pooltypeid2
2017-10-10 13:27:26,234 Filling NA: buildingqualitytypeid
2017-10-10 13:27:26,245 Filling NA: taxdelinquencyflag
2017-10-10 13:27:26,262 Filling NA: fips
2017-10-10 13:27:26,278 Filling NA: architecturalstyletypeid
2017-10-10 13:27:26,283 Filling NA: pooltypeid10
2017-10-10 13:27:26,291 Filling NA: propertycountylandusecode
2017-10-10 13:27:26,305 Filling NA: propertylandusetypeid


In [228]:
categorical_data.head()

Unnamed: 0,hashottuborspa,typeconstructiontypeid,airconditioningtypeid,heatingorsystemtypeid,pooltypeid7,decktypeid,pooltypeid2,buildingqualitytypeid,taxdelinquencyflag,fips,architecturalstyletypeid,pooltypeid10,propertycountylandusecode,propertylandusetypeid
0,0,6.0,1.0,2.0,1.0,66.0,1.0,7.0,0,6059.0,7.0,1.0,122,261.0
1,0,6.0,1.0,2.0,1.0,66.0,1.0,7.0,0,6059.0,7.0,1.0,34,266.0
2,0,6.0,1.0,2.0,1.0,66.0,1.0,8.0,0,6037.0,7.0,1.0,010C,266.0
3,0,6.0,1.0,2.0,1.0,66.0,1.0,7.0,0,6059.0,7.0,1.0,34,266.0
4,0,6.0,1.0,2.0,1.0,66.0,1.0,7.0,0,6037.0,7.0,1.0,0100,261.0


In [229]:
imputations_categorical

{'airconditioningtypeid': '1.0',
 'architecturalstyletypeid': '7.0',
 'buildingqualitytypeid': '7.0',
 'decktypeid': '66.0',
 'fips': '6037.0',
 'hashottuborspa': '0',
 'heatingorsystemtypeid': '2.0',
 'pooltypeid10': '1.0',
 'pooltypeid2': '1.0',
 'pooltypeid7': '1.0',
 'propertycountylandusecode': '0100',
 'propertylandusetypeid': '261.0',
 'taxdelinquencyflag': '0',
 'typeconstructiontypeid': '6.0'}

In [230]:
cat_dummies = pd.get_dummies(categorical_data, drop_first=True)

In [231]:
joined_after_imputation = pd.concat([numeric_data,cat_dummies], axis=1)

In [232]:
len(joined_after_imputation)

16789

In [233]:
joined_after_imputation['logerror_var'] = pd.Series(logerror_var)

In [234]:
joined_after_imputation.head()

Unnamed: 0,transaction_mth,yearbuilt,roomcnt,taxamount,bedroomcnt,fullbathcnt,calculatedfinishedsquarefeet,bathroomcnt,structuretaxvaluedollarcnt,taxdelinquencyyear,...,propertylandusetypeid_248.0,propertylandusetypeid_260.0,propertylandusetypeid_261.0,propertylandusetypeid_263.0,propertylandusetypeid_265.0,propertylandusetypeid_266.0,propertylandusetypeid_267.0,propertylandusetypeid_269.0,propertylandusetypeid_275.0,logerror_var
0,6.0,1971.0,6.0,5074.060059,3.0,1.0,1121.0,1.5,69258.0,14.0,...,0,0,1,0,0,0,0,0,0,-0.0263
1,8.0,1989.0,0.0,2276.600098,2.0,2.0,975.0,2.0,125044.0,14.0,...,0,0,0,0,0,1,0,0,0,0.0421
2,6.0,1979.0,0.0,5003.439941,2.0,3.0,1417.0,3.0,159885.0,14.0,...,0,0,0,0,0,1,0,0,0,0.011746
3,2.0,1969.0,0.0,3504.340088,3.0,2.0,1221.0,2.0,73519.0,14.0,...,0,0,0,0,0,1,0,0,0,-0.029979
4,9.0,1987.0,0.0,3023.610107,3.0,2.0,1140.0,2.0,155940.0,15.0,...,0,0,1,0,0,0,0,0,0,-0.1827


## Feature importance

In [235]:
from sklearn.model_selection import train_test_split

In [236]:
X_train, X_test, y_train, y_test = train_test_split(joined_after_imputation.drop('logerror_var', axis=1).values,
                                                    joined_after_imputation['logerror_var'].values,
                                                    test_size=0.2, random_state=4)

In [237]:
len(X_train), len(X_test)

(13431, 3358)

In [238]:
from sklearn.linear_model import Lasso

In [239]:
all_vars = [col for col in joined_after_imputation.drop('logerror_var', axis=1).columns]
len(all_vars)

108

In [240]:
lasso_reg = Lasso(alpha=0.0001)

In [241]:
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [242]:
all_var_dict = {var:coeff for var,coeff in zip(all_vars, lasso_reg.coef_) if coeff != 0}
all_var_dict=sorted(all_var_dict.items(), key=lambda x: np.abs(x[1]))

In [210]:
all_var_dict

[('lotsizesquarefeet', -3.8122527250570593e-09),
 ('structuretaxvaluedollarcnt', -7.6757558005120995e-09),
 ('longitude', 7.8510234284182427e-09),
 ('latitude', 9.5071629164452201e-09),
 ('landtaxvaluedollarcnt', 2.369367770381383e-08),
 ('taxamount', -1.3499263462390239e-06),
 ('yearbuilt', 1.324246844513963e-05),
 ('calculatedfinishedsquarefeet', 1.7735282082377871e-05),
 ('roomcnt', 0.00024057264413665831),
 ('fullbathcnt', -0.0003011648662250127),
 ('unitcnt', -0.00031788767677477857),
 ('transaction_day_of_wk', 0.00038144041973245553),
 ('transaction_mth', 0.00078437879616370318),
 ('propertycountylandusecode_0300', -0.0009920080505423179),
 ('transaction_yr', 0.0013468662697211278),
 ('buildingqualitytypeid_7.0', 0.001365274637010838),
 ('propertycountylandusecode_1128', 0.0014760996588195729),
 ('bathroomcnt', 0.0016616814333212709),
 ('buildingqualitytypeid_8.0', -0.0022709429046287102),
 ('buildingqualitytypeid_4.0', 0.0024804242424680653),
 ('propertycountylandusecode_1', 0.0