In [1]:
import pandas as pd
import numpy as np
import gc 

import logging

logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer

In [2]:
# https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    logging.debug("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # logging.debug current column type
            logging.debug("******************************")
            logging.debug("Column: ",col)
            logging.debug("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # logging.debug new column type
            logging.debug("dtype after: ",props[col].dtype)
            logging.debug("******************************")
    
    # logging.debug final result
    logging.debug("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    logging.debug("Memory usage is: ",mem_usg," MB")
    logging.debug("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [3]:
def load_data(path_to_data='data', sample_size = None):
    logging.info('Reading Properties 2016...')
    prop_2016 = pd.read_csv('{}/properties_2016.csv'.format(path_to_data))
    
    logging.info('Reading Properties 2017...')
    prop_2017 = pd.read_csv('{}/properties_2017.csv'.format(path_to_data))
    
    logging.info('Reading Train 2016...')
    target_2016 = pd.read_csv('{}/train_2016_v2.csv'.format(path_to_data))
    
    logging.info('Reading Train 2017..')
    target_2017 = pd.read_csv('{}/train_2017.csv'.format(path_to_data))
    
    logging.info('Performing merge')
    joined_data_2016 = pd.merge(target_2016,prop_2016,on="parcelid",how="left")
    joined_data_2017 = pd.merge(target_2017, prop_2017,on='parcelid',how='left')

    joined_data = pd.concat([joined_data_2016,joined_data_2017])

    # convert dates:
    joined_data.transactiondate = pd.to_datetime(joined_data.transactiondate,format="%Y-%m-%d")

    joined_data['transaction_mth'] = joined_data.transactiondate.apply(lambda x:x.month)
    joined_data['transaction_yr'] = joined_data.transactiondate.apply(lambda x: x.year)
    joined_data['transaction_day_of_wk'] = joined_data.transactiondate.apply(lambda x: x.dayofweek)
    joined_data=joined_data.drop('transactiondate',axis=1)


    # save memory
    for c, dtype in zip(joined_data.columns, joined_data.dtypes):
        if dtype == np.float64:
            joined_data[c] = joined_data[c].astype(np.float32)
    
    del target_2016
    del target_2017
    del prop_2016
    del prop_2017

    gc.collect()
    

    if sample_size is not None:
        logging.info('Sampling: {} of data'.format(sample_size))
        joined_data = joined_data.sample(frac=sample_size)

    return joined_data, joined_data['logerror'].values


In [4]:
def drop_columns(data, drop_cols):
	# mostly null
	data = data.drop(drop_cols, axis=1)
	data.drop_duplicates(inplace = True)

	return data

In [5]:
def columns_after_drop(numeric, categorical, drop_columns):
	numeric = list(set(numeric) - (set(numeric) & set(drop_columns)))
	categorical = list(set(categorical) - (set(categorical) & set(drop_columns)))

	return numeric, categorical

In [6]:
def impute_numerical_var(joined_data, numerical_cols, imputation_num=None):
    logging.info('Filling numeric NAs')
    
    
    
    if bool(imputation_num):
        numerical_data = joined_data.copy().reset_index()
        numerical_data = numerical_data[numerical_cols]
        numerical_data_cols = numerical_data.columns
        
        for col,val in imputation_num.items():
            logging.info("Filling NA: {}".format(col))
            numerical_data[col].fillna(val, inplace=True)
        
        return numerical_data
    
    else:
        median_lst = []
    # numerical vars
        numerical_data = joined_data.copy().reset_index()
        numerical_data = numerical_data[numerical_cols]
        numerical_data_cols = numerical_data.columns
        
        for col in numerical_data_cols:
            logging.info("2. Filling NA: {}".format(col))
            median_lst.append(numerical_data[col].median())
            numerical_data[col].fillna(median_lst[-1], inplace=True)

        return numerical_data, {key:val for key,val in zip(numeric_cols, median_lst)}

In [15]:
def impute_categorical_var(joined_data, categorical_cols, imputation_cat=None):
    # categorical vars
    categorical_data = joined_data.copy().reset_index()
    categorical_data  = categorical_data[categorical_cols]

    print(len(categorical_data))
    
    if 'hashottuborspa' in categorical_cols:
        categorical_data['hashottuborspa']=categorical_data['hashottuborspa'].apply(lambda x: 1 if str(x).strip().lower() == 'true' else 0)

    if 'taxdelinquencyflag' in categorical_cols:
        categorical_data['taxdelinquencyflag']=categorical_data['taxdelinquencyflag'].apply(lambda x: 1 if str(x).strip().lower() == 'y' else 0)

    for c, dtype in zip(categorical_data.columns, categorical_data.dtypes):
        categorical_data[c] = categorical_data[c].apply(lambda x: x if pd.isnull(x) else str(x))
                                                        
    if bool(imputation_cat):
        for col,val in imputation_cat.items():
            logging.info("Filling NA: {}".format(col))
            categorical_data[col].fillna(val, inplace=True)
            
        return categorical_data

    else:
        
        categorical_data_cols = categorical_data.columns

        most_frequent_lst = []

        logging.info('Using most frequent...')

        for col in categorical_data_cols:
            logging.info("Filling NA: {}".format(col))
            # logging.info("Filling NA: {}".format(col))
            mk=categorical_data[col].notnull()
            value_counts = categorical_data[mk][col].value_counts()
            most_frequent_lst.append(value_counts.index[0])
            categorical_data[col].fillna(most_frequent_lst[-1], inplace=True)

        return categorical_data, {key:val for key,val in zip(categorical_data_cols, most_frequent_lst)}

In [29]:
numeric_cols = ['assessmentyear','basementsqft',	'bathroomcnt',	'bedroomcnt',	'calculatedbathnbr', 
                'calculatedfinishedsquarefeet',	'finishedfloor1squarefeet',	'finishedsquarefeet12',
                'finishedsquarefeet13',	'finishedsquarefeet15',	'finishedsquarefeet50',	'finishedsquarefeet6',
                'fireplacecnt',	'fullbathcnt',	'garagecarcnt',	'garagetotalsqft',	'landtaxvaluedollarcnt',
                'lotsizesquarefeet',	'numberofstories',	'poolcnt',	'poolsizesum',	'roomcnt',
                'structuretaxvaluedollarcnt',	'taxamount',	'taxvaluedollarcnt',	'threequarterbathnbr',
                'unitcnt',	'yardbuildingsqft17',	'yardbuildingsqft26','taxdelinquencyyear','yearbuilt','latitude','longitude']

categorical_cols = ['airconditioningtypeid','architecturalstyletypeid','buildingclasstypeid','buildingqualitytypeid',
'censustractandblock','decktypeid','fips','fireplaceflag','hashottuborspa',
 'heatingorsystemtypeid','parcelid','pooltypeid10','pooltypeid2','pooltypeid7','propertycountylandusecode',
 'propertylandusetypeid','propertyzoningdesc', 'rawcensustractandblock','regionidcity','regionidcounty','regionidneighborhood','regionidzip',
 'storytypeid','taxdelinquencyflag','typeconstructiontypeid']

drop_cols = ['buildingclasstypeid','propertyzoningdesc','garagetotalsqft',	'garagecarcnt',	'numberofstories',	'poolcnt',	'threequarterbathnbr',	
	'fireplacecnt',	'finishedfloor1squarefeet','finishedsquarefeet50','finishedsquarefeet15',
	'finishedsquarefeet12', 'yardbuildingsqft17',	'poolsizesum',	'finishedsquarefeet6',	'yardbuildingsqft26',	
	'basementsqft',	'finishedsquarefeet13','assessmentyear','calculatedbathnbr','parcelid',
	'rawcensustractandblock', 'censustractandblock','regionidzip','regionidcounty','regionidcity','regionidneighborhood',
	'regionidneighborhood','taxvaluedollarcnt','buildingclasstypeid','fireplaceflag','storytypeid','structuretaxvaluedollarcnt']	

In [62]:
joined_data, logerror_var = load_data(path_to_data='data')


2017-10-16 22:48:02,526 Reading Properties 2016...
  if self.run_code(code, result):
2017-10-16 22:48:26,371 Reading Properties 2017...
  if self.run_code(code, result):
2017-10-16 22:48:47,589 Reading Train 2016...
2017-10-16 22:48:47,666 Reading Train 2017..
2017-10-16 22:48:47,720 Performing merge


In [63]:
joined_data = drop_columns(joined_data, drop_cols)

In [64]:
# if you need to drop a column, add it to drop_cols
numeric_cols, categorical_cols = columns_after_drop(numeric_cols, categorical_cols, drop_cols)

In [65]:
numeric_data, imputations_numeric = impute_numerical_var(joined_data, numeric_cols)

2017-10-16 22:49:14,469 Filling numeric NAs
2017-10-16 22:49:14,518 2. Filling NA: bedroomcnt
2017-10-16 22:49:14,571 2. Filling NA: unitcnt
2017-10-16 22:49:14,612 2. Filling NA: taxamount
2017-10-16 22:49:14,651 2. Filling NA: longitude
2017-10-16 22:49:14,691 2. Filling NA: roomcnt
2017-10-16 22:49:14,729 2. Filling NA: landtaxvaluedollarcnt
2017-10-16 22:49:14,767 2. Filling NA: taxdelinquencyyear
2017-10-16 22:49:14,808 2. Filling NA: fullbathcnt
2017-10-16 22:49:14,853 2. Filling NA: lotsizesquarefeet
2017-10-16 22:49:14,893 2. Filling NA: yearbuilt
2017-10-16 22:49:14,933 2. Filling NA: latitude
2017-10-16 22:49:14,974 2. Filling NA: calculatedfinishedsquarefeet
2017-10-16 22:49:15,014 2. Filling NA: bathroomcnt


In [66]:
numeric_data.head()

Unnamed: 0,bedroomcnt,unitcnt,taxamount,longitude,roomcnt,landtaxvaluedollarcnt,taxdelinquencyyear,fullbathcnt,lotsizesquarefeet,yearbuilt,latitude,calculatedfinishedsquarefeet,bathroomcnt
0,3.0,1.0,6735.879883,-118488536.0,0.0,237416.0,14.0,2.0,7528.0,1959.0,34280992.0,1684.0,2.0
1,4.0,1.0,10153.019531,-117677552.0,0.0,239071.0,14.0,3.0,3643.0,2014.0,33668120.0,2263.0,3.5
2,2.0,1.0,11484.480469,-118175032.0,0.0,57912.0,14.0,3.0,11423.0,1940.0,34136312.0,2217.0,3.0
3,2.0,1.0,3048.73999,-118309000.0,0.0,73362.0,14.0,2.0,70859.0,1987.0,33755800.0,839.0,2.0
4,4.0,1.0,5488.959961,-117700232.0,8.0,264977.0,14.0,2.0,6000.0,1981.0,33485644.0,2283.0,2.5


In [67]:
imputations_numeric

{'bathroomcnt': 2.0,
 'bedroomcnt': 3.0,
 'calculatedfinishedsquarefeet': 1541.0,
 'fullbathcnt': 2.0,
 'landtaxvaluedollarcnt': 197783.0,
 'latitude': 34021700.0,
 'longitude': -118176848.0,
 'lotsizesquarefeet': 7200.0,
 'roomcnt': 0.0,
 'taxamount': 4501.080078125,
 'taxdelinquencyyear': 14.0,
 'unitcnt': 1.0,
 'yearbuilt': 1970.0}

In [68]:
categorical_data, imputations_categorical = impute_categorical_var(joined_data, categorical_cols)

167882


2017-10-16 22:49:20,271 Using most frequent...
2017-10-16 22:49:20,272 Filling NA: pooltypeid10
2017-10-16 22:49:20,389 Filling NA: taxdelinquencyflag
2017-10-16 22:49:20,514 Filling NA: pooltypeid7
2017-10-16 22:49:20,559 Filling NA: propertycountylandusecode
2017-10-16 22:49:20,653 Filling NA: typeconstructiontypeid
2017-10-16 22:49:20,671 Filling NA: hashottuborspa
2017-10-16 22:49:20,772 Filling NA: buildingqualitytypeid
2017-10-16 22:49:20,850 Filling NA: heatingorsystemtypeid
2017-10-16 22:49:20,925 Filling NA: airconditioningtypeid
2017-10-16 22:49:20,979 Filling NA: decktypeid
2017-10-16 22:49:20,999 Filling NA: fips
2017-10-16 22:49:21,093 Filling NA: architecturalstyletypeid
2017-10-16 22:49:21,114 Filling NA: propertylandusetypeid
2017-10-16 22:49:21,207 Filling NA: pooltypeid2


In [69]:
categorical_data.head()

Unnamed: 0,pooltypeid10,taxdelinquencyflag,pooltypeid7,propertycountylandusecode,typeconstructiontypeid,hashottuborspa,buildingqualitytypeid,heatingorsystemtypeid,airconditioningtypeid,decktypeid,fips,architecturalstyletypeid,propertylandusetypeid,pooltypeid2
0,1.0,0,1.0,0100,6.0,0,4.0,2.0,1.0,66.0,6037.0,7.0,261.0,1.0
1,1.0,0,1.0,1,6.0,0,7.0,2.0,1.0,66.0,6059.0,7.0,261.0,1.0
2,1.0,0,1.0,0100,6.0,0,4.0,2.0,1.0,66.0,6037.0,7.0,261.0,1.0
3,1.0,0,1.0,010C,6.0,0,4.0,2.0,1.0,66.0,6037.0,7.0,266.0,1.0
4,1.0,0,1.0,122,6.0,0,7.0,2.0,1.0,66.0,6059.0,7.0,261.0,1.0


In [70]:
categorical_data.shape

(167882, 14)

In [71]:
imputations_categorical

{'airconditioningtypeid': '1.0',
 'architecturalstyletypeid': '7.0',
 'buildingqualitytypeid': '7.0',
 'decktypeid': '66.0',
 'fips': '6037.0',
 'hashottuborspa': '0',
 'heatingorsystemtypeid': '2.0',
 'pooltypeid10': '1.0',
 'pooltypeid2': '1.0',
 'pooltypeid7': '1.0',
 'propertycountylandusecode': '0100',
 'propertylandusetypeid': '261.0',
 'taxdelinquencyflag': '0',
 'typeconstructiontypeid': '6.0'}

In [72]:
cat_dummies = pd.get_dummies(categorical_data, drop_first=True)

In [73]:
cat_dummies.shape

(167882, 140)

In [74]:
transaction_dates = joined_data[['transaction_mth','transaction_yr','transaction_day_of_wk']].copy().reset_index()

In [75]:
transaction_dates.drop('index',inplace=True,axis=1)

In [76]:
joined_after_imputation = pd.concat([numeric_data,cat_dummies, transaction_dates], axis=1)

In [77]:
len(joined_after_imputation)

167882

In [78]:
joined_after_imputation.shape

(167882, 156)

In [79]:
joined_after_imputation['logerror'] = pd.Series(logerror_var)

In [80]:
joined_after_imputation.iloc[1]

bedroomcnt                        4.000000e+00
unitcnt                           1.000000e+00
taxamount                         1.015302e+04
longitude                        -1.176776e+08
roomcnt                           0.000000e+00
landtaxvaluedollarcnt             2.390710e+05
taxdelinquencyyear                1.400000e+01
fullbathcnt                       3.000000e+00
lotsizesquarefeet                 3.643000e+03
yearbuilt                         2.014000e+03
latitude                          3.366812e+07
calculatedfinishedsquarefeet      2.263000e+03
bathroomcnt                       3.500000e+00
propertycountylandusecode_010     0.000000e+00
propertycountylandusecode_0100    0.000000e+00
propertycountylandusecode_0101    0.000000e+00
propertycountylandusecode_0102    0.000000e+00
propertycountylandusecode_0103    0.000000e+00
propertycountylandusecode_0104    0.000000e+00
propertycountylandusecode_0105    0.000000e+00
propertycountylandusecode_0108    0.000000e+00
propertycount

In [81]:
joined_after_imputation.head()

Unnamed: 0,bedroomcnt,unitcnt,taxamount,longitude,roomcnt,landtaxvaluedollarcnt,taxdelinquencyyear,fullbathcnt,lotsizesquarefeet,yearbuilt,...,propertylandusetypeid_266.0,propertylandusetypeid_267.0,propertylandusetypeid_269.0,propertylandusetypeid_275.0,propertylandusetypeid_31.0,propertylandusetypeid_47.0,transaction_mth,transaction_yr,transaction_day_of_wk,logerror
0,3.0,1.0,6735.879883,-118488536.0,0.0,237416.0,14.0,2.0,7528.0,1959.0,...,0,0,0,0,0,0,1,2016,4,0.0276
1,4.0,1.0,10153.019531,-117677552.0,0.0,239071.0,14.0,3.0,3643.0,2014.0,...,0,0,0,0,0,0,1,2016,4,-0.1684
2,2.0,1.0,11484.480469,-118175032.0,0.0,57912.0,14.0,3.0,11423.0,1940.0,...,0,0,0,0,0,0,1,2016,4,-0.004
3,2.0,1.0,3048.73999,-118309000.0,0.0,73362.0,14.0,2.0,70859.0,1987.0,...,1,0,0,0,0,0,1,2016,5,0.0218
4,4.0,1.0,5488.959961,-117700232.0,8.0,264977.0,14.0,2.0,6000.0,1981.0,...,0,0,0,0,0,0,1,2016,5,-0.005


## Feature importance

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
X_train, X_test, y_train, y_test = train_test_split(joined_after_imputation.drop(['logerror'], axis=1).values,
                                                    joined_after_imputation['logerror'].values,
                                                    test_size=0.2, random_state=4)

In [84]:
len(X_train), len(X_test)

(134305, 33577)

In [85]:
from sklearn.metrics import mean_absolute_error

In [86]:
X_train.shape

(134305, 156)

In [87]:
from sklearn.linear_model import Lasso

In [88]:
all_vars = [col for col in joined_after_imputation.drop(['logerror'], axis=1).columns]
len(all_vars)

156

In [351]:
lasso_reg = Lasso(alpha=0.01)

In [352]:
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [353]:
ys = lasso_reg.predict(X_test)

In [354]:
mean_absolute_error(y_pred=ys,y_true=y_test)

0.069848562616405693

In [355]:
y_ts = lasso_reg.predict(X_train)

In [356]:
mean_absolute_error(y_pred=y_ts, y_true=y_train)

0.069382414322877214

In [357]:
all_var_dict = {var:coeff for var,coeff in zip(all_vars, lasso_reg.coef_) if coeff != 0}
all_var_dict=sorted(all_var_dict.items(), key=lambda x: np.abs(x[1]))

In [358]:
all_var_dict

[('longitude', 5.9574347861529381e-10),
 ('latitude', 2.3554716441613366e-09),
 ('lotsizesquarefeet', -3.0232171582229489e-09),
 ('landtaxvaluedollarcnt', -3.4686903404659436e-09),
 ('taxamount', 1.3394342212159957e-07),
 ('calculatedfinishedsquarefeet', 4.1696609357659166e-07),
 ('yearbuilt', -2.6204821796648278e-05)]

In [359]:
all_var = [var for var, _ in all_var_dict]
all_var.insert(len(all_var), 'logerror')
all_var

['longitude',
 'latitude',
 'lotsizesquarefeet',
 'landtaxvaluedollarcnt',
 'taxamount',
 'calculatedfinishedsquarefeet',
 'yearbuilt',
 'logerror']

In [336]:
joined_after_imputation[all_var].to_csv('data/train-ml_v3.csv', index=False)

## Persist model

In [160]:
from sklearn.externals import joblib

In [161]:
joblib.dump(lasso_reg,'data/lass_reg.pkl')

['data/lass_reg.pkl']

## Submission

In [162]:
def load_data_for_submission(path_to_data='data'):
	logging.info('Reading Properties 2016...')
	prop_2016 = pd.read_csv('{}/properties_2016.csv'.format(path_to_data))
	
	logging.info('Reading Properties 2017...')
	prop_2017 = pd.read_csv('{}/properties_2017.csv'.format(path_to_data))
    
	properties = pd.concat([prop_2016,prop_2017], axis = 0)
    
	return properties

In [164]:
drop_cols.remove('parcelid')

In [165]:
properties = load_data_for_submission()
properties = drop_columns(properties, drop_cols)


2017-10-16 22:56:13,717 Reading Properties 2016...
  if self.run_code(code, result):
2017-10-16 22:56:36,089 Reading Properties 2017...
  if self.run_code(code, result):


In [198]:
properties.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,bathroomcnt,bedroomcnt,buildingqualitytypeid,decktypeid,calculatedfinishedsquarefeet,fips,fullbathcnt,...,propertycountylandusecode,propertylandusetypeid,roomcnt,typeconstructiontypeid,unitcnt,yearbuilt,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear
0,10754147,,,0.0,0.0,,,,6037.0,,...,010D,269.0,0.0,,,,9.0,,,
1,10759547,,,0.0,0.0,,,,6037.0,,...,0109,261.0,0.0,,,,27516.0,,,
2,10843547,,,0.0,0.0,,,73026.0,6037.0,,...,1200,47.0,0.0,,2.0,,762631.0,20800.37,,
3,10859147,,,0.0,0.0,7.0,,5068.0,6037.0,,...,1200,47.0,0.0,,,1948.0,585488.0,14557.57,,
4,10879947,,,0.0,0.0,,,1776.0,6037.0,,...,1210,31.0,0.0,,1.0,1947.0,239695.0,5725.17,,


In [172]:
len(properties)

5887958

In [173]:
properties_numerical = impute_numerical_var(properties, numeric_cols,imputations_numeric)

2017-10-16 22:58:27,725 Filling numeric NAs
2017-10-16 22:58:29,904 Filling NA: bedroomcnt
2017-10-16 22:58:30,240 Filling NA: unitcnt
2017-10-16 22:58:30,287 Filling NA: taxamount
2017-10-16 22:58:30,333 Filling NA: longitude
2017-10-16 22:58:30,377 Filling NA: roomcnt
2017-10-16 22:58:30,424 Filling NA: landtaxvaluedollarcnt
2017-10-16 22:58:30,470 Filling NA: taxdelinquencyyear
2017-10-16 22:58:30,516 Filling NA: fullbathcnt
2017-10-16 22:58:30,561 Filling NA: lotsizesquarefeet
2017-10-16 22:58:30,608 Filling NA: yearbuilt
2017-10-16 22:58:30,655 Filling NA: latitude
2017-10-16 22:58:30,700 Filling NA: calculatedfinishedsquarefeet
2017-10-16 22:58:30,744 Filling NA: bathroomcnt


In [212]:
len(properties_numerical)

5887958

In [213]:
properties_numerical.isnull().sum()

bedroomcnt                      0
unitcnt                         0
taxamount                       0
longitude                       0
roomcnt                         0
landtaxvaluedollarcnt           0
taxdelinquencyyear              0
fullbathcnt                     0
lotsizesquarefeet               0
yearbuilt                       0
latitude                        0
calculatedfinishedsquarefeet    0
bathroomcnt                     0
dtype: int64

In [176]:
properties_numerical.head()

Unnamed: 0,bedroomcnt,unitcnt,taxamount,longitude,roomcnt,landtaxvaluedollarcnt,taxdelinquencyyear,fullbathcnt,lotsizesquarefeet,yearbuilt,latitude,calculatedfinishedsquarefeet,bathroomcnt
0,0.0,1.0,4501.080078,-118654084.0,0.0,9.0,14.0,2.0,85768.0,1970.0,34144442.0,1541.0,0.0
1,0.0,1.0,4501.080078,-118625364.0,0.0,27516.0,14.0,2.0,4083.0,1970.0,34140430.0,1541.0,0.0
2,0.0,2.0,20800.37,-118394633.0,0.0,762631.0,14.0,2.0,63085.0,1970.0,33989359.0,73026.0,0.0
3,0.0,1.0,14557.57,-118437206.0,0.0,585488.0,14.0,2.0,7521.0,1948.0,34148863.0,5068.0,0.0
4,0.0,1.0,5725.17,-118385816.0,0.0,239695.0,14.0,2.0,8512.0,1947.0,34194168.0,1776.0,0.0


In [177]:
properties_categorical = impute_categorical_var(properties, categorical_cols,imputations_categorical)

5887958


2017-10-16 23:00:01,320 Filling NA: pooltypeid10
2017-10-16 23:00:01,685 Filling NA: taxdelinquencyflag
2017-10-16 23:00:02,066 Filling NA: pooltypeid7
2017-10-16 23:00:02,410 Filling NA: propertycountylandusecode
2017-10-16 23:00:02,663 Filling NA: typeconstructiontypeid
2017-10-16 23:00:03,012 Filling NA: hashottuborspa
2017-10-16 23:00:03,339 Filling NA: buildingqualitytypeid
2017-10-16 23:00:03,679 Filling NA: heatingorsystemtypeid
2017-10-16 23:00:04,052 Filling NA: airconditioningtypeid
2017-10-16 23:00:04,599 Filling NA: decktypeid
2017-10-16 23:00:05,087 Filling NA: fips
2017-10-16 23:00:05,403 Filling NA: architecturalstyletypeid
2017-10-16 23:00:05,766 Filling NA: propertylandusetypeid
2017-10-16 23:00:06,095 Filling NA: pooltypeid2


In [178]:
properties_categorical.isnull().sum()

pooltypeid10                 0
taxdelinquencyflag           0
pooltypeid7                  0
propertycountylandusecode    0
typeconstructiontypeid       0
hashottuborspa               0
buildingqualitytypeid        0
heatingorsystemtypeid        0
airconditioningtypeid        0
decktypeid                   0
fips                         0
architecturalstyletypeid     0
propertylandusetypeid        0
pooltypeid2                  0
dtype: int64

In [214]:
len(properties_categorical)

5887958

In [215]:
cat_dummies = pd.get_dummies(properties_categorical, drop_first=True)

In [226]:
joined_after_ = pd.concat([properties_numerical,cat_dummies], axis=1)

In [255]:
joined_after_[all_vars].shape

(5887958, 156)

In [228]:
joined_after_['transaction_mth'] = 10

In [229]:
joined_after_['transaction_yr'] = 2016

In [230]:
joined_after_['transaction_day_of_wk'] = 4

In [337]:
def make_prediction(yr, mth, day=4):
    joined_after_['transaction_mth'] = mth
    joined_after_['transaction_yr'] = yr
    joined_after_['transaction_day_of_wk'] = day
    return lasso_reg.predict(joined_after_[all_vars].values)

In [263]:
preds= ['201610','201611','201612','201710','201711','201712']

In [272]:
int(preds[1][:4]),int(preds[1][-2:])

(2016, 11)

In [273]:
prediction_vals = {}

In [360]:
for pred in preds:
    prediction_vals[pred]=make_prediction(yr=int(pred[:4]),mth=int(pred[-2:]))

2017-10-17 00:43:05,307 Internal Python error in the inspect module.
Below is the traceback from this internal error.

2017-10-17 00:43:05,347 
Unfortunately, your original traceback can not be constructed.



Traceback (most recent call last):
  File "/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-360-308c4626f73e>", line 2, in <module>
    prediction_vals[pred]=make_prediction(yr=int(pred[:4]),mth=int(pred[-2:]))
  File "<ipython-input-337-dc717427112b>", line 5, in make_prediction
    return lasso_reg.predict(joined_after_[all_vars].values)
  File "/anaconda/lib/python3.6/site-packages/pandas/core/frame.py", line 2056, in __getitem__
    return self._getitem_array(key)
  File "/anaconda/lib/python3.6/site-packages/pandas/core/frame.py", line 2101, in _getitem_array
    return self.take(indexer, axis=1, convert=True)
  File "/anaconda/lib/python3.6/site-packages/pandas/core/generic.py", line 1817, in take
    convert=True, verify=True)
  File "/anaconda/lib/python3.6/site-packages/pandas/core/internals.py", line 4011, in take
    axis=axis, allow_dups=True)
  File "

KeyboardInterrupt: 

In [339]:
sub=pd.DataFrame.from_dict(prediction_vals)

In [340]:
sub['ParcelId'] = pd.Series([item for item in properties.parcelid])

In [341]:
sub=sub[['ParcelId','201610','201611','201612','201710','201711','201712']]

In [342]:
len(sub)

5887958

In [343]:
sample = pd.read_csv('data/sample_submission.csv')

In [344]:
len(sample)

2985217

In [348]:
sub2=sub.drop_duplicates('ParcelId')

In [349]:
len(sub2)

2985217

In [350]:
sub2.to_csv('data/zillow_submission2.csv', index=False)

In [324]:
df = pd.read_csv('data/zillow_submission.csv')

In [325]:
df.head()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
0,10754147,0.011427,0.011828,0.01223,0.016818,0.017219,0.017621
1,10759547,0.011546,0.011948,0.012349,0.016937,0.017339,0.01774
2,10843547,0.013009,0.013411,0.013812,0.0184,0.018802,0.019203
3,10859147,0.0126,0.013002,0.013403,0.017991,0.018393,0.018794
4,10879947,0.012289,0.01269,0.013092,0.01768,0.018081,0.018483


In [326]:
len(df)

2985217