In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [2]:
dir_path = os.path.join("..", "data", "raw")
dir_to_save = os.path.join("..", "data", "interim")
building_metadata = "building_metadata.csv"

In [3]:
metadata = pd.read_csv(os.path.join(dir_path, building_metadata))
original = metadata
metadata.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [4]:
metadata.shape

(1449, 6)

In [5]:
metadata = metadata.drop(['floor_count', 'primary_use'], axis = 1)

# Transform year_built to age of the building

In [6]:
metadata.loc[:, 'year_built'] = 2017 - metadata.loc[:, 'year_built']


## Generate a controlling train and validation set (no need for a test set) 

- Drop primary_use column (as of now)
- Delete all existing NaN values
- Set seed and delete random values from floor_count and year_built separately, thus gaining a more distributed deletion of the values

In [7]:
c_train = metadata
c_train.head()

Unnamed: 0,site_id,building_id,square_feet,year_built
0,0,0,7432,9.0
1,0,1,2720,13.0
2,0,2,5376,26.0
3,0,3,23685,15.0
4,0,4,116607,42.0


In [8]:
c_train['year_built'].isna().sum()

774

In [9]:
c_train.shape

(1449, 4)

In [10]:
c_train = c_train[np.isfinite(c_train['year_built'])]

- Check if missing values are droped

In [11]:
c_train.shape

(675, 4)

In [12]:
c_train['year_built'].isna().sum()

0

In [13]:
c_valid = metadata
c_valid = c_valid[np.isfinite(c_valid['year_built'])]
c_valid.head()

Unnamed: 0,site_id,building_id,square_feet,year_built
0,0,0,7432,9.0
1,0,1,2720,13.0
2,0,2,5376,26.0
3,0,3,23685,15.0
4,0,4,116607,42.0


In [14]:
c_valid.shape

(675, 4)

- set seed
- set the number of rows to be droped from the dataframe
- TODO: find a better way of calculating replace_n, e.g. relatively to the number of the total rows in the dataframe and replace 10% with NaNs

In [15]:
np.random.seed(20)
replace_frac = 0.1 

- set values of year_built to NaN
- check the nr. of replaced values

In [16]:
sample_idx = np.random.randint(c_train.shape[0], size=int(c_train.shape[0]*replace_frac))
c_train.iloc[sample_idx, 3] = np.nan
c_train['year_built'].isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


64

In [17]:
c_train.shape

(675, 4)

In [18]:
assert c_train.shape == c_valid.shape

- Create list with different imputers

In [19]:
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]

In [20]:
score_imputer = pd.DataFrame()

In [21]:
N_SPLITS = 5

In [22]:
br_estimator = BayesianRidge()

# Save for later use 

In [23]:
#score_simple_imputer = pd.DataFrame()
#for strategy in ('mean', 'median'):
#    estimator = SimpleImputer(missing_values=np.nan, strategy=strategy)
#    
#    estimator.fit(c_train)
#    score_simple_imputer[strategy] = cross_val_score(
#        estimator, c_train, c_valid, scoring='neg_mean_squared_error',
#        cv=N_SPLITS
#    )

In [24]:
#score_iterat_imputer = pd.DataFrame()
#for impute_estimator in estimators:
#    estimator = make_pipeline(
#        IterativeImputer(random_state=0, estimator=impute_estimator),
#        br_estimator
#    )
#    score_iterat_imputer[impute_estimator.__class__.__name__] = \
#        cross_val_score(
#            estimator, c_train, c_valid, scoring='neg_mean_squared_error',
#            cv=N_SPLITS
#        )

In [25]:
#scores = pd.concat(
#    [score_simple_imputer, score_iterat_imputer],
#    keys=['SimpleImputer', 'IterativeImputer'], axis=1
#)


In [26]:
#scores.head()

# The best scoring algorithm

In [27]:
imp = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0), missing_values=np.nan, sample_posterior=False, 
                                 max_iter=100, tol=0.001, 
                                 n_nearest_features=4, initial_strategy='median')
imp.fit(c_train)

IterativeImputer(add_indicator=False,
                 estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse',
                                               max_depth=None,
                                               max_features='auto',
                                               max_leaf_nodes=None,
                                               min_impurity_decrease=0.0,
                                               min_impurity_split=None,
                                               min_samples_leaf=1,
                                               min_samples_split=2,
                                               min_weight_fraction_leaf=0.0,
                                               n_estimators=10, n_jobs=None,
                                               oob_score=False, random_state=0,
                                               verbose=0, warm_start=False),
                 imputation_order='ascending', initial_strategy='median',
                

In [28]:
imputed =pd.DataFrame(imp.transform(metadata), columns=['site_id', 'building_id', 'square_feet', 'age'],
                             dtype='int')

In [29]:
imputed.head()

Unnamed: 0,site_id,building_id,square_feet,age
0,0,0,7432,9
1,0,1,2720,13
2,0,2,5376,26
3,0,3,23685,15
4,0,4,116607,42


In [30]:
building_metadata_imputed = pd.DataFrame(columns=['site_id', 'building_id', 'primary_use', 'square_feet', 'age'])

In [31]:
building_metadata_imputed['site_id'] = imputed['site_id'].to_numpy()
building_metadata_imputed['building_id'] = imputed['site_id'].to_numpy()
building_metadata_imputed['primary_use'] = original['primary_use']
building_metadata_imputed['square_feet'] = imputed['square_feet'].to_numpy()
building_metadata_imputed['age'] = imputed['age'].to_numpy()

In [32]:
building_metadata_imputed.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,age
0,0,0,Education,7432,9
1,0,0,Education,2720,13
2,0,0,Education,5376,26
3,0,0,Education,23685,15
4,0,0,Education,116607,42


In [33]:
building_metadata_imputed.to_csv(os.path.join(dir_to_save, r'building_metadata_imputed.csv'))