In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [2]:
dir_path = os.path.join("..","data", "raw")
building_metadata = "building_metadata.csv"

In [3]:
metadata = pd.read_csv(os.path.join(dir_path, building_metadata))
metadata.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [4]:
metadata.shape

(1449, 6)

## Generate a controlling train and validation set (no need for a test set) 

- Drop primary_use column (as of now)
- Delete all existing NaN values
- Set seed and delete random values from floor_count and year_built separately, thus gaining a more distributed deletion of the values

In [5]:
c_train = metadata.drop(['primary_use'], axis = 1)
c_train.head()

Unnamed: 0,site_id,building_id,square_feet,year_built,floor_count
0,0,0,7432,2008.0,
1,0,1,2720,2004.0,
2,0,2,5376,1991.0,
3,0,3,23685,2002.0,
4,0,4,116607,1975.0,


In [6]:
c_train['year_built'].isna().sum()

774

In [7]:
c_train.shape

(1449, 5)

In [8]:
c_train = c_train[np.isfinite(c_train['year_built'])]

- Check if missing values are droped

In [9]:
c_train.shape

(675, 5)

In [10]:
c_train['year_built'].isna().sum()

0

In [11]:
c_valid = metadata.drop(['primary_use'], axis = 1)
c_valid.head()

Unnamed: 0,site_id,building_id,square_feet,year_built,floor_count
0,0,0,7432,2008.0,
1,0,1,2720,2004.0,
2,0,2,5376,1991.0,
3,0,3,23685,2002.0,
4,0,4,116607,1975.0,


In [12]:
c_valid.shape

(1449, 5)

- set seed
- set the number of rows to be droped from the dataframe
- TODO: find a better way of calculating replace_n, e.g. relatively to the number of the total rows in the dataframe and replace 10% with NaNs

In [13]:
np.random.seed(20)
replace_frac = 0.1 

- set values of year_built to NaN
- check the nr. of replaced values

In [15]:
sample_idx = np.random.randint(c_train.shape[0], size=int(c_train.shape[0]*replace_frac))
c_train.iloc[sample_idx, 3] = np.nan
c_train['year_built'].isna().sum()

64

In [16]:
c_train.shape

(675, 5)

- Create list with different imputers

In [None]:
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(max_features='sqrt', random_state=0),
    ExtraTreesRegressor(n_estimators=10, random_state=0),
    KNeighborsRegressor(n_neighbors=15)
]

In [None]:
score_imputer = pd.DataFrame()

In [None]:
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator),
        br_estimator
    )
    score_imputer[impute_estimator.__class__.__name__] = \
        cross_val_score(
            estimator, c_train, c_valid, scoring='neg_mean_squared_error',
            cv=N_SPLITS
        )

In [None]:
#scores = pd.concat(
#    [score_full_data, score_simple_imputer, score_iterative_imputer],
#    keys=['Original', 'SimpleImputer', 'IterativeImputer'], axis=1
#)

In [None]:
#imp = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0), missing_values=np.nan, sample_posterior=False, 
#                                 max_iter=100, tol=0.001, 
#                                 n_nearest_features=4, initial_strategy='median')
#imp.fit(c_train)

In [None]:
#imputed =pd.DataFrame(imp.transform(c_train)) 

In [None]:
#imputed.head

In [None]:
#imputed.to_csv('output.csv')