In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [None]:
# Name of the building metadata file and the relative directory holding it.
building_metadata = "building_metadata.csv"
raw_dir_parts = ("..", "data", "raw")
dir_path = os.path.join(*raw_dir_parts)

In [None]:
# Read the building metadata CSV into a DataFrame and preview the first rows.
metadata_path = os.path.join(dir_path, building_metadata)
metadata = pd.read_csv(metadata_path)
metadata.head()

In [None]:
# Dimensions of the raw metadata table as (rows, columns).
metadata.shape

## Generate a control train and validation set (no test set needed)

- Drop the primary_use column (for now)
- Delete all existing NaN values
- Set a seed and delete random values from floor_count and year_built separately, so that the deleted values are spread more evenly

In [None]:
# Start the training frame from the metadata without the primary_use column,
# then preview it.
c_train = metadata.drop(columns=['primary_use'])
c_train.head()

In [None]:
# Number of missing year_built entries before any rows are filtered out.
c_train['year_built'].isna().sum()

In [None]:
# Shape before the rows without a finite year_built are removed.
c_train.shape

In [None]:
# Keep only the rows whose year_built is finite (np.isfinite is False for NaN).
has_finite_year = np.isfinite(c_train['year_built'])
c_train = c_train[has_finite_year]

- Check that the missing values have been dropped

In [None]:
# Shape after the filter; compare the row count with the pre-filter shape.
c_train.shape

In [None]:
# Expect 0 here: the finite-value filter removes every NaN year_built.
c_train['year_built'].isna().sum()

In [None]:
# Build the validation frame the same way as c_train: drop primary_use, then
# keep only the rows with a finite year_built. Preview the result.
c_valid = (
    metadata
    .drop(columns=['primary_use'])
    .loc[lambda frame: np.isfinite(frame['year_built'])]
)
c_valid.head()

In [None]:
# Shape of the validation frame after the finite-year_built filter.
c_valid.shape

- Set the seed
- Set the number of rows to be dropped from the dataframe
- TODO: find a better way of calculating replace_n, e.g. relative to the total number of rows in the dataframe, replacing 10% with NaNs

In [None]:
# Fraction of the rows to sample when masking values with NaN.
replace_frac = 0.1
# Seed NumPy's global random generator so the sampled rows are reproducible.
np.random.seed(seed=20)

- Set randomly chosen values of year_built to NaN
- Check the number of replaced values

In [None]:
# Mask a random replace_frac share of the year_built values with NaN.

# Own the data first: c_train was produced by boolean filtering, and writing
# into such a slice with .iloc raises pandas' SettingWithCopyWarning.
c_train = c_train.copy()

# Sample distinct row positions. Drawing with replacement (np.random.randint)
# can pick the same row several times, so fewer than the requested number of
# values would end up masked.
n_rows = c_train.shape[0]
n_replace = int(n_rows * replace_frac)
sample_idx = np.random.choice(n_rows, size=n_replace, replace=False)

# Resolve the column position by name instead of relying on it being index 3.
year_built_pos = c_train.columns.get_loc('year_built')
c_train.iloc[sample_idx, year_built_pos] = np.nan

# Number of masked values; equals n_replace when no NaN was present before.
c_train['year_built'].isna().sum()

In [None]:
# Masking assigns NaN to existing cells, so the shape should be unchanged.
c_train.shape

In [None]:
# Sanity checks on the control sets before the imputers are scored.
# (The previous bare `assert()` tested an empty tuple, which is falsy, so the
# cell always raised AssertionError and stopped a full notebook run.)
n_masked = c_train['year_built'].isna().sum()
max_masked = int(c_train.shape[0] * replace_frac)

# At least one value must be masked, and never more than the requested share.
assert 0 < n_masked <= max_masked, (
    "unexpected number of masked year_built values: %d (allowed 1..%d)"
    % (n_masked, max_masked)
)
# c_valid holds the true values, so it must not contain any NaN year_built.
assert c_valid['year_built'].notna().all(), \
    "c_valid must not contain missing year_built values"
# Both frames come from the same filter and must stay row-aligned.
assert c_train.shape == c_valid.shape, \
    "c_train and c_valid must have the same shape"

- Create a list of the different estimators to use inside the iterative imputer

In [None]:
# Regressors that are each passed to IterativeImputer as its `estimator`.
estimators = [
    BayesianRidge(),
    DecisionTreeRegressor(
        max_features='sqrt',
        random_state=0,
    ),
    ExtraTreesRegressor(
        n_estimators=10,
        random_state=0,
    ),
    KNeighborsRegressor(
        n_neighbors=15,
    ),
]

In [None]:
# Empty frame for imputer scores.
# NOTE(review): no later cell visible here reads score_imputer - confirm it is
# still needed.
score_imputer = pd.DataFrame()

In [None]:
# Number of cross-validation folds passed as `cv` to cross_val_score.
N_SPLITS = 5

In [None]:
# Final regressor appended after the imputer in every scoring pipeline.
br_estimator = BayesianRidge()

In [None]:
# Baseline: score SimpleImputer (mean / median) followed by BayesianRidge.
# The result has one column per strategy and one row per fold.
#
# The target is the untouched year_built column of c_valid. BayesianRidge fits
# a single 1-D target, so the whole multi-column c_valid frame cannot be
# passed as y.
# NOTE(review): assumes year_built is the intended target because it is the
# column that was masked in c_train - confirm.
score_simple_imputer = pd.DataFrame()
for strategy in ('mean', 'median'):
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy),
        br_estimator
    )
    score_simple_imputer[strategy] = cross_val_score(
        estimator, c_train, c_valid['year_built'],
        scoring='neg_mean_squared_error',
        # Same fold count as the IterativeImputer scoring, so both score
        # tables have the same number of rows.
        cv=N_SPLITS
    )

In [None]:
# Score IterativeImputer with each candidate estimator, followed by
# BayesianRidge. The result has one column per estimator class name and one
# row per fold.
#
# The target is the untouched year_built column of c_valid. BayesianRidge fits
# a single 1-D target, so the whole multi-column c_valid frame cannot be
# passed as y.
# NOTE(review): assumes year_built is the intended target because it is the
# column that was masked in c_train - confirm.
score_iterat_imputer = pd.DataFrame()
for impute_estimator in estimators:
    estimator = make_pipeline(
        IterativeImputer(random_state=0, estimator=impute_estimator),
        br_estimator
    )
    estimator_name = impute_estimator.__class__.__name__
    score_iterat_imputer[estimator_name] = cross_val_score(
        estimator, c_train, c_valid['year_built'],
        scoring='neg_mean_squared_error',
        cv=N_SPLITS
    )

In [None]:
# Put both score tables side by side; `keys` adds an outer column level that
# labels which imputer family each column belongs to.
scores = pd.concat(
    objs=[score_simple_imputer, score_iterat_imputer],
    axis=1,
    keys=['SimpleImputer', 'IterativeImputer'],
)


In [None]:
# Preview the combined scores (negated MSE, so values closer to 0 are better).
scores.head()

In [None]:
#imp = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0), missing_values=np.nan, sample_posterior=False, 
#                                 max_iter=100, tol=0.001, 
#                                 n_nearest_features=4, initial_strategy='median')
#imp.fit(c_train)

In [None]:
#imputed =pd.DataFrame(imp.transform(c_train)) 

In [None]:
#imputed.head

In [None]:
#imputed.to_csv('output.csv')