## Environment

In [204]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

In [205]:
SEED = 32
np.random.seed(SEED)
TARGET_COL = "num_price"

## Load inputs

In [206]:
df = pd.read_parquet("./.data/features")
df = df.drop("id", axis=1)
print(df.columns)
print(df.shape)

Index(['num_price', 'cat_building_type', 'cat_quality_state', 'cat_ownership',
       'has_elevator', 'num_area', 'cat_neighborhood', 'has_furniture',
       'has_parking', 'num_terrace_area', 'num_balcony_area',
       'num_storage_area', 'num_cellar_area', 'has_barrierfree', 'has_garage',
       'has_swimming_poool', 'cat_locality_district', 'cat_watersupply',
       'cat_wasteamenities', 'cat_energy_class', 'cat_floor', 'cat_planning',
       'cat_apartment_type', 'cat_district', 'cat_locality'],
      dtype='object')
(5547, 25)


## Drop outliers in target

In [207]:
# use the 1.5 * IQR rule
q1  = np.quantile(df[TARGET_COL], 0.25)
q3  = np.quantile(df[TARGET_COL], 0.75)
iqr = (q3 - q1) * 1.5

# find outliers
cond     = "(df[TARGET_COL] <= q1 - iqr) | (df[TARGET_COL] >= q3 + iqr)"
outliers = df.loc[eval(cond), ["num_price", "num_area"]]

print(f"IQR-based MIN outlier price: {outliers[TARGET_COL].min()}")
print(f"IQR-based MAX outlier price: {outliers[TARGET_COL].max()}")
print(f"IQR-based num of outlier: {outliers.shape[0]}")

# drop the outliers
df = df.loc[~eval(cond), :]

IQR-based MIN outlier price: 17.212324
IQR-based MAX outlier price: 129.0
IQR-based num of outlier: 379


## Feature encoding

In [208]:
# one-hot encode categorical features
one_hot_features = [col for col in df.columns if "cat" in col]
one_hot_df = df.loc[:, one_hot_features]
one_hot_df = pd.get_dummies(one_hot_df )
df = df.drop(one_hot_features, axis=1  )
one_hot_df.index = df.index
df = pd.concat([df, one_hot_df], axis=1)
df.head()

Unnamed: 0,num_price,has_elevator,num_area,has_furniture,has_parking,num_terrace_area,num_balcony_area,num_storage_area,num_cellar_area,has_barrierfree,...,cat_locality_vinohrady,cat_locality_vinor,cat_locality_vokovice,cat_locality_vrsovice,cat_locality_vysehrad,cat_locality_vysocany,cat_locality_zabehlice,cat_locality_zbraslav,cat_locality_zizkov,cat_locality_zlicin
1,13.1,0,83.0,1,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,6.8,1,57.0,1,0,0.0,6.0,0.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0
3,14.15,0,92.0,0,1,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,7.25,0,54.0,0,0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
5,9.299,1,77.0,1,0,0.0,0.0,0.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0


## Prepare test and K-fold splits

In [212]:
# make index and shuffle to ensure rand
idx = np.arange(df.shape[0])
np.random.shuffle(idx)

# make test set df
test_num_obs = 750
test_idx     = idx[:test_num_obs ]
test_df      = df.iloc[test_idx,:]
idx          = idx[test_num_obs :]

In [214]:
# materialize fold dfs
train_dfs = []
valid_dfs = []
folds = KFold(n_splits=5, shuffle=True).split(idx )
for fold_idx, (train_idx, valid_idx) in enumerate(folds):
    
    # subset train-valid sets
    X_train = df.iloc[train_idx, :]  ; X_valid = df.iloc[valid_idx, :]
    y_train = X_train.pop(TARGET_COL); y_valid = X_valid.pop(TARGET_COL)
    train_dfs.append((X_train, y_train)); valid_dfs.append((X_valid, y_valid))

    # check target stats in splits
    y_train_min, y_train_avg, y_train_max = np.min(y_train), np.mean(y_train), np.max(y_train)
    y_valid_min, y_valid_avg, y_valid_max = np.min(y_valid), np.mean(y_valid), np.max(y_valid)

    # report split stats
    print(f"Fold {fold_idx} X shapes:", X_train.shape, X_valid.shape)
    print(f"Fold {fold_idx} y train min, mean, max: {y_train_min:.2f} | {y_train_avg:.2f} | {y_train_max:.2f}")
    print(f"Fold {fold_idx} y valid min, mean, max: {y_valid_min:.2f} | {y_valid_avg:.2f} | {y_valid_max:.2f}")

Fold 0 X shapes: (3534, 215) (884, 215)
Fold 0 y train min, mean, max: 1.60 | 7.78 | 17.20
Fold 0 y valid min, mean, max: 0.65 | 7.99 | 17.12
Fold 1 X shapes: (3534, 215) (884, 215)
Fold 1 y train min, mean, max: 0.65 | 7.83 | 17.20
Fold 1 y valid min, mean, max: 2.49 | 7.79 | 17.02
Fold 2 X shapes: (3534, 215) (884, 215)
Fold 2 y train min, mean, max: 0.65 | 7.88 | 17.16
Fold 2 y valid min, mean, max: 1.60 | 7.58 | 17.20
Fold 3 X shapes: (3535, 215) (883, 215)
Fold 3 y train min, mean, max: 0.65 | 7.79 | 17.20
Fold 3 y valid min, mean, max: 2.69 | 7.95 | 17.16
Fold 4 X shapes: (3535, 215) (883, 215)
Fold 4 y train min, mean, max: 0.65 | 7.83 | 17.20
Fold 4 y valid min, mean, max: 2.59 | 7.80 | 17.11


## Train a baseline model

In [215]:
def baseline_model(params, train_dfs, valid_dfs):
    """Trains a model with given params, reports training and validation loss."""

    train_loss_vals = []
    valid_loss_vals = []

    # define loss func
    def loss_func(y_true, y_pred):
        """Computes Mean Absolute Error."""
        return np.mean(np.abs(y_true - y_pred))

    # iterate over training and validation folds
    for (X_train, y_train), (X_valid, y_valid) in zip(train_dfs, valid_dfs):

        # train a model
        model = RandomForestRegressor(**params).fit(X_train, y_train)

        # compute loss vals
        h_train = model.predict(X_train)
        h_valid = model.predict(X_valid)

        train_loss_vals.append(loss_func(y_train, h_train))
        valid_loss_vals.append(loss_func(y_valid, h_valid))
    
    # report mean loss across folds
    print(f"Model train loss: {np.mean(train_loss_vals)}" )
    print(f"Model valid loss: {np.mean(valid_loss_vals)}" )

    # refit model on full training data
    X_train_full = pd.concat((train_dfs[0][0], valid_dfs[0][0]), axis=0, ignore_index=True)
    y_train_full = pd.concat((train_dfs[0][1], valid_dfs[0][1]))

    # return model and feature importances
    model = RandomForestRegressor(**params).fit(X_train_full, y_train_full)
    feature_importances = pd.DataFrame(
        {
            "imp_score": model.feature_importances_,
            "feature": X_train_full.columns, 
        }
    )

    return model, feature_importances

In [216]:
params = {
    "min_samples_split": 10  , # regularization param
    "n_estimators"     : 50  , # capacity param
    "max_features"     : 0.80, # regularization param
    "max_samples"      : 0.80, # regularization param
    "random_state"     : SEED,  
    "n_jobs": -1,
}

model, features = baseline_model(params, train_dfs, valid_dfs)
features[:15].sort_values("imp_score", ascending=False)

Model train loss: 0.5946440753913789
Model valid loss: 0.9551035922021862


Unnamed: 0,imp_score,feature
1,0.608126,num_area
14,0.09238,cat_building_type_panelova
4,0.007403,num_terrace_area
7,0.005623,num_cellar_area
11,0.004564,cat_building_type_cihlova
5,0.004481,num_balcony_area
6,0.003398,num_storage_area
3,0.002413,has_parking
2,0.002166,has_furniture
8,0.002154,has_barrierfree
