## Environment

In [94]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

In [142]:
SEED = 32
np.random.seed(SEED)
TARGET_COL = "num_price"

## Load inputs

In [126]:
df = pd.read_parquet("./.data/features")
df = df.drop("id", axis=1)
print(df.columns)
print(df.shape)

Index(['num_price', 'cat_building_type', 'cat_quality_state', 'cat_ownership',
       'has_elevator', 'num_area', 'cat_neighborhood', 'has_furniture',
       'has_parking', 'num_terrace_area', 'num_balcony_area',
       'num_storage_area', 'num_cellar_area', 'has_barrierfree', 'has_garage',
       'has_swimming_poool', 'cat_locality_district', 'cat_watersupply',
       'cat_wasteamenities', 'cat_energy_class', 'cat_floor', 'cat_planning',
       'cat_apartment_type', 'cat_district', 'cat_locality'],
      dtype='object')
(5547, 25)


## Drop outliers in target

In [127]:
# use the 1.5 * IQR rule
q1  = np.quantile(df[TARGET_COL], 0.25)
q3  = np.quantile(df[TARGET_COL], 0.75)
iqr = (q3 - q1) * 1.5

# find outliers
cond     = "(df[TARGET_COL] <= q1 - iqr) | (df[TARGET_COL] >= q3 + iqr)"
outliers = df.loc[eval(cond), ["num_price", "num_area"]]

print(f"IQR-based MIN outlier price: {outliers[TARGET_COL].min()}")
print(f"IQR-based MAX outlier price: {outliers[TARGET_COL].max()}")
print(f"IQR-based num of outlier: {outliers.shape[0]}")

# drop the outliers
df = df.loc[~eval(cond), :]

IQR-based MIN outlier price: 17.212324
IQR-based MAX outlier price: 129.0
IQR-based num of outlier: 379


## Feature encoding

In [128]:
df.head()

Unnamed: 0,num_price,cat_building_type,cat_quality_state,cat_ownership,has_elevator,num_area,cat_neighborhood,has_furniture,has_parking,num_terrace_area,...,has_swimming_poool,cat_locality_district,cat_watersupply,cat_wasteamenities,cat_energy_class,cat_floor,cat_planning,cat_apartment_type,cat_district,cat_locality
1,13.1,smisena,velmi_dobry,osobni,0,83.0,<na>,1,0,0.0,...,0,5007,dalkovy_vodovod,verejna_kanalizace,trida_c,4,3+kk,(basic),<na>,holesovice
2,6.8,cihlova,novostavba,osobni,1,57.0,klidna_cast_obce,1,0,0.0,...,0,5010,dalkovy_vodovod,verejna_kanalizace,trida_b,2,2+kk,(basic),10,hostivar
3,14.15,smisena,novostavba,osobni,0,92.0,<na>,0,1,0.0,...,0,5005,<na>,<na>,trida_b,1,4+kk,(basic),5,hlubocepy
4,7.25,panelova,velmi_dobry,osobni,0,54.0,<na>,0,0,0.0,...,0,5004,dalkovy_vodovod,verejna_kanalizace,trida_g,4,3+kk,(basic),4,krc
5,9.299,cihlova,velmi_dobry,osobni,1,77.0,centrum_obce,1,0,0.0,...,0,5010,<na>,verejna_kanalizace,trida_c,13,3+kk,(basic),10,strasnice


In [129]:
# find high-cardinality cat features
# these features are going to be encoded based on target value
# it is important to perform this encoding for each split to prevent information leak
high_cardinality_features = []
for col in [col for col in df.columns if "cat" in col]:
    n_levels = df[col].nunique()
    print(f"Num levels {col}: {n_levels}")
    if n_levels > 50: high_cardinality_features.append(col)
print(f"High cardinality features: {high_cardinality_features}")

def target_encode(df, feature, stats=None):
    """Encodes a categorical feature as the mean value of the target in each level."""
    encoded_feature_name = f"{feature}_encoded"
    if stats is None:
        stats = df.copy()
        stats["price_per_meter"] = df["num_price"] / df["num_area"]
        stats = stats.groupby(feature, as_index=False) \
        .agg({"price_per_meter": "mean"})
        stats = stats.rename({"price_per_meter": encoded_feature_name}, axis=1)
    df = df.merge(stats, on=feature).drop(feature , axis=1)
    df = df.rename({encoded_feature_name: f"num_{feature}"}, axis=1)
    return df, stats

def target_encode_all(train_df, valid_df):
    """Applies target encoding to all high-card. features."""
    for feature in high_cardinality_features:
        train_df, stats = target_encode(train_df, feature, None)
        valid_df, _ = target_encode(valid_df, feature, stats)
    return train_df, valid_df

Num levels cat_building_type: 6
Num levels cat_quality_state: 9
Num levels cat_ownership: 3
Num levels cat_neighborhood: 6
Num levels cat_locality_district: 11
Num levels cat_watersupply: 6
Num levels cat_wasteamenities: 6
Num levels cat_energy_class: 8
Num levels cat_floor: 23
Num levels cat_planning: 11
Num levels cat_apartment_type: 4
Num levels cat_district: 11
Num levels cat_locality: 100
High cardinality features: ['cat_locality']


In [170]:
# one-hot encode categorical features (excluding high-cardinality ones)
one_hot_features = [col for col in df.columns if "cat" in col] # and col not in high_cardinality_features]
one_hot_df = df.loc[:, one_hot_features]
one_hot_df = pd.get_dummies(one_hot_df )
df = df.drop(one_hot_features, axis=1  )
one_hot_df.index = df.index
df = pd.concat([df, one_hot_df], axis=1)

## Prepare test and K-fold splits

In [171]:
# make index and shuffle to ensure rand
idx = np.arange(df.shape[0])
np.random.shuffle(idx)

# define set sizes
test_num_obs  = 750

# make test set df
test_idx = idx[:test_num_obs ]
test_df = df.iloc[test_idx, :]
idx = idx[test_num_obs:]

# instantiate KFold splitter
folds = KFold(n_splits=5, shuffle=True)
folds = folds.split(idx)

# materialize fold dfs, apply target encoding
train_dfs = []
valid_dfs = []
for idx, (train_idx, valid_idx) in enumerate(folds):
    
    # subset train-valid X
    X_train = df.iloc[train_idx, :]
    X_valid = df.iloc[valid_idx, :]

    # target-encode
    # X_train, X_valid = target_encode_all(X_train, X_valid)

    # pop target
    y_train = X_train.pop(TARGET_COL)
    y_valid = X_valid.pop(TARGET_COL)

    # dump
    train_dfs.append((X_train, y_train))
    valid_dfs.append((X_valid, y_valid))

    # check target stats in splits
    y_train_min, y_train_avg, y_train_max = np.min(y_train), np.mean(y_train), np.max(y_train)
    y_valid_min, y_valid_avg, y_valid_max = np.min(y_valid), np.mean(y_valid), np.max(y_valid)

    # report split stats
    print(f"Fold {idx} X shapes:", X_train.shape, X_valid.shape)
    print(f"Fold {idx} y train min, mean, max: {y_train_min:.2f} | {y_train_avg:.2f} | {y_train_max:.2f}")
    print(f"Fold {idx} y valid min, mean, max: {y_valid_min:.2f} | {y_valid_avg:.2f} | {y_valid_max:.2f}")
    print("\n")
    

Fold 0 X shapes: (3534, 215) (884, 215)
Fold 0 y train min, mean, max: 0.65 | 7.80 | 17.20
Fold 0 y valid min, mean, max: 2.59 | 7.91 | 17.16


Fold 1 X shapes: (3534, 215) (884, 215)
Fold 1 y train min, mean, max: 1.60 | 7.86 | 17.20
Fold 1 y valid min, mean, max: 0.65 | 7.68 | 17.00


Fold 2 X shapes: (3534, 215) (884, 215)
Fold 2 y train min, mean, max: 0.65 | 7.82 | 17.16
Fold 2 y valid min, mean, max: 1.60 | 7.85 | 17.20


Fold 3 X shapes: (3535, 215) (883, 215)
Fold 3 y train min, mean, max: 0.65 | 7.80 | 17.20
Fold 3 y valid min, mean, max: 2.49 | 7.91 | 17.02


Fold 4 X shapes: (3535, 215) (883, 215)
Fold 4 y train min, mean, max: 0.65 | 7.84 | 17.20
Fold 4 y valid min, mean, max: 1.70 | 7.77 | 16.98




## Train a baseline model

In [173]:
train_dfs[0][0].columns

Index(['has_elevator', 'num_area', 'has_furniture', 'has_parking',
       'num_terrace_area', 'num_balcony_area', 'num_storage_area',
       'num_cellar_area', 'has_barrierfree', 'has_garage',
       ...
       'cat_locality_vinohrady', 'cat_locality_vinor', 'cat_locality_vokovice',
       'cat_locality_vrsovice', 'cat_locality_vysehrad',
       'cat_locality_vysocany', 'cat_locality_zabehlice',
       'cat_locality_zbraslav', 'cat_locality_zizkov', 'cat_locality_zlicin'],
      dtype='object', length=215)

In [174]:
def objective(params, train_dfs, valid_dfs, feature_subset=None):
    """Trains a model with given params, reports training and validation loss."""

    train_loss_vals = []
    valid_loss_vals = []

    def loss_func(y_true, y_pred):
        """Computes Mean Absolute Error."""
        return np.mean(np.abs(y_true - y_pred))

    for (X_train, y_train), (X_valid, y_valid) in zip(train_dfs, valid_dfs):

        if feature_subset:
            X_train = X_train.loc[:, feature_subset]
            X_valid = X_valid.loc[:, feature_subset]

        model = RandomForestRegressor(**params).fit(X_train, y_train)

        h_train = model.predict(X_train)
        h_valid = model.predict(X_valid)

        train_loss_vals.append(loss_func(y_train, h_train))
        valid_loss_vals.append(loss_func(y_valid, h_valid))
    
    print(f"Model train loss: {np.mean(train_loss_vals)}" )
    print(f"Model valid loss: {np.mean(valid_loss_vals)}" )

    # refit model on full training data & report feature importance
    X_train_full = pd.concat((train_dfs[0][0], valid_dfs[0][0]), axis=0, ignore_index=True)
    y_train_full = pd.concat((train_dfs[0][1], valid_dfs[0][1]))
    model = RandomForestRegressor(**params).fit(X_train_full, y_train_full)
    feature_importances = pd.DataFrame(
        {
            "imp_score": model.feature_importances_,
            "feature": X_train_full.columns, 
        }
    )

    return feature_importances

In [176]:
params = {
    "min_samples_split": 10  , # regularization param
    "n_estimators"     : 50  , # capacity param
    "max_features"     : 0.80, # regularization param
    "max_samples"      : 0.80, # regularization param
    "random_state"     : SEED,  
    "n_jobs": -1,
}

# feature_subset = [col for col in train_dfs[0][0].columns if "has" in col]
# feature_subset = feature_subset + ["num_area", "num_cat_locality"]
feature_imps   = objective(params, train_dfs, valid_dfs) #, feature_subset )

Model train loss: 0.5932976539320615
Model valid loss: 0.9553371403944171


In [179]:
feature_imps.sort_values("imp_score", ascending=False)

Unnamed: 0,imp_score,feature
1,0.601527,num_area
14,0.092087,cat_building_type_panelova
94,0.029389,cat_planning_3+kk
90,0.022379,cat_planning_1+kk
60,0.015389,cat_energy_class_trida_b
...,...,...
22,0.000000,cat_quality_state_spatny
204,0.000000,cat_locality_velka_chuchle
170,0.000000,cat_locality_nebusice
184,0.000000,cat_locality_satalice
