In [1]:
import time

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_parquet("../dataset/etl/L3.regression_train.parquet")
df.head()

Unnamed: 0,price,sertifikat,hadap,konsep_dan_gaya_rumah,pemandangan,sumber_air,garasi,lebar_jalan,kondisi_properti,kondisi_perabotan,...,facility_internet,facility_lapangan,facility_laundry,facility_lemari_pakaian,facility_lemari_sepatu,facility_mezzanine,facility_musholla,facility_parkir,facility_shed,facility_water_tank
0,5300.0,SHM - Sertifikat Hak Milik,Selatan,American Classic,Pegunungan,PAM atau PDAM,0.0,4.0,furnished,unfurnished,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1200.0,HGB - Hak Guna Bangunan,Utara,Minimalis Modern,Pemukiman Warga,PAM atau PDAM,0.0,2.0,furnished,unfurnished,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4500.0,SHM - Sertifikat Hak Milik,Timur,Modern,Pemukiman Warga,PAM atau PDAM,1.0,2.0,furnished,unfurnished,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3190.0,SHM - Sertifikat Hak Milik,Timur,Modern,Pegunungan,PAM atau PDAM,0.0,3.0,unfurnished,unfurnished,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,380.0,HGB - Hak Guna Bangunan,Selatan,Minimalis,Pemukiman Warga,PAM atau PDAM,1.0,1.0,furnished,unfurnished,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18240 entries, 0 to 20145
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   price                    18240 non-null  float64
 1   sertifikat               18240 non-null  object 
 2   hadap                    18240 non-null  object 
 3   konsep_dan_gaya_rumah    18240 non-null  object 
 4   pemandangan              18240 non-null  object 
 5   sumber_air               18240 non-null  object 
 6   garasi                   18240 non-null  float64
 7   lebar_jalan              18240 non-null  float64
 8   kondisi_properti         18240 non-null  object 
 9   kondisi_perabotan        18240 non-null  object 
 10  ruang_makan              18240 non-null  float64
 11  tags_pedesaan            18240 non-null  float64
 12  floor_mat_ubin           18240 non-null  float64
 13  facility_air_pam         18240 non-null  float64
 14  facility_air_tanah       18

In [7]:
floor_mat_cols = [col for col in df.columns if col.startswith("floor_mat_")]
house_mat_cols = [col for col in df.columns if col.startswith("house_mat_")]
facility_cols = [col for col in df.columns if col.startswith("facility_")]
tags_cols = [col for col in df.columns if col.startswith("tags_")]

extra_tags = ["ruang_tamu", "ruang_makan", "terjangkau_internet", "hook"]
for tag in extra_tags:
    if tag in df.columns:
        tags_cols.append(tag)

cat_cols = [col for col in df.select_dtypes(include=["object"]).columns]
num_cols = list(set(df.columns) - set(floor_mat_cols + house_mat_cols + facility_cols + tags_cols + cat_cols + ["price"]))

In [8]:
X = df.drop(columns=["price"])
y = df["price"]

## Training Regression Models

In [6]:
# to hold CV results
cv_results = []

### Scikit-Learn

In [None]:
from sklearnex import patch_sklearn
patch_sklearn()

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [None]:
def cross_validate_ex(model, X, y, category, name):
    # create cross-validation param
    cv = KFold(n_splits=10, shuffle=True, random_state=21)

    # define scoring
    scoring = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error", "neg_mean_absolute_percentage_error"]

    # cross-validate
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=4, verbose=1)

    # change into record-wise
    score_records = []
    for i in range(10):
        score_records.append({
            "fit_time": scores["fit_time"][i],
            "score_time": scores["score_time"][i],
            "r2": scores["test_r2"][i],
            "mse": -scores["test_neg_mean_squared_error"][i],
            "mae": -scores["test_neg_mean_absolute_error"][i],
            "mape": -scores["test_neg_mean_absolute_percentage_error"][i],
            "category": category,
            "name": name
        })
    
    return score_records

In [18]:
categorical_encoder = Pipeline(
  steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
  ]
)

numerical_encoder = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
    ]
)

compose_transformers = ColumnTransformer(
    transformers=[
        ("passthrough", "passthrough", tags_cols + floor_mat_cols + house_mat_cols),
        ("catergorical_encoder", categorical_encoder, cat_cols),
        ("numerical_encoder", numerical_encoder, num_cols),
    ]
)

compose_transformers

In [30]:
X_trans, y_trans = compose_transformers.fit_transform(X, y)

ValueError: A given column is not a column of the dataframe

In [31]:
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
# define models
models = [
    ("Linear", "LinearRegression", LinearRegression()),
    ("Linear", "Lasso", Lasso()),
    ("Linear", "Ridge", Ridge()), 
    ("Linear", "BayesianRidge", BayesianRidge()),
    ("Tree", "DecisionTreeRegressor", DecisionTreeRegressor()),
    ("KNN", "KNeighborsRegressor", KNeighborsRegressor()),
    ("SVM", "SVR", SVR()),
    ("SVM", "LinearSVR", LinearSVR()),
    ("Neural Network", "MLPRegressor", MLPRegressor()),
    ("Ensemble", "RandomForestRegressor", RandomForestRegressor()),
    ("Ensemble", "GradientBoostingRegressor", GradientBoostingRegressor()),
]

# evaluate each model
for category, name, model in models:
    print(f"Evaluating {category}/{name} model")

    # create classifier pipeline
    clf = Pipeline(
        steps=[
            ("preprocessor", compose_transformers),
            ("regressor", model),
        ]
    )

    # run cross validation
    cv_results.extend(cross_validate_ex(clf, X, y, category, name))

# with pd.option_context('display.float_format', '{:0.4f}'.format):

### Catboost Models

In [None]:
from catboost import CatBoostRegressor, Pool

In [None]:
# select colums to use
X_catboost = df[cat_cols + num_cols].copy()

# convert multihot columns to list
X_catboost["tags"] = df[tags_cols].values.tolist()
X_catboost["floor_mat"] = df[floor_mat_cols].values.tolist()
X_catboost["house_mat"] = df[house_mat_cols].values.tolist()
X_catboost["facility"] = df[facility_cols].values.tolist()

cbembedding = ["tags", "floor_mat", "house_mat", "facility"]

X_catboost.head(2)

In [None]:
# run training
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X_catboost, y)):
    print(f"Training fold {fold_i + 1}")

    # split data
    X_train, X_test = X_catboost.iloc[train_idx], X_catboost.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # create pool
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols, embedding_features=cbembedding)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols, embedding_features=cbembedding)
    
    # train model
    model = CatBoostRegressor(verbose=0, random_seed=21)

    fit_time_start = time.time()
    model.fit(train_pool, eval_set=test_pool, verbose=0)
    fit_time_end = time.time()

    # run predictions
    score_time_start = time.time()
    y_pred = model.predict(test_pool)
    score_time_end = time.time()

    # store metrics
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time":score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "CatBoost",
        "name": "CatBoostRegressor with Embedding",
    })

In [None]:
# run training
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    print(f"Training fold {fold_i + 1}")

    # split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # create pool
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols)
    
    # train model
    model = CatBoostRegressor(verbose=0, random_seed=21, task_type="GPU")

    fit_time_start = time.time()
    model.fit(train_pool, eval_set=test_pool, verbose=0)
    fit_time_end = time.time()

    # run predictions
    score_time_start = time.time()
    y_pred = model.predict(test_pool)
    score_time_end = time.time()

    # store metrics
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time": score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "CatBoost",
        "name": "CatBoostRegressor without Embedding",
    })

### TensorFlow Models

In [7]:
import tensorflow as tf

2024-02-09 15:15:03.955475: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-09 15:15:03.955541: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-09 15:15:03.955553: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-09 15:15:03.960715: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
tf.config.list_physical_devices()

2024-02-09 15:15:05.563996: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 15:15:05.581689: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 15:15:05.581765: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [16]:
ds_labels = tf.data.Dataset.from_tensor_slices(y.values)
ds_features = tf.data.Dataset.from_tensor_slices(X_float32.values)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
# generator to convert dataframe to tf dataset
def tf_df_row_gen(df):
  for row in df.itertuples(index=False):
    values = []

    # map multihot columns
    for name, cols in zip(["floor_mat", "house_mat", "tags", "facility"], [floor_mat_cols, house_mat_cols, tags_cols, facility_cols]):
        cvals = []
        for col in cols:
            cvals.append(getattr(row, col))
        
        values.append(tf.constant(cvals, dtype=tf.float32, name=name))

    # map categorical columns
    for col in cat_cols:
        values.append(tf.constant(getattr(row, col), dtype=tf.string, name=col))

    # map numerical columns
    for col in num_cols:
        values.append(tf.constant(getattr(row, col), dtype=tf.float32, name=col))
    
    yield tuple(values)

In [None]:
def construct_tf_dataset(X, y):
    # tensor specs
    num_cols_spec = [tf.TensorSpec(shape=(), dtype=tf.float32, name=col) for col in num_cols]
    cat_cols_spec = [tf.TensorSpec(shape=(), dtype=tf.string, name=col) for col in cat_cols]
    embedding_cols_spec = [
        tf.TensorSpec(shape=(len(floor_mat_cols),), dtype=tf.float32, name="floor_mat"),
        tf.TensorSpec(shape=(len(house_mat_cols),), dtype=tf.float32, name="house_mat"),
        tf.TensorSpec(shape=(len(tags_cols),), dtype=tf.float32, name="tags"),
        tf.TensorSpec(shape=(len(facility_cols),), dtype=tf.float32, name="facility"),
    ]

    # create dataset
    ds_labels = tf.data.Dataset.from_tensor_slices(y.values, name="price")
    ds_features = tf.data.Dataset.from_generator(
        lambda: tf_df_row_gen(X),
        output_signature=tuple(embedding_cols_spec + cat_cols_spec + num_cols_spec)
    )

    return tf.data.Dataset.zip((ds_features, ds_labels))\
        .batch(64) \
        .cache() \
        .prefetch(tf.data.AUTOTUNE)

In [None]:
def create_numeric_norm_layer(index, dataset):
    # Create a Normalization layer for our feature.
    normalizer = tf.keras.layers.Normalization(axis=None)
    normalizer.adapt(dataset.map(lambda x, _: x[index]))
    
    return normalizer

def create_categorical_norm_layer(index, dataset):
    # Create a layer that turns strings into integer indices.
    indexer = tf.keras.layers.StringLookup(max_tokens=None)
    indexer.adapt(dataset.map(lambda x, _: x[index]))
    
    # Encode the integer indices.
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=indexer.vocabulary_size())
    
    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoder(indexer(feature))

In [None]:
def create_tf_model(ds: tf.data.Dataset) -> tf.keras.Model:
    tf_inputs = []
    tf_layers = []

    tf_embedding_cols = [("floor_mat", len(floor_mat_cols)), ("house_mat", len(house_mat_cols)), ("tags", len(tags_cols)), ("facility", len(facility_cols))]
    for i, (col, col_len) in enumerate(tf_embedding_cols):
        input_layer = tf.keras.Input(shape=(col_len,), name=col)
        normalizer = tf.keras.layers.Dense(64, activation="relu")(input_layer)

        tf_inputs.append(input_layer)
        tf_layers.append(normalizer)

    for i, col in enumerate(cat_cols):
        input_layer = tf.keras.Input(shape=(1,), name=col, dtype=tf.string)
        normalizer = create_categorical_norm_layer(i + len(tf_embedding_cols), ds)

        tf_inputs.append(input_layer)
        tf_layers.append(normalizer(input_layer))

    for i, col in enumerate(num_cols):
        input_layer = tf.keras.Input(shape=(1,), name=col)
        normalizer = create_numeric_norm_layer(i + len(tf_embedding_cols) + len(cat_cols), ds)

        tf_inputs.append(input_layer)
        tf_layers.append(normalizer(input_layer))

    # concatenate all layers
    x = tf.keras.layers.concatenate(tf_layers)
    x = tf.keras.layers.Dense(512, activation="gelu")(x)
    x = tf.keras.layers.Dense(256, activation="gelu")(x)
    x = tf.keras.layers.Dense(128, activation="gelu")(x)
    x = tf.keras.layers.Dense(64, activation="gelu")(x)
    output = tf.keras.layers.Dense(1, name="price")(x)

    # create model
    model = tf.keras.Model(inputs=tf_inputs, outputs=output)
    return model

In [None]:
# run training
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    print(f"Training fold {fold_i + 1}")

    # split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # create dataset
    train_ds = construct_tf_dataset(X_train, y_train)
    test_ds = construct_tf_dataset(X_test, y_test)

    # create model
    model = create_tf_model(train_ds)
    # model.summary()
    # tf.keras.utils.plot_model(model, show_shapes=True, show_trainable=True, show_dtype=True, rankdir="LR")

    # compile model
    model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae", "mse"])

    # train model
    fit_time_start = time.time()
    model.fit(train_ds, epochs=200, validation_data=test_ds)
    fit_time_end = time.time()

    # run predictions
    score_time_start = time.time()
    y_pred = model.predict(test_ds).reshape(-1)
    score_time_end = time.time()

    # store metrics
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time": score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "TensorFlow",
        "name": "DNNRegressor",
    })

## Summary

In [None]:
df_scores = pd.DataFrame(cv_results)
df_scores.head()

In [None]:
rdf = df_scores.pivot_table(index=["category", "name"], values=["r2", "mse", "mae", "mape"]).round(4).sort_values(by="mse", ascending=True)
rdf.style.background_gradient()

In [None]:
rdf.to_csv("../dataset/train_baseline/summary_regression_scores.csv", index=True)
df_scores.to_csv("../dataset/train_baseline/raw_regression_scores.csv", index=False)