In [1]:
import time

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [2]:
df = pd.read_parquet("../dataset/etl/L3.regression_train.parquet")
df.head()

Unnamed: 0,price,sertifikat,tahun_dibangun,garasi,pemandangan,hadap,sumber_air,tahun_di_renovasi,konsep_dan_gaya_rumah,lebar_jalan,...,facility_masjid,facility_mezzanine,facility_musholla,facility_one_gate_system,facility_parkir,facility_playground,facility_shed,facility_taman,facility_wastafel,facility_water_tank
0,3300.0,SHM - Sertifikat Hak Milik,1000.0,0.0,Pemukiman Warga,Timur,PAM atau PDAM,0.0,Minimalis Modern,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,850.0,SHM - Sertifikat Hak Milik,2016.0,4.0,Pegunungan,Timur,PAM atau PDAM,0.0,Minimalis Modern,2.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
2,1000.0,SHM - Sertifikat Hak Milik,2020.0,1.0,Pemukiman Warga,Timur,PAM atau PDAM,0.0,Minimalis Modern,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2500.0,SHM - Sertifikat Hak Milik,2013.0,3.0,Pemukiman Warga,Timur,PAM atau PDAM,0.0,Minimalis Modern,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,3000.0,"Lainnya (PPJB,Girik,Adat,dll)",0.0,0.0,Pemukiman Warga,Timur,PAM atau PDAM,0.0,Minimalis Modern,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df.columns

Index(['price', 'sertifikat', 'tahun_dibangun', 'garasi', 'pemandangan',
       'hadap', 'sumber_air', 'tahun_di_renovasi', 'konsep_dan_gaya_rumah',
       'lebar_jalan', 'kondisi_properti', 'kondisi_perabotan', 'ruang_makan',
       'ruang_tamu', 'terjangkau_internet', 'hook', 'tags_bisa_nego',
       'tags_cash_bertahap', 'tags_cash_keras', 'tags_dijual_cepat',
       'tags_komplek', 'tags_kpr', 'tags_masuk_gang', 'tags_one_gate_system',
       'tags_pedesaan', 'tags_perumahan', 'tags_pinggir_jalan',
       'house_mat_bata_hebel', 'house_mat_batako', 'house_mat_beton',
       'floor_mat_granit', 'floor_mat_marmer', 'floor_mat_ubin',
       'floor_mat_vinyl', 'facility_air_pam', 'facility_air_tanah',
       'facility_aula', 'facility_balcony', 'facility_canopy',
       'facility_carport', 'facility_dishwasher', 'facility_floorboards',
       'facility_garasi', 'facility_gas', 'facility_gym', 'facility_halaman',
       'facility_heating', 'facility_internet', 'facility_jalur_telepon',


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18240 entries, 0 to 20145
Data columns (total 65 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   price                     18240 non-null  float64
 1   sertifikat                18240 non-null  object 
 2   tahun_dibangun            18240 non-null  float64
 3   garasi                    18240 non-null  float64
 4   pemandangan               18240 non-null  object 
 5   hadap                     18240 non-null  object 
 6   sumber_air                18240 non-null  object 
 7   tahun_di_renovasi         18240 non-null  float64
 8   konsep_dan_gaya_rumah     18240 non-null  object 
 9   lebar_jalan               18240 non-null  float64
 10  kondisi_properti          18240 non-null  object 
 11  kondisi_perabotan         18240 non-null  object 
 12  ruang_makan               18240 non-null  float64
 13  ruang_tamu                18240 non-null  float64
 14  terjangkau_

## Preprocessor Pipeline

In [5]:
floor_mat_cols = [col for col in df.columns if col.startswith("floor_mat_")]
house_mat_cols = [col for col in df.columns if col.startswith("house_mat_")]
tags_cols = [col for col in df.columns if col.startswith("tags_")] + ["hook_available", "ruang_tamu_available", "ruang_makan_available", "terjangkau_internet_available"]

cat_cols = ["kondisi_perabotan_norm", "kondisi_properti_norm", "konsep_dan_gaya_rumah", "sumber_air", "pemandangan", "sertifikat"]
num_cols = ["lebar_jalan_num", "daya_listrik_num", "luas_bangunan_num", "luas_tanah_num", "carport", "garasi", "dapur", "jumlah_lantai", "kamar_mandi_pembantu", "kamar_pembantu", "kamar_mandi", "kamar_tidur"]

In [4]:
X = df.drop(columns=["price"])
y = df["price"]

## Training Regression Models

In [5]:
def cross_validate_ex(model, X, y, category, name):
    # create cross-validation param
    cv = KFold(n_splits=10, shuffle=True, random_state=21)

    # define scoring
    scoring = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error", "neg_mean_absolute_percentage_error"]

    # cross-validate
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=4, verbose=1)

    # change into record-wise
    score_records = []
    for i in range(10):
        score_records.append({
            "fit_time": scores["fit_time"][i],
            "score_time": scores["score_time"][i],
            "r2": scores["test_r2"][i],
            "mse": -scores["test_neg_mean_squared_error"][i],
            "mae": -scores["test_neg_mean_absolute_error"][i],
            "mape": -scores["test_neg_mean_absolute_percentage_error"][i],
            "category": category,
            "name": name
        })
    
    return score_records

In [6]:
categorical_encoder = Pipeline(
  steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
  ]
)

numerical_encoder = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
    ]
)

compose_transformers = ColumnTransformer(
    transformers=[
        ("passthrough", "passthrough", tags_cols + floor_mat_cols + house_mat_cols),
        ("catergorical_encoder", categorical_encoder, cat_cols),
        ("numerical_encoder", numerical_encoder, num_cols),
    ]
)

compose_transformers

In [7]:
# to hold CV results
cv_results = []

### Scikit-Learn Models

In [8]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [9]:
# define models
models = [
    ("Linear", "LinearRegression", LinearRegression()),
    ("Linear", "Lasso", Lasso()),
    ("Linear", "Ridge", Ridge()), 
    ("Linear", "BayesianRidge", BayesianRidge()),
    ("Tree", "DecisionTreeRegressor", DecisionTreeRegressor()),
    ("KNN", "KNeighborsRegressor", KNeighborsRegressor()),
    # ("SVM", "SVR", SVR()),
    ("Neural Network", "MLPRegressor", MLPRegressor()),
    ("Ensemble", "RandomForestRegressor", RandomForestRegressor()),
    ("Ensemble", "GradientBoostingRegressor", GradientBoostingRegressor()),
]

# evaluate each model
for category, name, model in models:
    print(f"Evaluating {category}/{name} model")

    # create classifier pipeline
    clf = Pipeline(
        steps=[
            ("preprocessor", compose_transformers),
            ("regressor", model),
        ]
    )

    # run cross validation
    cv_results.extend(cross_validate_ex(clf, X, y, category, name))

# with pd.option_context('display.float_format', '{:0.4f}'.format):

Evaluating Linear/LinearRegression model


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    1.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating Linear/Lasso model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating Linear/Ridge model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating Linear/BayesianRidge model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating Tree/DecisionTreeRegressor model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating KNN/KNeighborsRegressor model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating Neural Network/MLPRegressor model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   54.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating Ensemble/RandomForestRegressor model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   30.8s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Evaluating Ensemble/GradientBoostingRegressor model


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   10.8s finished


### Catboost Models

In [10]:
from catboost import CatBoostRegressor, Pool

In [11]:
# select colums to use
X_catboost = df[cat_cols + num_cols].copy()

# convert multihot columns to list
X_catboost["tags"] = df[tags_cols].values.tolist()
X_catboost["floor_mat"] = df[floor_mat_cols].values.tolist()
X_catboost["house_mat"] = df[house_mat_cols].values.tolist()

cbembedding = ["tags", "floor_mat", "house_mat"]

X_catboost.head(2)

Unnamed: 0,kondisi_perabotan_norm,kondisi_properti_norm,konsep_dan_gaya_rumah,sumber_air,pemandangan,sertifikat,lebar_jalan_num,daya_listrik_num,luas_bangunan_num,luas_tanah_num,...,garasi,dapur,jumlah_lantai,kamar_mandi_pembantu,kamar_pembantu,kamar_mandi,kamar_tidur,tags,floor_mat,house_mat
0,unfurnished,furnished,Minimalis Modern,PAM atau PDAM,Pemukiman Warga,SHM - Sertifikat Hak Milik,2.0,2200.0,180.0,300.0,...,0.0,1.0,1.0,1.0,1.0,3.0,3.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0, 0.0, 0.0]"
1,semi furnished,furnished,Minimalis Modern,PAM atau PDAM,Pegunungan,SHM - Sertifikat Hak Milik,2.0,2200.0,270.0,385.0,...,4.0,1.0,1.0,1.0,4.0,3.0,4.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0]","[0.0, 1.0, 0.0, 0.0]"


In [29]:
# run training
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X_catboost, y)):
    print(f"Training fold {fold_i + 1}")

    # split data
    X_train, X_test = X_catboost.iloc[train_idx], X_catboost.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # create pool
    train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols, embedding_features=cbembedding)
    test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols, embedding_features=cbembedding)
    
    # train model
    model = CatBoostRegressor(loss_function="MAE", verbose=0, random_seed=21)

    fit_time_start = time.time()
    model.fit(train_pool, eval_set=test_pool, verbose=0)
    fit_time_end = time.time()

    # run predictions
    score_time_start = time.time()
    y_pred = model.predict(test_pool)
    score_time_end = time.time()

    # store metrics
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time":score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "CatBoost",
        "name": "CatBoostRegressor V4",
    })

Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Training fold 6
Training fold 7
Training fold 8
Training fold 9
Training fold 10


In [42]:
# run training
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X_catboost, y)):
    print(f"Training fold {fold_i + 1}")

    # split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # create pool
    train_pool = Pool(data=X_train.drop(columns=["city"]), label=y_train, cat_features=cat_cols + ["district"])
    test_pool = Pool(data=X_test.drop(columns=["city"]), label=y_test, cat_features=cat_cols + ["district"])
    
    # train model
    model = CatBoostRegressor(loss_function="RMSE", verbose=0, random_seed=21)

    fit_time_start = time.time()
    model.fit(train_pool, eval_set=test_pool, verbose=0)
    fit_time_end = time.time()

    # run predictions
    score_time_start = time.time()
    y_pred = model.predict(test_pool)
    score_time_end = time.time()

    # store metrics
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time": score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "CatBoost MultiHot",
        "name": "CatBoostRegressor with City",
    })

Training fold 1
Training fold 2
Training fold 3
Training fold 4
Training fold 5
Training fold 6
Training fold 7
Training fold 8
Training fold 9
Training fold 10


### TensorFlow Models

In [13]:
import tensorflow as tf

2024-02-05 15:36:47.276092: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# generator to convert dataframe to tf dataset
def tf_df_row_gen(df):
  for row in df.itertuples(index=False):
    values = []

    # map multihot columns
    for name, cols in zip(["floor_mat", "house_mat", "tags"], [floor_mat_cols, house_mat_cols, tags_cols]):
        cvals = []
        for col in cols:
            cvals.append(getattr(row, col))
        
        values.append(tf.constant(cvals, dtype=tf.float32, name=name))

    # map categorical columns
    for col in cat_cols:
        values.append(tf.constant(getattr(row, col), dtype=tf.string, name=col))

    # map numerical columns
    for col in num_cols:
        values.append(tf.constant(getattr(row, col), dtype=tf.float32, name=col))
    
    yield tuple(values)

In [15]:
def construct_tf_dataset(X, y):
    # tensor specs
    num_cols_spec = [tf.TensorSpec(shape=(), dtype=tf.float32, name=col) for col in num_cols]
    cat_cols_spec = [tf.TensorSpec(shape=(), dtype=tf.string, name=col) for col in cat_cols]
    embedding_cols_spec = [
        tf.TensorSpec(shape=(5,), dtype=tf.float32, name="floor_mat"),
        tf.TensorSpec(shape=(4,), dtype=tf.float32, name="house_mat"),
        tf.TensorSpec(shape=(15,), dtype=tf.float32, name="tags"),
    ]

    # create dataset
    ds_labels = tf.data.Dataset.from_tensor_slices(y.values, name="price")
    ds_features = tf.data.Dataset.from_generator(
        lambda: tf_df_row_gen(X),
        output_signature=tuple(embedding_cols_spec + cat_cols_spec + num_cols_spec)
    )

    return tf.data.Dataset.zip((ds_features, ds_labels))\
        .batch(64) \
        .cache() \
        .prefetch(tf.data.AUTOTUNE)

In [16]:
def create_numeric_norm_layer(index, dataset):
    # Create a Normalization layer for our feature.
    normalizer = tf.keras.layers.Normalization(axis=None)
    normalizer.adapt(dataset.map(lambda x, _: x[index]))
    
    return normalizer

def create_categorical_norm_layer(index, dataset):
    # Create a layer that turns strings into integer indices.
    indexer = tf.keras.layers.StringLookup(max_tokens=None)
    indexer.adapt(dataset.map(lambda x, _: x[index]))
    
    # Encode the integer indices.
    encoder = tf.keras.layers.CategoryEncoding(num_tokens=indexer.vocabulary_size())
    
    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoder(indexer(feature))

In [17]:
def create_tf_model(ds: tf.data.Dataset) -> tf.keras.Model:
    tf_inputs = []
    tf_layers = []

    for i, (col, col_len) in enumerate([("floor_mat", 5), ("house_mat", 4), ("tags", 15)]):
        input_layer = tf.keras.Input(shape=(col_len,), name=col)
        normalizer = tf.keras.layers.Dense(10, activation="relu")(input_layer)

        tf_inputs.append(input_layer)
        tf_layers.append(normalizer)

    for i, col in enumerate(cat_cols):
        input_layer = tf.keras.Input(shape=(1,), name=col, dtype=tf.string)
        normalizer = create_categorical_norm_layer(i + 3, ds)

        tf_inputs.append(input_layer)
        tf_layers.append(normalizer(input_layer))

    for i, col in enumerate(num_cols):
        input_layer = tf.keras.Input(shape=(1,), name=col)
        normalizer = create_numeric_norm_layer(i + 9, ds)

        tf_inputs.append(input_layer)
        tf_layers.append(normalizer(input_layer))

    # concatenate all layers
    x = tf.keras.layers.concatenate(tf_layers)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    output = tf.keras.layers.Dense(1, name="price")(x)

    # create model
    model = tf.keras.Model(inputs=tf_inputs, outputs=output)
    return model

In [18]:
# run training
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    print(f"Training fold {fold_i + 1}")

    # split data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # create dataset
    train_ds = construct_tf_dataset(X_train, y_train)
    test_ds = construct_tf_dataset(X_test, y_test)

    # create model
    model = create_tf_model(train_ds)
    # model.summary()
    # tf.keras.utils.plot_model(model, show_shapes=True, show_trainable=True, show_dtype=True, rankdir="LR")

    # compile model
    model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae", "mse"])

    # train model
    fit_time_start = time.time()
    model.fit(train_ds, epochs=100, validation_data=test_ds, verbose=0)
    fit_time_end = time.time()

    # run predictions
    score_time_start = time.time()
    y_pred = model.predict(test_ds).reshape(-1)
    score_time_end = time.time()

    # store metrics
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time": score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "TensorFlow",
        "name": "DNNRegressorV1",
    })

Training fold 1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch

## Summary

In [46]:
df_scores = pd.DataFrame(cv_results)
df_scores.head()

Unnamed: 0,fit_time,score_time,r2,mse,mae,mape,category,name
0,0.096538,0.011033,0.392106,991462.351233,530.215342,0.531833,Linear,LinearRegression
1,0.122072,0.016042,0.598495,686327.11589,553.312329,0.53019,Linear,LinearRegression
2,0.128099,0.016167,0.646759,559877.950274,512.482466,1.363745,Linear,LinearRegression
3,0.131772,0.011631,0.63958,623993.178728,532.505482,0.536984,Linear,LinearRegression
4,0.127868,0.013432,0.64178,584207.024868,520.310526,0.526457,Linear,LinearRegression


In [47]:
rdf = df_scores.pivot_table(index=["category", "name"], values=["r2", "mse", "mae", "mape"]).round(4).sort_values(by="mse", ascending=True)
rdf.style.background_gradient()

Unnamed: 0_level_0,Unnamed: 1_level_0,mae,mape,mse,r2
category,name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CatBoost MultiHot,CatBoostRegressor with City,288.2309,0.4463,210627.6411,0.8693
Ensemble,RandomForestRegressor,273.5485,0.432,229367.8322,0.8576
CatBoost,CatBoostRegressor V2,315.2196,0.4565,248704.8505,0.8457
CatBoost MultiHot,CatBoostRegressor,319.4802,0.5071,250509.9097,0.8444
CatBoost,CatBoostRegressor,333.6472,0.5369,264625.1571,0.8357
CatBoost,CatBoostRegressor V4,343.1349,0.4899,296346.6007,0.8163
Ensemble,GradientBoostingRegressor,359.8924,0.5555,300612.5608,0.8134
TensorFlow,DNNRegressorV1,418.1895,0.8817,407731.1149,0.7469
Tree,DecisionTreeRegressor,347.023,0.4336,441138.639,0.7257
CatBoost,CatBoostRegressor V3,554.3796,0.9332,569044.8151,0.6476


In [48]:
df_scores.to_csv("../dataset/raw_regression_scores.csv", index=False)
rdf.to_csv("../dataset/summary_regression_scores.csv")