In [1]:
import pandas as pd
import numpy as np
import re
import json
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [2]:
# Read the raw listing data from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="house_price_sleman.csv")
# Show the first five rows as a quick sanity check.
df.head(n=5)


Unnamed: 0,price,nav-link,description,listing-location,bed,bath,carport,surface_area,building_area
0,695.0,https://www.rumah123.com/properti/sleman/hos17...,Rumah 2 Lantai Baru di jalan Palagan Sleman Y...,"Ngaglik, Sleman",3.0,3.0,2.0,120.0,110.0
1,682.0,https://www.rumah123.com/properti/sleman/hos17...,RUMAH BARU DEKAT AL AZHAR DAN UGM,"Jombor, Sleman",3.0,2.0,1.0,102.0,126.0
2,580.0,https://www.rumah123.com/properti/sleman/hos17...,RUMAH ASRI DAN SEJUK DI BERBAH SLEMAN DEKAT PA...,"Berbah, Sleman",2.0,2.0,1.0,100.0,100.0
3,504.0,https://www.rumah123.com/properti/sleman/hos17...,Rumah Murah 5 Menit Dari Candi Prambanan Tersi...,"Prambanan, Sleman",3.0,1.0,1.0,109.0,67.0
4,275.0,https://www.rumah123.com/properti/sleman/hos17...,Rumah Murah Cicilan 1jt Di Moyudan Sleman,"Moyudan, Sleman",2.0,1.0,1.0,60.0,30.0


In [3]:
# Target ("price") plus the feature columns used for modelling;
# "building_area" was added to the list later.
needed_cols = [
    "price", "listing-location",
    "bed", "bath", "carport",
    "surface_area", "building_area",
]

# Report every expected column that the loaded frame does not have.
present_cols = set(df.columns)
missing = list(filter(lambda col: col not in present_cols, needed_cols))
print("Missing columns:", missing)

# Keep only the needed columns, discard rows with any missing value and
# renumber the index from zero.
df = df.loc[:, needed_cols].dropna().reset_index(drop=True)

df.head()


Missing columns: []


Unnamed: 0,price,listing-location,bed,bath,carport,surface_area,building_area
0,695.0,"Ngaglik, Sleman",3.0,3.0,2.0,120.0,110.0
1,682.0,"Jombor, Sleman",3.0,2.0,1.0,102.0,126.0
2,580.0,"Berbah, Sleman",2.0,2.0,1.0,100.0,100.0
3,504.0,"Prambanan, Sleman",3.0,1.0,1.0,109.0,67.0
4,275.0,"Moyudan, Sleman",2.0,1.0,1.0,60.0,30.0


In [4]:
def clean_price_to_rupiah(x):
    """Parse a listing price into a float.

    Accepts plain numbers as well as strings such as ``"Rp 1,5 Miliar"``,
    ``"750 juta"`` or ``"Rp 1.500.000"``.

    Rules applied:
      * ``"rp"`` and spaces are removed; ``"miliar"`` scales the number by
        1e9 and ``"juta"`` by 1e6. Values without a unit word are returned
        unscaled.
      * A lone comma or a lone dot is read as the decimal separator
        (``"1,5"`` -> 1.5).
      * When both separators occur, the right-most one is the decimal
        separator and the other one groups thousands.
      * Several occurrences of the same separator group thousands
        (``"1.500.000"`` -> 1500000).

    Parameters
    ----------
    x : scalar
        Raw price value (number, string or missing).

    Returns
    -------
    float
        The parsed price, or ``np.nan`` when ``x`` is missing or holds no
        parseable number. Malformed text never raises.
    """
    if pd.isna(x):
        return np.nan

    s = str(x).lower().strip()
    s = s.replace("rp", "").replace(" ", "")

    multiplier = 1
    if "miliar" in s:
        multiplier = 1_000_000_000
        s = s.replace("miliar", "")
    elif "juta" in s:
        multiplier = 1_000_000
        s = s.replace("juta", "")

    # Keep digits and both separator characters; drop everything else.
    s = re.sub(r"[^0-9.,]", "", s)

    if "," in s and "." in s:
        # Mixed separators: the right-most one marks the decimals.
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    else:
        s = s.replace(",", ".")

    # Repeated separators can only be thousands grouping; float() would
    # otherwise raise on e.g. "1.500.000".
    if s.count(".") > 1:
        s = s.replace(".", "")

    if s == "":
        return np.nan

    try:
        return float(s) * multiplier
    except ValueError:
        # e.g. a lone "." left over after stripping.
        return np.nan

def clean_surface_area(x):
    """Parse an area value such as ``"120 m²"`` or ``"120,5 m2"`` into a float.

    The unit markers ``"m²"`` / ``"m2"`` and spaces are removed first.
    Separators follow the same convention as ``clean_price_to_rupiah``:
      * a lone comma or a lone dot is the decimal separator
        (``"120,5"`` -> 120.5);
      * when both occur, the right-most one is the decimal separator;
      * several occurrences of the same separator group thousands.

    Parameters
    ----------
    x : scalar
        Raw area value (number, string or missing).

    Returns
    -------
    float
        The parsed area, or ``np.nan`` when ``x`` is missing or holds no
        parseable number. Malformed text never raises.
    """
    if pd.isna(x):
        return np.nan

    s = str(x).lower().strip()
    s = s.replace("m²", "").replace("m2", "").replace(" ", "")

    # Keep digits and both separator characters; drop everything else.
    s = re.sub(r"[^0-9.,]", "", s)

    if "," in s and "." in s:
        # Mixed separators: the right-most one marks the decimals.
        if s.rfind(",") > s.rfind("."):
            s = s.replace(".", "").replace(",", ".")
        else:
            s = s.replace(",", "")
    else:
        # Previously the comma was silently dropped, so "120,5" became 1205.
        s = s.replace(",", ".")

    # Repeated separators can only be thousands grouping.
    if s.count(".") > 1:
        s = s.replace(".", "")

    if s == "":
        return np.nan

    try:
        return float(s)
    except ValueError:
        # e.g. a lone "." left over after stripping.
        return np.nan

# Clean the columns: run the price parser on the target and the area
# parser on both area columns ("building_area" was added later).
df["price"] = df["price"].apply(clean_price_to_rupiah)
for area_col in ("surface_area", "building_area"):
    df[area_col] = df[area_col].apply(clean_surface_area)

# Room/carport counts: coerce to numbers, turning unparseable entries into NaN.
for count_col in ("bed", "bath", "carport"):
    df[count_col] = pd.to_numeric(df[count_col], errors="coerce")

# Discard every row that ended up with a NaN and renumber the index.
df = df.dropna().reset_index(drop=True)

df.head()


Unnamed: 0,price,listing-location,bed,bath,carport,surface_area,building_area
0,695.0,"Ngaglik, Sleman",3.0,3.0,2.0,120.0,110.0
1,682.0,"Jombor, Sleman",3.0,2.0,1.0,102.0,126.0
2,580.0,"Berbah, Sleman",2.0,2.0,1.0,100.0,100.0
3,504.0,"Prambanan, Sleman",3.0,1.0,1.0,109.0,67.0
4,275.0,"Moyudan, Sleman",2.0,1.0,1.0,60.0,30.0


In [5]:
# Sanity check after cleaning: column dtypes and the range of the target.
# NOTE(review): values without a "juta"/"miliar" suffix pass through the
# price cleaner unscaled — confirm which unit the CSV prices are in.
price_values = df["price"]
print(df.dtypes)
print("\nHarga MIN:", price_values.min())
print("Harga MAX:", price_values.max())


price               float64
listing-location     object
bed                 float64
bath                float64
carport             float64
surface_area        float64
building_area       float64
dtype: object

Harga MIN: 217.0
Harga MAX: 722445.0


In [6]:
# Separate the target from the features.
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for label, part in (("Train:", X_train), ("Test :", X_test)):
    print(label, part.shape)


Train: (935, 6)
Test : (234, 6)


In [7]:
# Column groups fed to the two preprocessing branches.
categorical_features = ["listing-location"]
numeric_features = ["bed", "bath", "carport", "surface_area", "building_area"]

# Numeric columns are standardised; the location is one-hot encoded, and
# categories unseen at fit time are ignored at transform time
# (handle_unknown="ignore").
scale_branch = ("num", StandardScaler(), numeric_features)
encode_branch = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)

preprocessor = ColumnTransformer(transformers=[scale_branch, encode_branch])


In [8]:
def _with_preprocessing(estimator):
    """Wrap ``estimator`` in a pipeline that first runs ``preprocessor``.

    Note that every pipeline built here references the same
    ``preprocessor`` object rather than a copy.
    """
    return Pipeline([
        ("preprocess", preprocessor),
        ("model", estimator),
    ])


def _regression_report(actual, predicted):
    """Return R², MAE and RMSE of ``predicted`` against ``actual`` as plain floats."""
    return {
        "test_r2": float(r2_score(actual, predicted)),
        "test_mae": float(mean_absolute_error(actual, predicted)),
        "test_rmse": float(np.sqrt(mean_squared_error(actual, predicted))),
    }


# Candidate scikit-learn models, keyed by display name.
models = {
    "Linear Regression": _with_preprocessing(LinearRegression()),
    "Random Forest": _with_preprocessing(
        RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
    ),
}

# Fit each pipeline on the training split and score it on the test split.
results_dict = {}
for model_label, fitted_pipeline in models.items():
    fitted_pipeline.fit(X_train, y_train)
    pred_test = fitted_pipeline.predict(X_test)
    results_dict[model_label] = _regression_report(y_test, pred_test)

results_dict


{'Linear Regression': {'test_r2': 0.9196532464453151,
  'test_mae': 128.13963669971636,
  'test_rmse': 227.50459421827378},
 'Random Forest': {'test_r2': 0.923484004443253,
  'test_mae': 45.80091426017017,
  'test_rmse': 222.01489947663916}}

In [9]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the test split. This refits the same `preprocessor` object
# that the scikit-learn pipelines reference, on the same training data.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Keras needs dense arrays. ColumnTransformer only returns a sparse matrix
# while the stacked output's density stays below its `sparse_threshold`;
# otherwise it already returns an ndarray, which has no `.toarray()`.
# Convert only when the result is actually sparse.
if hasattr(X_train_processed, "toarray"):
    X_train_processed = X_train_processed.toarray()
if hasattr(X_test_processed, "toarray"):
    X_test_processed = X_test_processed.toarray()

print("Train processed:", X_train_processed.shape)
print("Test processed :", X_test_processed.shape)


Train processed: (935, 28)
Test processed : (234, 28)


In [10]:
def build_dnn(input_dim, lr=0.001, dropout1=0.2, dropout2=0.15):
    """Create and compile a fully connected regression network.

    Architecture: Dense(256, relu) -> Dropout(dropout1) -> Dense(128, relu)
    -> Dropout(dropout2) -> Dense(64, relu) -> Dense(1).

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    lr : float
        Learning rate passed to the Adam optimizer.
    dropout1, dropout2 : float
        Dropout rates after the first and second hidden layers.

    Returns
    -------
    keras.Sequential
        Model compiled with MSE loss and a mean-absolute-error metric
        named ``"mae"``.
    """
    stack = [
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation="relu"),
        layers.Dropout(dropout1),
        layers.Dense(128, activation="relu"),
        layers.Dropout(dropout2),
        layers.Dense(64, activation="relu"),
        # Single linear unit: the predicted price.
        layers.Dense(1),
    ]
    network = keras.Sequential(stack)

    adam = keras.optimizers.Adam(learning_rate=lr)
    mae_metric = keras.metrics.MeanAbsoluteError(name="mae")
    network.compile(optimizer=adam, loss="mse", metrics=[mae_metric])

    return network

# Seed the Python, NumPy and TensorFlow random generators before the model
# is created. The split and the random forest already pin random_state=42;
# without this the DNN's weight initialisation, dropout masks and batch
# shuffling differ on every run, so its reported metrics cannot be reproduced.
keras.utils.set_random_seed(42)

# One input unit per column of the preprocessed feature matrix.
dnn_model = build_dnn(X_train_processed.shape[1])
dnn_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               7424      
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 48641 (190.00 KB)
Trainable params: 48641 

In [11]:
# Stop once the validation loss has not improved for 12 epochs and roll
# the weights back to the best epoch seen.
early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=12, restore_best_weights=True
)

# Halve the learning rate after 5 epochs without validation improvement,
# never going below 1e-6.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6
)

# Train for at most 250 epochs, with 20% of the training data held out
# for validation.
history = dnn_model.fit(
    x=X_train_processed,
    y=y_train,
    batch_size=32,
    epochs=250,
    verbose=1,
    callbacks=[early_stop, reduce_lr],
    validation_split=0.2,
)


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250


In [12]:
# Flatten the network output into a 1-D vector of predictions.
raw_predictions = dnn_model.predict(X_test_processed)
pred_dnn = raw_predictions.flatten()

# Same three test metrics as reported for the scikit-learn models.
dnn_rmse = np.sqrt(mean_squared_error(y_test, pred_dnn))
dnn_mae = mean_absolute_error(y_test, pred_dnn)
dnn_r2 = r2_score(y_test, pred_dnn)

results_dict["DNN"] = dict(
    test_r2=float(dnn_r2),
    test_mae=float(dnn_mae),
    test_rmse=float(dnn_rmse),
)

results_dict




{'Linear Regression': {'test_r2': 0.9196532464453151,
  'test_mae': 128.13963669971636,
  'test_rmse': 227.50459421827378},
 'Random Forest': {'test_r2': 0.923484004443253,
  'test_mae': 45.80091426017017,
  'test_rmse': 222.01489947663916},
 'DNN': {'test_r2': 0.8936806945932074,
  'test_mae': 131.36543260884082,
  'test_rmse': 261.70502989702595}}

In [13]:
# Write the collected metrics of all models to disk as indented JSON.
with open("model_results.json", mode="w") as results_file:
    json.dump(obj=results_dict, fp=results_file, indent=4)

print("Saved: model_results.json ✅")


Saved: model_results.json ✅


In [14]:
# One row per model, one column per metric.
metrics_by_metric = pd.DataFrame(results_dict)
metrics_df = metrics_by_metric.transpose()
metrics_df


Unnamed: 0,test_r2,test_mae,test_rmse
Linear Regression,0.919653,128.139637,227.504594
Random Forest,0.923484,45.800914,222.014899
DNN,0.893681,131.365433,261.70503


In [15]:
def _min_max_penalty(values, higher_is_better=False):
    """Min-max scale ``values`` to [0, 1] so that 0 is the best model.

    For error metrics (lower is better) the smallest value maps to 0; with
    ``higher_is_better=True`` the largest value maps to 0 instead.
    When every model has the same value the range is zero; the penalty is
    then 0 for all of them instead of the NaN a division by zero would give.
    """
    low, high = values.min(), values.max()
    span = high - low
    if span == 0:
        return values * 0.0
    if higher_is_better:
        return (high - values) / span
    return (values - low) / span


df_score = metrics_df.copy()

# Normalise MAE (smaller is better).
df_score["mae_norm"] = _min_max_penalty(df_score["test_mae"])

# Normalise RMSE (smaller is better).
df_score["rmse_norm"] = _min_max_penalty(df_score["test_rmse"])

# Normalise R² (larger is better -> inverted so that 0 is still the best).
df_score["r2_norm"] = _min_max_penalty(df_score["test_r2"], higher_is_better=True)

# Weights of the combined score.
w_rmse = 0.5
w_mae = 0.3
w_r2 = 0.2

# Weighted sum of the penalties; the lowest final score wins.
df_score["final_score"] = (
    w_rmse * df_score["rmse_norm"] +
    w_mae  * df_score["mae_norm"] +
    w_r2   * df_score["r2_norm"]
)

df_score.sort_values("final_score")


Unnamed: 0,test_r2,test_mae,test_rmse,mae_norm,rmse_norm,r2_norm,final_score
Random Forest,0.923484,45.800914,222.014899,0.0,0.0,0.0,0.0
Linear Regression,0.919653,128.139637,227.504594,0.9623,0.138314,0.128535,0.383554
DNN,0.893681,131.365433,261.70503,1.0,1.0,1.0,1.0


In [16]:
# The model with the lowest combined penalty wins.
final_scores = df_score["final_score"]
best_model_name = final_scores.idxmin()
print("BEST MODEL (Combined MAE+RMSE+R2):", best_model_name)


BEST MODEL (Combined MAE+RMSE+R2): Random Forest


In [17]:
# Save the preprocessor.
joblib.dump(preprocessor, "preprocessor.pkl")
print("Saved: preprocessor.pkl ✅")

# Save the best model: any name other than the two scikit-learn pipelines
# is treated as the Keras network and written as an HDF5 file; the
# pipelines are pickled with joblib.
sklearn_model_names = ("Linear Regression", "Random Forest")
if best_model_name not in sklearn_model_names:
    dnn_model.save("best_model_tf.h5")
    print("Saved: best_model_tf.h5 ✅")
else:
    joblib.dump(models[best_model_name], "best_model.pkl")
    print("Saved: best_model.pkl ✅")


Saved: preprocessor.pkl ✅
Saved: best_model.pkl ✅
