In [15]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


In [16]:
from google.colab import drive
drive.mount('/content/drive')

DATA_PATH = '/content/drive/MyDrive/ML/midterm-regresi-dataset.csv'

df = pd.read_csv(DATA_PATH, header=None)

print("Dataset shape:", df.shape)
df.head()



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset shape: (515345, 91)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [17]:
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (515345, 90)
y shape: (515345,)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [19]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

y_pred_lr = lr.predict(X_test_scaled)

mse_lr  = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr  = mean_absolute_error(y_test, y_pred_lr)
r2_lr   = r2_score(y_test, y_pred_lr)

print("Linear Regression")
print("RMSE:", rmse_lr)
print("MAE :", mae_lr)
print("R2  :", r2_lr)


Linear Regression
RMSE: 9.523312054048365
MAE : 6.778168687522752
R2  : 0.23796616215080746


In [22]:
xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=2
)

xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
mae_xgb  = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb   = r2_score(y_test, y_pred_xgb)

print("XGBoost")
print("RMSE:", rmse_xgb)
print("MAE :", mae_xgb)
print("R2  :", r2_xgb)


XGBoost
RMSE: 8.78830291856605
MAE : 6.124345302581787
R2  : 0.3510543704032898


In [23]:
tf.random.set_seed(42)

model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dense(1)
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mse'
)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-5
)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=256,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)


Epoch 1/100
[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 14ms/step - loss: 2013008.7500 - val_loss: 46457.8203 - learning_rate: 0.0010
Epoch 2/100
[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 12ms/step - loss: 41866.2109 - val_loss: 8289.6338 - learning_rate: 0.0010
Epoch 3/100
[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 12ms/step - loss: 25155.4941 - val_loss: 3584.4316 - learning_rate: 0.0010
Epoch 4/100
[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - loss: 22239.2715 - val_loss: 2525.7361 - learning_rate: 0.0010
Epoch 5/100
[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - loss: 20411.3691 - val_loss: 546.1136 - learning_rate: 0.0010
Epoch 6/100
[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 11ms/step - loss: 18587.9863 - val_loss: 932.1183 - learning_rate: 0.0010
Epoch 7/100
[1m1289/1289[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [24]:
y_pred_dl = model.predict(X_test_scaled).flatten()

rmse_dl = np.sqrt(mean_squared_error(y_test, y_pred_dl))
mae_dl  = mean_absolute_error(y_test, y_pred_dl)
r2_dl   = r2_score(y_test, y_pred_dl)

print("Deep Learning (MLP)")
print("RMSE:", rmse_dl)
print("MAE :", mae_dl)
print("R2  :", r2_dl)


[1m3221/3221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step
Deep Learning (MLP)
RMSE: 15.750286281345362
MAE : 13.054705619812012
R2  : -1.08437180519104


In [25]:
results = pd.DataFrame({
    'Model': ['Linear Regression', 'XGBoost', 'Deep Learning'],
    'RMSE': [rmse_lr, rmse_xgb, rmse_dl],
    'R2':   [r2_lr, r2_xgb, r2_dl]
})

results.sort_values(by='RMSE')


Unnamed: 0,Model,RMSE,R2
1,XGBoost,8.788303,0.351054
0,Linear Regression,9.523312,0.237966
2,Deep Learning,15.750286,-1.084372
