In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
X_train = pd.read_csv("dataset/X_train.csv")
y_train = pd.read_csv("dataset/y_train.csv")
X_test = pd.read_csv("dataset/X_test.csv")
y_test = pd.read_csv("dataset/y_test.csv")

In [3]:
# Definisi Fungsi Evaluasi
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    # Prediksi pada data training dan testing
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Hitung metrik evaluasi
    metrics = {
        'Model': model_name,
        'Train MAE': mean_absolute_error(y_train, y_train_pred),
        'Test MAE': mean_absolute_error(y_test, y_test_pred),
        'Train RMSE': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'Train R2': r2_score(y_train, y_train_pred),
        'Test R2': r2_score(y_test, y_test_pred)
    }
    
    # Validasi silang (k=5)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    metrics['CV R2 Mean'] = cv_scores.mean()
    metrics['CV R2 Std'] = cv_scores.std()
    
    return metrics

In [4]:
# Pelatihan Model
results = []

## Linear Regression

In [5]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
results.append(evaluate_model(lr_model, X_train, X_test, y_train, y_test, 'Linear Regression'))

## Ridge Regression


In [6]:
ridge_params = {'alpha': [0.1, 1.0, 10.0]}
ridge_grid = GridSearchCV(Ridge(), ridge_params, cv=5, scoring='r2')
ridge_grid.fit(X_train, y_train)
ridge_best = ridge_grid.best_estimator_
print("\n=== Ridge Regression Best Parameters ===")
print(ridge_grid.best_params_)
results.append(evaluate_model(ridge_best, X_train, X_test, y_train, y_test, 'Ridge Regression'))


=== Ridge Regression Best Parameters ===
{'alpha': 10.0}


## Random Forest

In [7]:
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}
rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=5, scoring='r2')
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_
print("\n=== Random Forest Best Parameters ===")
print(rf_grid.best_params_)
results.append(evaluate_model(rf_best, X_train, X_test, y_train, y_test, 'Random Forest'))

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


KeyboardInterrupt: 

## XGBoost

In [None]:
xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1]
}
xgb_grid = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
xgb_grid.fit(X_train, y_train)
xgb_best = xgb_grid.best_estimator_
print("\n=== XGBoost Best Parameters ===")
print(xgb_grid.best_params_)
results.append(evaluate_model(xgb_best, X_train, X_test, y_train, y_test, 'XGBoost'))

## SVR

In [None]:
svr_params = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1.0, 10.0],
    'epsilon': [0.1, 0.2]
}
svr_grid = GridSearchCV(SVR(), svr_params, cv=5, scoring='r2')
svr_grid.fit(X_train, y_train)
svr_best = svr_grid.best_estimator_
print("\n=== SVR Best Parameters ===")
print(svr_grid.best_params_)
results.append(evaluate_model(svr_best, X_train, X_test, y_train, y_test, 'SVR'))

## Ringkasan Hasil


In [None]:
results_df = pd.DataFrame(results)
print("\n=== Perbandingan Performa Model ===")
print(results_df)


## Visualisasi Perbandingan Model

In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Test R2', data=results_df)
plt.title('Perbandingan R2 Score pada Data Testing')
plt.xlabel('Model')
plt.ylabel('Test R2 Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.close()

## Visualisasi RMSE

In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Test RMSE', data=results_df)
plt.title('Perbandingan RMSE pada Data Testing')
plt.xlabel('Model')
plt.ylabel('Test RMSE')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('model_comparison_rmse.png')
plt.close()

In [None]:
# 7. Menyimpan Hasil
results_df.to_csv('model_comparison_results.csv', index=False)
print("\nHasil perbandingan model disimpan sebagai 'model_comparison_results.csv'")