In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import preprocessing utilities
from util.preprocess import preprocess
from util.geo_data import add_geo_data

In [3]:
# Notebook title framed by two 80-character rules.
print("=" * 80, "HDB Resale Price Prediction - Hyperparameter Optimization", "=" * 80, sep="\n")

HDB Resale Price Prediction - Hyperparameter Optimization


In [4]:
# Read the raw train and test CSVs from the local data/ directory.
print("\nLoading data...")
train_df, test_df = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")



Loading data...


In [5]:
# Report the (rows, columns) of both raw frames.
print(f"Train shape: {train_df.shape}", f"Test shape: {test_df.shape}", sep="\n")

Train shape: (162691, 11)
Test shape: (50000, 10)


In [6]:
# Run the project's preprocess() helper over each raw frame (train first, then test).
print("\nPreprocessing data...")
train_processed, test_processed = preprocess(train_df), preprocess(test_df)


Preprocessing data...


In [7]:
# Pass both preprocessed frames through add_geo_data (train first, then test)
# and rebind the same names to the results.
# NOTE(review): this cell feeds on its own outputs, so re-running it applies
# add_geo_data to already-augmented frames; whether that helper is idempotent
# is not visible from this notebook - confirm before re-running in isolation.
print("Adding geographical features...")
train_processed, test_processed = map(add_geo_data, (train_processed, test_processed))

Adding geographical features...


In [8]:
# Target vector plus the list of model inputs: every processed column except
# the target itself and MONTH.
y_train = train_processed['RESALE_PRICE'].values
feature_cols = list(
    filter(lambda name: name not in ('RESALE_PRICE', 'MONTH'), train_processed.columns)
)


In [9]:
# Columns treated as categorical; every other feature column is treated as numerical.
categorical_cols = [
    'TOWN',
    'FLAT_TYPE',
    'BLOCK',
    'STREET',
    'FLOOR_RANGE',
    'FLAT_MODEL',
]
numerical_cols = list(filter(lambda name: name not in categorical_cols, feature_cols))


In [10]:
# Independent copies restricted to the feature columns, so later edits do not
# touch the processed frames.
X_train, X_test = (
    train_processed.loc[:, feature_cols].copy(),
    test_processed.loc[:, feature_cols].copy(),
)

In [11]:
# Cast each listed categorical column that exists in X_train to pandas
# 'category' dtype in both frames. Categories are inferred separately for
# each frame, because the two casts are independent.
for col in categorical_cols:
    if col not in X_train.columns:
        continue
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [12]:
# Handle missing values if any
# For every column that has gaps in X_train, fill BOTH frames with a statistic
# computed on X_train only (median for numerical columns, most frequent value
# otherwise), so no information flows from the test set into the imputation.
#
# The filled column is assigned back to the frame instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form operates
# on a temporary Series under pandas Copy-on-Write and leaves the frame
# unchanged, and the accompanying warning is hidden by the global
# warnings filter set in the import cell.
#
# Columns that are complete in X_train but have gaps in X_test are left
# untouched, exactly as before.
if X_train.isnull().any().any():
    print("\nHandling missing values...")
    for col in X_train.columns:
        if not X_train[col].isnull().any():
            continue

        if col in numerical_cols:
            fill_value = X_train[col].median()
        else:
            modes = X_train[col].mode()
            # mode() is empty only when the column has no non-null value at all.
            fill_value = modes.iloc[0] if len(modes) > 0 else 'unknown'

        for frame in (X_train, X_test):
            column = frame[col]
            # A categorical column rejects fill values outside its category
            # list, and the two frames were cast independently, so register
            # the value first when it is missing from this frame's categories.
            if (
                isinstance(column.dtype, pd.CategoricalDtype)
                and fill_value not in column.cat.categories
            ):
                column = column.cat.add_categories([fill_value])
            frame[col] = column.fillna(fill_value)

In [13]:
# Section banner for the hyperparameter search.
print("\n" + "=" * 80, "Starting Hyperparameter Optimization with Optuna", "=" * 80, sep="\n")


Starting Hyperparameter Optimization with Optuna


In [14]:
# Define objective function for Optuna
def objective(trial):
    """Optuna objective: mean validation RMSE of LightGBM over a 3-fold CV.

    Reads the notebook-level ``X_train`` (feature frame) and ``y_train``
    (target array).

    Args:
        trial: The Optuna trial used to sample one hyperparameter set.

    Returns:
        The mean of the per-fold validation RMSEs (lower is better).
    """
    # The suggest_* calls run in dict order; keep that order stable so a
    # seeded sampler reproduces the same sequence of trials.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'force_col_wise': True,
        'random_state': 42,

        # Hyperparameters to optimize
        'num_leaves': trial.suggest_int('num_leaves', 50, 150),
        # NOTE(review): LightGBM documents max_depth <= 0 as "no limit", so -1
        # and 0 are the same configuration, and depths of 1-2 cannot use the
        # 50+ leaves sampled above. Range kept as-is to preserve the search.
        'max_depth': trial.suggest_int('max_depth', -1, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 2000, 6000),

        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),

        'min_child_samples': trial.suggest_int('min_child_samples', 10, 30),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.0001, 0.01, log=True),

        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 2.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 2.0),

        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.05),
        'max_bin': trial.suggest_int('max_bin', 200, 300),
    }

    # `n_estimators` is a LightGBM alias of `num_iterations`; left inside
    # `params` it overrides the `num_boost_round` argument of lgb.train.
    # Pass the boosting budget through one channel only, as the final
    # training cell does. The trial still records the sampled value.
    num_boost_round = params.pop('n_estimators')

    # 3-Fold CV for faster optimization (will use 5-fold for final model)
    n_splits = 3
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        # Create LightGBM datasets; the validation set reuses the training
        # set as its reference.
        train_data = lgb.Dataset(X_tr, label=y_tr, categorical_feature='auto')
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, categorical_feature='auto')

        # Train with early stopping monitored on the validation fold.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round,
            valid_sets=[val_data],
            callbacks=[
                lgb.early_stopping(stopping_rounds=150, verbose=False),
            ]
        )

        # Score the fold at the best iteration found by early stopping.
        # NOTE(review): the same fold drives early stopping and scoring, so
        # the reported RMSE is slightly optimistic.
        y_pred = model.predict(X_val, num_iteration=model.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        cv_scores.append(rmse)

    mean_rmse = np.mean(cv_scores)
    return mean_rmse

In [15]:
# Build an in-memory Optuna study that minimizes the objective, sampling with
# a TPE sampler seeded at 42.
print("\nStarting Optuna optimization...")
study = optuna.create_study(
    study_name='lgb_hdb_optimization',
    sampler=TPESampler(seed=42),
    direction='minimize',
)

[I 2025-11-06 09:30:52,176] A new study created in memory with name: lgb_hdb_optimization



Starting Optuna optimization...


In [16]:
# Run the search sequentially (n_jobs=1), bounded by 100 trials and a
# 5-hour (18,000 s) timeout, with a progress bar.
study.optimize(
    objective,
    n_trials=100,
    timeout=5 * 60 * 60,
    n_jobs=1,
    show_progress_bar=True,
)

  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-11-06 09:31:42,276] Trial 0 finished with value: 25765.957731719278 and parameters: {'num_leaves': 87, 'max_depth': 15, 'learning_rate': 0.026975154833351143, 'n_estimators': 4395, 'feature_fraction': 0.5624074561769746, 'bagging_fraction': 0.562397808134481, 'bagging_freq': 1, 'min_child_samples': 28, 'min_child_weight': 0.0015930522616241021, 'lambda_l1': 1.416145155592091, 'lambda_l2': 0.041168988591604894, 'min_split_gain': 0.04849549260809972, 'max_bin': 284}. Best is trial 0 with value: 25765.957731719278.
[I 2025-11-06 09:31:53,233] Trial 1 finished with value: 39769.346127534714 and parameters: {'num_leaves': 71, 'max_depth': 2, 'learning_rate': 0.007627364729026304, 'n_estimators': 3217, 'feature_fraction': 0.7099025726528951, 'bagging_fraction': 0.6727780074568463, 'bagging_freq': 3, 'min_child_samples': 22, 'min_child_weight': 0.00019010245319870352, 'lambda_l1': 0.5842892970704363, 'lambda_l2': 0.7327236865873834, 'min_split_gain': 0.0228034992108518, 'max_bin': 279

In [17]:
# Section banner announcing the end of the search.
print("\n" + "=" * 80, "Optimization Complete!", "=" * 80, sep="\n")


Optimization Complete!


In [18]:
# Best objective value found and how many trials the study holds.
print(
    f"\nBest CV RMSE: {study.best_value:,.2f}",
    f"Number of trials: {len(study.trials)}",
    sep="\n",
)


Best CV RMSE: 25,457.66
Number of trials: 100


In [19]:
# List the best trial's sampled hyperparameters, one "name: value" per line.
print(
    "\nBest hyperparameters:",
    *(f"  {key}: {value}" for key, value in study.best_params.items()),
    sep="\n",
)


Best hyperparameters:
  num_leaves: 144
  max_depth: 14
  learning_rate: 0.007272698914248853
  n_estimators: 5895
  feature_fraction: 0.5494699263850129
  bagging_fraction: 0.5451546623033238
  bagging_freq: 1
  min_child_samples: 15
  min_child_weight: 0.002158697533609497
  lambda_l1: 1.668389475448569
  lambda_l2: 0.41010045543664
  min_split_gain: 0.04022193440027789
  max_bin: 206


In [20]:
# Persist the per-trial history returned by study.trials_dataframe() as CSV
# (without the index column).
study_df = study.trials_dataframe()
study_df.to_csv(path_or_buf='optuna_study_results.csv', index=False)
print("\nOptimization history saved to: optuna_study_results.csv")


Optimization history saved to: optuna_study_results.csv


In [21]:
# Section banner for the final 5-fold training run.
print("\n" + "=" * 80, "Training Final Model with Best Parameters (5-Fold CV)", "=" * 80, sep="\n")


Training Final Model with Best Parameters (5-Fold CV)


In [22]:
# Fixed training settings first, then the tuned values from the best trial
# layered on top (a tuned value wins if a key appears in both).
best_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    verbosity=-1,
    force_col_wise=True,
    random_state=42,
)
best_params.update(study.best_params)

In [23]:
# Extract n_estimators separately
# It is passed to lgb.train as num_boost_round rather than through the params
# dict. The fallback to study.best_params keeps this cell re-runnable: on a
# second execution the key has already been removed from best_params, and a
# plain pop() would raise KeyError.
n_estimators = best_params.pop('n_estimators', study.best_params['n_estimators'])

In [24]:
# Shuffled 5-way splitter for the final model, seeded at 42.
n_splits = 5
kf = KFold(random_state=42, shuffle=True, n_splits=n_splits)

In [25]:
# Accumulators filled by the fold loop: out-of-fold predictions (one slot per
# training row), the running test-set average, per-fold importances, and
# per-fold metric lists.
oof_predictions = np.zeros(X_train.shape[0])
test_predictions = np.zeros(X_test.shape[0])
feature_importance_df = pd.DataFrame()
cv_scores, cv_mae_scores, cv_r2_scores = [], [], []

In [26]:
# Final K-fold training with the tuned parameters.
#
# Every accumulator is (re)created here so that re-running this cell starts
# from a clean slate. Relying on an earlier cell for initialisation meant a
# second run added onto test_predictions and appended duplicate fold scores
# and feature importances.
oof_predictions = np.zeros(len(X_train))
test_predictions = np.zeros(len(X_test))
cv_scores = []
cv_mae_scores = []
cv_r2_scores = []
fold_importances = []  # one frame per fold, concatenated once after the loop

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
    print(f"\nFold {fold}/{n_splits}")
    print("-" * 40)

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Create LightGBM datasets
    train_data = lgb.Dataset(X_tr, label=y_tr, categorical_feature='auto')
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, categorical_feature='auto')

    # Train model; both sets are evaluated so the periodic log shows the
    # train and validation RMSE side by side.
    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=n_estimators,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=500)
        ]
    )

    # Predictions: held-out rows fill their OOF slots; the test prediction is
    # the equal-weight average of the fold models.
    oof_predictions[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    test_predictions += model.predict(X_test, num_iteration=model.best_iteration) / n_splits

    # Calculate fold scores
    fold_rmse = np.sqrt(mean_squared_error(y_val, oof_predictions[val_idx]))
    fold_mae = mean_absolute_error(y_val, oof_predictions[val_idx])
    fold_r2 = r2_score(y_val, oof_predictions[val_idx])

    cv_scores.append(fold_rmse)
    cv_mae_scores.append(fold_mae)
    cv_r2_scores.append(fold_r2)

    print(f"Fold {fold} RMSE: {fold_rmse:,.2f}")
    print(f"Fold {fold} MAE: {fold_mae:,.2f}")
    print(f"Fold {fold} R²: {fold_r2:.4f}")
    print(f"Best iteration: {model.best_iteration}")

    # Feature importance (gain-based) for this fold
    fold_importances.append(pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importance(importance_type='gain'),
        'fold': fold
    }))

# Single concatenation instead of growing a DataFrame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)



Fold 1/5
----------------------------------------
[500]	train's rmse: 32109.3	valid's rmse: 33948.8
[1000]	train's rmse: 24782.7	valid's rmse: 27674.6
[1500]	train's rmse: 23020.8	valid's rmse: 26460.6
[2000]	train's rmse: 22059.6	valid's rmse: 25922.1
[2500]	train's rmse: 21370.4	valid's rmse: 25603.1
[3000]	train's rmse: 20840.2	valid's rmse: 25415.7
[3500]	train's rmse: 20383.4	valid's rmse: 25289
[4000]	train's rmse: 19990.1	valid's rmse: 25210.5
[4500]	train's rmse: 19632.2	valid's rmse: 25141
[5000]	train's rmse: 19307.1	valid's rmse: 25088.9
[5500]	train's rmse: 19004.4	valid's rmse: 25050.2
Fold 1 RMSE: 25,033.25
Fold 1 MAE: 17,940.15
Fold 1 R²: 0.9812
Best iteration: 5891

Fold 2/5
----------------------------------------
[500]	train's rmse: 31962	valid's rmse: 34309.9
[1000]	train's rmse: 24644.2	valid's rmse: 28169.3
[1500]	train's rmse: 22889	valid's rmse: 26978.2
[2000]	train's rmse: 21934.1	valid's rmse: 26443.2
[2500]	train's rmse: 21259.4	valid's rmse: 26135.7
[3000]	t

In [27]:
# Metrics pooled over all out-of-fold predictions (not an average of fold metrics).
overall_rmse, overall_mae, overall_r2 = (
    np.sqrt(mean_squared_error(y_train, oof_predictions)),
    mean_absolute_error(y_train, oof_predictions),
    r2_score(y_train, oof_predictions),
)

In [28]:
# Summary report: pooled out-of-fold metrics, the mean and standard deviation
# of the per-fold metrics, then one line per fold.
print("\n" + "=" * 80, "FINAL RESULTS WITH OPTIMIZED HYPERPARAMETERS", "=" * 80, sep="\n")
print(
    f"\nOverall CV RMSE: {overall_rmse:,.2f} SGD",
    f"Mean CV RMSE: {np.mean(cv_scores):,.2f} (+/- {np.std(cv_scores):,.2f}) SGD",
    f"\nOverall CV MAE: {overall_mae:,.2f} SGD",
    f"Mean CV MAE: {np.mean(cv_mae_scores):,.2f} (+/- {np.std(cv_mae_scores):,.2f}) SGD",
    f"\nOverall CV R²: {overall_r2:.4f}",
    f"Mean CV R²: {np.mean(cv_r2_scores):.4f} (+/- {np.std(cv_r2_scores):.4f})",
    "\nFold Scores:",
    *(
        f"  Fold {i+1}: RMSE={cv_scores[i]:,.2f} SGD, MAE={cv_mae_scores[i]:,.2f} SGD, R²={cv_r2_scores[i]:.4f}"
        for i in range(len(cv_scores))
    ),
    sep="\n",
)



FINAL RESULTS WITH OPTIMIZED HYPERPARAMETERS

Overall CV RMSE: 25,220.86 SGD
Mean CV RMSE: 25,219.59 (+/- 253.56) SGD

Overall CV MAE: 17,993.88 SGD
Mean CV MAE: 17,993.88 (+/- 115.36) SGD

Overall CV R²: 0.9811
Mean CV R²: 0.9811 (+/- 0.0003)

Fold Scores:
  Fold 1: RMSE=25,033.25 SGD, MAE=17,940.15 SGD, R²=0.9812
  Fold 2: RMSE=25,583.98 SGD, MAE=18,175.95 SGD, R²=0.9808
  Fold 3: RMSE=25,423.30 SGD, MAE=18,045.74 SGD, R²=0.9808
  Fold 4: RMSE=24,888.02 SGD, MAE=17,827.68 SGD, R²=0.9816
  Fold 5: RMSE=25,169.38 SGD, MAE=17,979.89 SGD, R²=0.9809


In [29]:
# Average each feature's importance across folds, largest first.
print("\n" + "=" * 80, "Top 10 Most Important Features:", "=" * 80, sep="\n")
feature_importance_summary = (
    feature_importance_df
    .groupby('feature')['importance']
    .mean()
    .sort_values(ascending=False)
)



Top 10 Most Important Features:


In [30]:
# Express each mean importance as a share of the total, in percent, largest first.
total_importance = feature_importance_summary.sum()
feature_importance_pct = (
    feature_importance_summary
    .div(total_importance)
    .mul(100)
    .sort_values(ascending=False)
)


In [31]:
# Show the ten largest importance shares, two decimals each.
print(
    "\nFeature Importance (%):",
    *(f"  {feature}: {pct:.2f}%" for feature, pct in feature_importance_pct.head(10).items()),
    sep="\n",
)


Feature Importance (%):
  FLOOR_AREA_SQM: 20.83%
  STREET: 18.73%
  RESALE_YEAR: 13.65%
  FLAT_TYPE: 12.50%
  TOWN: 7.61%
  FLAT_MODEL: 6.93%
  LEASE_COMMENCE_DATA: 5.23%
  FLAT_AGE: 3.92%
  LATITUDE: 2.36%
  FLOOR_AVG: 2.13%


In [32]:
# Write the full percentage ranking as a two-column CSV (feature, importance_percentage).
feature_importance_save = pd.DataFrame(
    dict(
        feature=feature_importance_pct.index,
        importance_percentage=feature_importance_pct.to_numpy(),
    )
)
feature_importance_save.to_csv('feature_importance_optimized_hyperopt.csv', index=False)
print("\nFeature importance saved to: feature_importance_optimized_hyperopt.csv")



Feature importance saved to: feature_importance_optimized_hyperopt.csv


In [41]:
# Horizontal bar chart of the top-N importance shares, written to a PNG and
# then closed (nothing is rendered inline).
print("\nGenerating feature importance visualization...")
top_n = 10
top_features = feature_importance_pct.head(top_n)

fig, ax = plt.subplots(figsize=(12, 8))
positions = range(len(top_features))
ax.barh(positions, top_features.values)
ax.set_yticks(positions)
ax.set_yticklabels(top_features.index)
ax.set_xlabel('Importance (%)', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title(f'Top {top_n} Feature Importance (Percentage) - LightGBM', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # largest share at the top
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
fig.savefig('feature_importance_plot.png', dpi=300, bbox_inches='tight')
print("Feature importance plot saved to: feature_importance_plot.png")
plt.close(fig)


Generating feature importance visualization...
Feature importance plot saved to: feature_importance_plot.png


In [42]:
# Section banner for the submission step.
print("\n" + "=" * 80, "Creating Submission File", "=" * 80, sep="\n")


Creating Submission File


In [43]:
# One row per test prediction; Id is the zero-based position of the row.
# NOTE(review): assumes the expected submission ids are 0..n-1 in test-file
# order - confirm against the competition's sample submission.
submission = pd.DataFrame(
    dict(
        Id=range(len(test_predictions)),
        Predicted=test_predictions,
    )
)

In [44]:
# Write the submission CSV, then print its size and basic statistics of the
# predicted values.
submission.to_csv('submission_hyperopt.csv', index=False)
print(
    f"\nSubmission file saved: submission_hyperopt.csv",
    f"Number of predictions: {len(submission)}",
    f"\nPrediction Statistics:",
    f"  Min: ${submission['Predicted'].min():,.2f}",
    f"  Max: ${submission['Predicted'].max():,.2f}",
    f"  Mean: ${submission['Predicted'].mean():,.2f}",
    f"  Median: ${submission['Predicted'].median():,.2f}",
    sep="\n",
)


Submission file saved: submission_hyperopt.csv
Number of predictions: 50000

Prediction Statistics:
  Min: $165,228.66
  Max: $1,545,383.24
  Mean: $517,817.95
  Median: $487,617.17


In [47]:
# Write a plain-text report: the pooled 5-fold CV RMSE, the best value seen
# during the Optuna search, and the best trial's parameters.
with open('best_hyperparameters.txt', 'w') as f:
    f.writelines([
        "Best Hyperparameters from Optuna Optimization\n",
        "=" * 50 + "\n\n",
        f"CV RMSE: {overall_rmse:,.2f} SGD\n",
        f"Optuna Best Value: {study.best_value:,.2f} SGD\n\n",
        "Parameters:\n",
        *(f"  {key}: {value}\n" for key, value in study.best_params.items()),
    ])

In [48]:
# Console confirmation naming the file the hyperparameter report is written to.
print("\nBest parameters saved to: best_hyperparameters.txt")


Best parameters saved to: best_hyperparameters.txt


In [49]:
# Closing banner for the notebook run.
print("\n" + "=" * 80, "OPTIMIZATION COMPLETE!", "=" * 80, sep="\n")


OPTIMIZATION COMPLETE!
