In [1]:
# ONLY RUN THIS IF YOU'RE IN GOOGLE COLAB
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Thesis/Pintu-Air/notebooks')

# Verify you're in the right place
!pwd
!ls -la

Mounted at /content/drive
/content/drive/MyDrive/Thesis/Pintu-Air/notebooks
total 23640
-rw------- 1 root root 5517847 Jun 19 04:28 '01 Result Data Cleaning Part 1.csv'
-rw------- 1 root root 2009870 Jun 19 04:28 '02 All Data.csv'
-rw------- 1 root root 1591313 Jun 19 04:28 '02 Data Preperation.ipynb'
-rw------- 1 root root  256098 Jun 19 04:28 '02 X_test.csv'
-rw------- 1 root root 4831129 Jun 19 04:28 '02 X_train.csv'
-rw------- 1 root root   40580 Jun 19 04:28 '02 y_test.csv'
-rw------- 1 root root  770651 Jun 19 04:28 '02 y_train.csv'
-rw------- 1 root root 1657773 Jun 19 04:28  03_ARIMA.ipynb
-rw------- 1 root root    2708 Jun 19 04:28  03_arima_results_all_significant.csv
-rw------- 1 root root   49684 Jun 19 04:28  03_arima_results.csv
-rw------- 1 root root 1737590 Jun 19 04:28  04_XGBoost.ipynb
-rw------- 1 root root  950934 Jun 19 04:29 '04_XGBoost Reduced.ipynb'
-rw------- 1 root root 1844123 Jun 19 04:28  05_SARIMAX.ipynb
-rw------- 1 root root   89515 Jun 19 07:30  06b_LST

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import RobustScaler
from itertools import product
import os
import warnings
warnings.filterwarnings('ignore')

2025-06-19 21:08:25.544649: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-19 21:08:25.936655: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750342106.074931    1020 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750342106.113325    1020 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750342106.424253    1020 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# GPU Configuration
print("GPU Available:", tf.config.list_physical_devices('GPU'))

# ADD THIS BLOCK:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled")
    except RuntimeError as e:
        print(f"GPU setup error: {e}")

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU memory growth enabled


In [3]:
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
y_train = (pd.read_csv("02 y_train.csv", index_col='Tanggal')).values
y_test = (pd.read_csv("02 y_test.csv", index_col='Tanggal')).values

In [5]:
X = pd.read_csv("02 All Data.csv")
X['Tanggal'] = pd.to_datetime(X['Tanggal'])
X = X.set_index('Tanggal').iloc[24:]

# Feature Processing
categorical_cols = [col for col in X.columns if 'cuaca' in col]
numeric_cols = [col for col in X.columns if 'air' in col]

X_num = X[numeric_cols].copy()
X_cat = pd.DataFrame()

for col in categorical_cols:
    X_cat[f'{col}_hujan'] = X[col].isin(['Hujan', 'Gerimis']).astype(int)

In [7]:
# Train-Test Split
split_idx = int(len(X) * 0.95)
X_num_train, X_num_test = X_num[:split_idx], X_num[split_idx:]
X_cat_train, X_cat_test = X_cat[:split_idx], X_cat[split_idx:]

# Scaling
# scaler_X = MinMaxScaler()
# scaler_y = MinMaxScaler()
scaler_X = RobustScaler()
scaler_y = RobustScaler()

X_num_train_scaled = scaler_X.fit_transform(X_num_train)
X_num_test_scaled = scaler_X.transform(X_num_test)
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()


print(f"\n{'='*60}\nData Train and Test Distribution:\n{'='*60}")

print(f'Jumlah data: {len(X)}')

print(f'Jumlah data X train: {len(X_num_train)}')
print(f'Jumlah data X test: {len(X_num_test)}')

print(f'Jumlah data y train: {len(y_train)}')
print(f'Jumlah data y test: {len(y_test)}')


Data Train and Test Distribution:
Jumlah data: 31200
Jumlah data X train: 29640
Jumlah data X test: 1560
Jumlah data y train: 29640
Jumlah data y test: 1560


In [8]:
# Combine Features
X_train = np.concatenate([X_num_train_scaled, X_cat_train.values], axis=1)
X_test = np.concatenate([X_num_test_scaled, X_cat_test.values], axis=1)

print(f"Features: {X_train.shape[1]} total ({len(numeric_cols)} numeric, {len(categorical_cols)} categorical)")

Features: 6 total (3 numeric, 3 categorical)


In [9]:
def create_sequences(X, y, seq_len):
    X_seq, y_seq = [], []

    for i in range(seq_len, len(X)):
        # Take seq_len previous time steps as features
        X_seq.append(X[i-seq_len:i])
        # Current time step as target
        y_seq.append(y[i])

    return np.array(X_seq), np.array(y_seq)

In [10]:
def build_model(seq_len, n_features, units, dropout, lr):
    """Build LSTM model"""
    model = Sequential()

    for i, unit in enumerate(units):
        return_seq = i < len(units) - 1
        if i == 0:
            model.add(LSTM(unit, return_sequences=return_seq, input_shape=(seq_len, n_features)))
        else:
            model.add(LSTM(unit, return_sequences=return_seq))
        model.add(Dropout(dropout))

    model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return model

In [11]:
def evaluate_model(model, X_seq, y_true, scaler_y):
    """Evaluate model performance"""
    y_pred_scaled = model.predict(X_seq, verbose=0)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    return {
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'mape': mean_absolute_percentage_error(y_true, y_pred) * 100
    }

In [16]:
def save_learning_curves(history, combo_id, params):
    """Save learning curves"""
    os.makedirs('saved plots', exist_ok=True)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    ax1.plot(history.history['loss'], label='Train')
    ax1.plot(history.history['val_loss'], label='Validation')
    ax1.set_title(f'Loss - Combo {combo_id}')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)

    ax2.plot(history.history['mae'], label='Train')
    ax2.plot(history.history['val_mae'], label='Validation')
    ax2.set_title(f'MAE - Combo {combo_id}')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('MAE')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()

    filename = f"arobust_long_combo_{combo_id}_lr{params['learning_rate']}_drop{params['dropout_rate']}.png"
    plt.savefig(f"saved plots/{filename}", dpi=100, bbox_inches='tight')
    plt.close()

In [None]:
def lstm_grid_search(X_train, y_train_scaled, X_test, y_test, scaler_y, param_grid):
    """
    Perform LSTM grid search with comprehensive evaluation
    """
    param_combinations = list(product(*param_grid.values()))
    param_names = list(param_grid.keys())
    total_combinations = len(param_combinations)

    print(f"Starting grid search: {total_combinations} combinations")

    results = []
    best_score = float('inf')
    best_params = None

    for i, params in enumerate(param_combinations):
        param_dict = dict(zip(param_names, params))
        print(f"\nCombination {i+1}/{total_combinations}: {param_dict}")
        print(f"Progress: {(i+1)/total_combinations*100:.1f}%")

        try:
            # Create sequences
            X_seq, y_seq = create_sequences(X_train, y_train_scaled, param_dict['sequence_length'])

            # Train-validation split
            val_size = int(len(X_seq) * 0.2)
            X_train_fold = X_seq[:-val_size]
            y_train_fold = y_seq[:-val_size]
            X_val_fold = X_seq[-val_size:]
            y_val_fold = y_seq[-val_size:]

            # Build and train model
            model = build_model(
                param_dict['sequence_length'],
                X_seq.shape[2],
                param_dict['lstm_units'],
                param_dict['dropout_rate'],
                param_dict['learning_rate']
            )

            callbacks = [
                EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=0),
                ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=8, min_lr=0.00001, verbose=0)
            ]
            callbacks = [
                EarlyStopping(
                    monitor='val_loss',
                    patience=5,  # Much shorter patience (was 15-20)
                    restore_best_weights=True,
                    min_delta=0.0001
                ),
                ReduceLROnPlateau(
                    monitor='val_loss',
                    factor=0.3,  # More aggressive LR reduction
                    patience=3,  # Faster response
                    min_lr=0.00001
                )
            ]

            history = model.fit(
                X_train_fold, y_train_fold,
                epochs=param_dict['epochs'],
                batch_size=param_dict['batch_size'],
                validation_data=(X_val_fold, y_val_fold),
                callbacks=callbacks,
                verbose=1
            )

            # Save learning curves
            save_learning_curves(history, i+1, param_dict)

            # Evaluate on all sets (convert scaled targets back to original scale)
            y_train_fold_orig = scaler_y.inverse_transform(y_train_fold.reshape(-1, 1)).flatten()
            y_val_fold_orig = scaler_y.inverse_transform(y_val_fold.reshape(-1, 1)).flatten()

            train_metrics = evaluate_model(model, X_train_fold, y_train_fold_orig, scaler_y)
            val_metrics = evaluate_model(model, X_val_fold, y_val_fold_orig, scaler_y)

            # Test evaluation
            X_test_seq, _ = create_sequences(X_test, y_test.flatten(), param_dict['sequence_length'])
            y_test_actual = y_test[param_dict['sequence_length']:]
            test_metrics = evaluate_model(model, X_test_seq, y_test_actual, scaler_y)

            # Store results (including essential history)
            result = {
                **param_dict,
                'train_rmse': train_metrics['rmse'],
                'train_r2': train_metrics['r2'],
                'val_rmse': val_metrics['rmse'],
                'val_r2': val_metrics['r2'],
                'test_rmse': test_metrics['rmse'],
                'test_r2': test_metrics['r2'],
                'test_mae': test_metrics['mae'],
                'test_mape': test_metrics['mape'],
                'epochs_trained': len(history.history['loss']),
                'final_train_loss': history.history['loss'][-1],
                'final_val_loss': history.history['val_loss'][-1],
                'best_val_loss': min(history.history['val_loss'])
            }
            results.append(result)

            print(f"Results - Train RMSE: {train_metrics['rmse']:.4f}, Test RMSE: {test_metrics['rmse']:.4f}, R2: {test_metrics['r2']:.4f}")

            if test_metrics['rmse'] < best_score:
                best_score = test_metrics['rmse']
                best_params = param_dict.copy()
                print(f"New best model found!")

        except Exception as e:
            print(f"Error in combination {i+1}: {str(e)}")
            results.append({**param_dict, 'error': str(e), 'test_rmse': float('inf')})

    return results, best_params, best_score

In [17]:
# Parameter Grid
param_grid = {
    'sequence_length': [24],
    'lstm_units': [[64, 32, 16], [128, 64, 32]],
    'dropout_rate': [0.2],
    'learning_rate': [0.001, 0.005, 0.01],
    'batch_size': [64],
    'epochs': [50]
}

In [42]:
# Parameter Grid
param_grid = {
    'sequence_length': [36, 48, 72],
    'lstm_units': [[32], [64]],
    'dropout_rate': [0.3, 0.5],
    'learning_rate': [0.001],
    'batch_size': [64],
    'epochs': [30]
}

In [53]:
param_grid = {
    'sequence_length': [36],  # Best performer
    'lstm_units': [[64]],     # Best architecture
    'dropout_rate': [0.4, 0.5, 0.6],  # Higher dropout to reduce overfitting
    'learning_rate': [0.0005, 0.001],  # Slightly lower LR
    'batch_size': [32],       # Smaller batch for better generalization
    'epochs': [25]            # Fewer epochs with early stopping
}

In [19]:
param_grid = {
    # 1. SEQUENCE LENGTH (4 options)
    # Focus around the sweet spot but expand range
    'sequence_length': [24, 36, 48, 60],  # Add 60h for daily+weekly patterns
    
    # 2. ARCHITECTURE (3 options) 
    # Focus on best performers only
    'lstm_units': [
        [32],      # Simpler for less overfitting
        [64],      # Current best performer
        [48]       # Sweet spot between 32 and 64
    ],
    
    # 3. DROPOUT STRATEGY (5 options)
    # CRITICAL: Address overfitting with higher dropout
    'dropout_rate': [0.4, 0.5, 0.6, 0.7, 0.8],  # Push dropout higher!
    
    # 4. LEARNING RATE (1 option - FIXED)
    # Fix at optimal value to reduce combinations
    'learning_rate': [0.0005],  # Sweet spot from your results
    
    # 5. BATCH SIZE (1 option - fixed)
    'batch_size': [32],  # Smaller batch for better generalization
    
    # 6. EPOCHS (1 option - fixed) 
    'epochs': [40]  # Longer training with early stopping
}

In [20]:
# Run Grid Search
results, best_params, best_score = lstm_grid_search(X_train, y_train_scaled, X_test, y_test, scaler_y, param_grid)

print(f"\nGrid search completed. Best RMSE: {best_score:.4f}")
print(f"Best parameters: {best_params}")

Starting grid search: 60 combinations

Combination 1/60: {'sequence_length': 24, 'lstm_units': [32], 'dropout_rate': 0.4, 'learning_rate': 0.0005, 'batch_size': 32, 'epochs': 40}
Progress: 1.7%
Epoch 1/40
[1m741/741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 15ms/step - loss: 0.2751 - mae: 0.3322 - val_loss: 0.1859 - val_mae: 0.1487 - learning_rate: 5.0000e-04
Epoch 2/40
[1m741/741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - loss: 0.0979 - mae: 0.1731 - val_loss: 0.1746 - val_mae: 0.1194 - learning_rate: 5.0000e-04
Epoch 3/40
[1m741/741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 16ms/step - loss: 0.0850 - mae: 0.1539 - val_loss: 0.1720 - val_mae: 0.1110 - learning_rate: 5.0000e-04
Epoch 4/40
[1m741/741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - loss: 0.0798 - mae: 0.1433 - val_loss: 0.1707 - val_mae: 0.1143 - learning_rate: 5.0000e-04
Epoch 5/40
[1m741/741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 

In [21]:
results_df = pd.DataFrame([r for r in results if 'error' not in r])
results_df.to_csv('lstm_arobust_long_grid_search_results.csv', index=False)
print(f"Results saved to lstm_grid_search_results.csv")

Results saved to lstm_grid_search_results.csv


In [22]:
# Create sequences with best parameters
SEQUENCE_LENGTH = best_params['sequence_length']
X_seq_final, y_seq_final = create_sequences(X_train, y_train_scaled, SEQUENCE_LENGTH)

In [23]:
# Build final model with best parameters
final_model = build_model(
    best_params['sequence_length'],
    X_seq_final.shape[2],
    best_params['lstm_units'],
    best_params['dropout_rate'],
    best_params['learning_rate']
)

In [None]:
# Train on ALL training data (no validation split for final model)
# callbacks = [
#     EarlyStopping(monitor='loss', patience=20, restore_best_weights=True, verbose=1),
#     ReduceLROnPlateau(monitor='loss', factor=0.5, patience=10, min_lr=0.00001, verbose=1)
# ]

callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        min_delta=0.0001
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.3,
        patience=3,
        min_lr=0.00001
    )
]

In [25]:
final_history = final_model.fit(
    X_seq_final, y_seq_final,
    epochs=best_params['epochs'] + 20,  # Allow more epochs for final training
    batch_size=best_params['batch_size'],
    callbacks=callbacks,
    verbose=1
)

Epoch 1/60
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - loss: 0.2073 - mae: 0.2709 - learning_rate: 5.0000e-04
Epoch 2/60
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - loss: 0.1152 - mae: 0.1663 - learning_rate: 5.0000e-04
Epoch 3/60
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 17ms/step - loss: 0.1071 - mae: 0.1513 - learning_rate: 5.0000e-04
Epoch 4/60
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 18ms/step - loss: 0.1010 - mae: 0.1423 - learning_rate: 5.0000e-04
Epoch 5/60
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 17ms/step - loss: 0.0974 - mae: 0.1372 - learning_rate: 5.0000e-04
Epoch 6/60
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step - loss: 0.0974 - mae: 0.1349 - learning_rate: 5.0000e-04
Epoch 7/60
[1m926/926[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 20ms/step - loss: 0.0954 - mae: 0.1311 - learning_ra

In [None]:
# Training evaluation (convert scaled targets back to original scale)
y_train_actual = scaler_y.inverse_transform(y_seq_final.reshape(-1, 1)).flatten()
final_train_metrics = evaluate_model(final_model, X_seq_final, y_train_actual, scaler_y)

# Test evaluation
X_test_seq, _ = create_sequences(X_test, y_test.flatten(), SEQUENCE_LENGTH)
y_test_actual = y_test[SEQUENCE_LENGTH:]
final_test_metrics = evaluate_model(final_model, X_test_seq, y_test_actual, scaler_y)

print(f"\nFinal Model Performance:")
print(f"TRAINING SET:")
print(f"  Train RMSE: {final_train_metrics['rmse']:.4f}")
print(f"  Train R2: {final_train_metrics['r2']:.4f}")
print(f"  Train MAE: {final_train_metrics['mae']:.4f}")
print(f"  Train MAPE: {final_train_metrics['mape']:.4f}")

print(f"\nTEST SET:")
print(f"  Test RMSE: {final_test_metrics['rmse']:.4f}")
print(f"  Test R2: {final_test_metrics['r2']:.4f}")
print(f"  Test MAE: {final_test_metrics['mae']:.4f}")
print(f"  Test MAPE: {final_test_metrics['mape']:.4f}")

print(f"\nOVERFITTING ANALYSIS:")
print(f"  RMSE Difference (Train - Test): {final_train_metrics['rmse'] - final_test_metrics['rmse']:.4f}")
print(f"  R2 Difference (Train - Test): {final_train_metrics['r2'] - final_test_metrics['r2']:.4f}")
print(f"  MAE Difference (Train - Test): {final_train_metrics['mae'] - final_test_metrics['mae']:.4f}")
print(f"  MAPE Difference (Train - Test): {final_train_metrics['mape'] - final_test_metrics['mape']:.4f}")

# Save final model
final_model.save('best_lstm_model.keras')
print("\nFinal model saved as 'best_lstm_model.keras'")


Final Model Performance:
TRAINING SET:
  Train RMSE: 20.6354
  Train R2: 0.7919
  Train MAE: 10.5227
  Train MAPE: 2.4391

TEST SET:
  Test RMSE: 25.9417
  Test R2: 0.1516
  Test MAE: 9.8446
  Test MAPE: 2.7461

OVERFITTING ANALYSIS:
  RMSE Difference (Train - Test): -5.3063
  R2 Difference (Train - Test): 0.6403
  MAE Difference (Train - Test): 0.6781
  MAPE Difference (Train - Test): -0.3070

Final model saved as 'best_lstm_model.keras'


: 