In [1]:
# ONLY RUN THIS IF YOU'RE IN GOOGLE COLAB
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Thesis/Pintu-Air/notebooks')

# Verify you're in the right place
!pwd
!ls -la

Mounted at /content/drive
/content/drive/MyDrive/Thesis/Pintu-Air/notebooks
total 23638
-rw------- 1 root root 5517847 Jun 19 04:28 '01 Result Data Cleaning Part 1.csv'
-rw------- 1 root root 2009870 Jun 19 04:28 '02 All Data.csv'
-rw------- 1 root root 1591313 Jun 19 04:28 '02 Data Preperation.ipynb'
-rw------- 1 root root  256098 Jun 19 04:28 '02 X_test.csv'
-rw------- 1 root root 4831129 Jun 19 04:28 '02 X_train.csv'
-rw------- 1 root root   40580 Jun 19 04:28 '02 y_test.csv'
-rw------- 1 root root  770651 Jun 19 04:28 '02 y_train.csv'
-rw------- 1 root root 1657773 Jun 19 04:28  03_ARIMA.ipynb
-rw------- 1 root root    2708 Jun 19 04:28  03_arima_results_all_significant.csv
-rw------- 1 root root   49684 Jun 19 04:28  03_arima_results.csv
-rw------- 1 root root 1737590 Jun 19 04:28  04_XGBoost.ipynb
-rw------- 1 root root  950934 Jun 19 04:29 '04_XGBoost Reduced.ipynb'
-rw------- 1 root root 1844123 Jun 19 04:28  05_SARIMAX.ipynb
-rw------- 1 root root   87178 Jun 19 07:56  06b_LST

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

from itertools import product
import os
import warnings
warnings.filterwarnings('ignore')

In [3]:
# GPU Configuration
print("GPU Available:", tf.config.list_physical_devices('GPU'))

# ADD THIS BLOCK:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled")
    except RuntimeError as e:
        print(f"GPU setup error: {e}")

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU memory growth enabled


In [4]:
# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [5]:
y_train = (pd.read_csv("02 y_train.csv", index_col='Tanggal')).values
y_test = (pd.read_csv("02 y_test.csv", index_col='Tanggal')).values

In [17]:
all_df = pd.read_csv("02 All Data.csv")
all_df['Tanggal'] = pd.to_datetime(all_df['Tanggal'])

# Create new dataframe for lagged features
X = pd.DataFrame(index=all_df.index)

# Target variable (current Manggarai and 1 hour back)
X['Manggarai'] = all_df['Manggarai (air)']

# Cross-correlation lags for Depok (8 hours back)
X['Depok Lag 8'] = all_df['Depok (air)'].shift(8)

# Cross-correlation lags for Katulampa (11 hours back)
X['Katulampa Lag 11'] = all_df['Katulampa (air)'].shift(11)

X = X.iloc[24:]
del all_df

In [18]:
# Train-Test Split
split_idx = int(len(X) * 0.95)
X_train, X_test = X[:split_idx], X[split_idx:]

# Scaling
# scaler_X = MinMaxScaler()
# scaler_y = MinMaxScaler()
scaler_X = RobustScaler()
scaler_y = RobustScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()


print(f"\n{'='*60}\nData Train and Test Distribution:\n{'='*60}")

print(f'Jumlah data: {len(X)}')

print(f'Jumlah data X train: {len(X_train)}')
print(f'Jumlah data X test: {len(X_test)}')

print(f'Jumlah data y train: {len(y_train)}')
print(f'Jumlah data y test: {len(y_test)}')


Data Train and Test Distribution:
Jumlah data: 31200
Jumlah data X train: 29640
Jumlah data X test: 1560
Jumlah data y train: 29640
Jumlah data y test: 1560


In [8]:
def create_sequences(X, y, seq_len):
    X_seq, y_seq = [], []

    for i in range(seq_len, len(X)):
        # Take seq_len previous time steps as features
        X_seq.append(X[i-seq_len:i])
        # Current time step as target
        y_seq.append(y[i])

    return np.array(X_seq), np.array(y_seq)

In [9]:
def build_model(seq_len, n_features, units, dropout, lr):
    """Build LSTM model"""
    model = Sequential()

    for i, unit in enumerate(units):
        return_seq = i < len(units) - 1
        if i == 0:
            model.add(LSTM(unit, return_sequences=return_seq, input_shape=(seq_len, n_features)))
        else:
            model.add(LSTM(unit, return_sequences=return_seq))
        model.add(Dropout(dropout))

    model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer=Adam(learning_rate=lr), loss='mse', metrics=['mae'])
    return model

In [10]:
def evaluate_model(model, X_seq, y_true, scaler_y):
    """Evaluate model performance"""
    y_pred_scaled = model.predict(X_seq, verbose=0)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    return {
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
        'mape': mean_absolute_percentage_error(y_true, y_pred) * 100
    }

In [20]:
def save_learning_curves(history, combo_id, params):
    """Save learning curves"""
    os.makedirs('saved plots', exist_ok=True)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    ax1.plot(history.history['loss'], label='Train')
    ax1.plot(history.history['val_loss'], label='Validation')
    ax1.set_title(f'Loss - Combo {combo_id}')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)

    ax2.plot(history.history['mae'], label='Train')
    ax2.plot(history.history['val_mae'], label='Validation')
    ax2.set_title(f'MAE - Combo {combo_id}')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('MAE')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()

    filename = f"brobust_combo_{combo_id}_lr{params['learning_rate']}_drop{params['dropout_rate']}.png"
    plt.savefig(f"saved plots/{filename}", dpi=100, bbox_inches='tight')
    plt.close()

In [12]:
def lstm_grid_search(X_train, y_train_scaled, X_test, y_test, scaler_y, param_grid):
    """
    Perform LSTM grid search with comprehensive evaluation
    """
    param_combinations = list(product(*param_grid.values()))
    param_names = list(param_grid.keys())
    total_combinations = len(param_combinations)

    print(f"Starting grid search: {total_combinations} combinations")

    results = []
    best_score = float('inf')
    best_params = None

    for i, params in enumerate(param_combinations):
        param_dict = dict(zip(param_names, params))
        print(f"\nCombination {i+1}/{total_combinations}: {param_dict}")
        print(f"Progress: {(i+1)/total_combinations*100:.1f}%")

        try:
            # Create sequences
            X_seq, y_seq = create_sequences(X_train, y_train_scaled, param_dict['sequence_length'])

            # Train-validation split
            val_size = int(len(X_seq) * 0.2)
            X_train_fold = X_seq[:-val_size]
            y_train_fold = y_seq[:-val_size]
            X_val_fold = X_seq[-val_size:]
            y_val_fold = y_seq[-val_size:]

            # Build and train model
            model = build_model(
                param_dict['sequence_length'],
                X_seq.shape[2],
                param_dict['lstm_units'],
                param_dict['dropout_rate'],
                param_dict['learning_rate']
            )

            callbacks = [
                EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=0),
                ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=8, min_lr=0.00001, verbose=0)
            ]

            history = model.fit(
                X_train_fold, y_train_fold,
                epochs=param_dict['epochs'],
                batch_size=param_dict['batch_size'],
                validation_data=(X_val_fold, y_val_fold),
                callbacks=callbacks,
                verbose=1
            )

            # Save learning curves
            save_learning_curves(history, i+1, param_dict)

            # Evaluate on all sets (convert scaled targets back to original scale)
            y_train_fold_orig = scaler_y.inverse_transform(y_train_fold.reshape(-1, 1)).flatten()
            y_val_fold_orig = scaler_y.inverse_transform(y_val_fold.reshape(-1, 1)).flatten()

            train_metrics = evaluate_model(model, X_train_fold, y_train_fold_orig, scaler_y)
            val_metrics = evaluate_model(model, X_val_fold, y_val_fold_orig, scaler_y)

            # Test evaluation
            X_test_seq, _ = create_sequences(X_test, y_test.flatten(), param_dict['sequence_length'])
            y_test_actual = y_test[param_dict['sequence_length']:]
            test_metrics = evaluate_model(model, X_test_seq, y_test_actual, scaler_y)

            # Store results (including essential history)
            result = {
                **param_dict,
                'train_rmse': train_metrics['rmse'],
                'train_r2': train_metrics['r2'],
                'val_rmse': val_metrics['rmse'],
                'val_r2': val_metrics['r2'],
                'test_rmse': test_metrics['rmse'],
                'test_r2': test_metrics['r2'],
                'test_mae': test_metrics['mae'],
                'test_mape': test_metrics['mape'],
                'epochs_trained': len(history.history['loss']),
                'final_train_loss': history.history['loss'][-1],
                'final_val_loss': history.history['val_loss'][-1],
                'best_val_loss': min(history.history['val_loss'])
            }
            results.append(result)

            print(f"Results - Train RMSE: {train_metrics['rmse']:.4f}, Test RMSE: {test_metrics['rmse']:.4f}, R2: {test_metrics['r2']:.4f}")

            if test_metrics['rmse'] < best_score:
                best_score = test_metrics['rmse']
                best_params = param_dict.copy()
                print(f"New best model found!")

        except Exception as e:
            print(f"Error in combination {i+1}: {str(e)}")
            results.append({**param_dict, 'error': str(e), 'test_rmse': float('inf')})

    return results, best_params, best_score

In [21]:
# Parameter Grid
param_grid = {
    'sequence_length': [24],
    'lstm_units': [[32], [64]],
    'dropout_rate': [0.3, 0.5],
    'learning_rate': [0.001],
    'batch_size': [64],
    'epochs': [30]
}

In [22]:
# Run Grid Search
results, best_params, best_score = lstm_grid_search(X_train, y_train_scaled, X_test, y_test, scaler_y, param_grid)

print(f"\nGrid search completed. Best RMSE: {best_score:.4f}")
print(f"Best parameters: {best_params}")

Starting grid search: 4 combinations

Combination 1/4: {'sequence_length': 24, 'lstm_units': [32], 'dropout_rate': 0.3, 'learning_rate': 0.001, 'batch_size': 64, 'epochs': 30}
Progress: 25.0%
Epoch 1/30
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.6805 - mae: 0.6245 - val_loss: 0.3381 - val_mae: 0.3004 - learning_rate: 0.0010
Epoch 2/30
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.4618 - mae: 0.4926 - val_loss: 0.3318 - val_mae: 0.3043 - learning_rate: 0.0010
Epoch 3/30
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.4503 - mae: 0.4828 - val_loss: 0.3300 - val_mae: 0.2982 - learning_rate: 0.0010
Epoch 4/30
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - loss: 0.4486 - mae: 0.4803 - val_loss: 0.3322 - val_mae: 0.3003 - learning_rate: 0.0010
Epoch 5/30
[1m371/371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 0.4459 - m

In [23]:
results_df = pd.DataFrame([r for r in results if 'error' not in r])
results_df.to_csv('lstm_brobust_grid_search_results.csv', index=False)
print(f"Results saved to lstm_grid_search_results.csv")

Results saved to lstm_grid_search_results.csv


In [21]:
# Create sequences with best parameters
SEQUENCE_LENGTH = best_params['sequence_length']
X_seq_final, y_seq_final = create_sequences(X_train, y_train_scaled, SEQUENCE_LENGTH)

In [22]:
# Build final model with best parameters
final_model = build_model(
    best_params['sequence_length'],
    X_seq_final.shape[2],
    best_params['lstm_units'],
    best_params['dropout_rate'],
    best_params['learning_rate']
)

In [23]:
# Train on ALL training data (no validation split for final model)
callbacks = [
    EarlyStopping(monitor='loss', patience=20, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='loss', factor=0.5, patience=10, min_lr=0.00001, verbose=1)
]

In [24]:
final_history = final_model.fit(
    X_seq_final, y_seq_final,
    epochs=best_params['epochs'] + 20,  # Allow more epochs for final training
    batch_size=best_params['batch_size'],
    callbacks=callbacks,
    verbose=1
)

Epoch 1/70


KeyboardInterrupt: 

In [None]:
# Training evaluation (convert scaled targets back to original scale)
y_train_actual = scaler_y.inverse_transform(y_seq_final.reshape(-1, 1)).flatten()
final_train_metrics = evaluate_model(final_model, X_seq_final, y_train_actual, scaler_y)

# Test evaluation
X_test_seq, _ = create_sequences(X_test, y_test.flatten(), SEQUENCE_LENGTH)
y_test_actual = y_test[SEQUENCE_LENGTH:]
final_test_metrics = evaluate_model(final_model, X_test_seq, y_test_actual, scaler_y)

print(f"\nFinal Model Performance:")
print(f"TRAINING SET:")
print(f"  Train RMSE: {final_train_metrics['rmse']:.4f}")
print(f"  Train R2: {final_train_metrics['r2']:.4f}")
print(f"  Train MAE: {final_train_metrics['mae']:.4f}")
print(f"  Train MAPE: {final_train_metrics['mape']:.4f}")

print(f"\nTEST SET:")
print(f"  Test RMSE: {final_test_metrics['rmse']:.4f}")
print(f"  Test R2: {final_test_metrics['r2']:.4f}")
print(f"  Test MAE: {final_test_metrics['mae']:.4f}")
print(f"  Test MAPE: {final_test_metrics['mape']:.4f}")

print(f"\nOVERFITTING ANALYSIS:")
print(f"  RMSE Difference (Train - Test): {final_train_metrics['rmse'] - final_test_metrics['rmse']:.4f}")
print(f"  R2 Difference (Train - Test): {final_train_metrics['r2'] - final_test_metrics['r2']:.4f}")
print(f"  MAE Difference (Train - Test): {final_train_metrics['mae'] - final_test_metrics['mae']:.4f}")
print(f"  MAPE Difference (Train - Test): {final_train_metrics['mape'] - final_test_metrics['mape']:.4f}")

# Save final model
final_model.save('best_lstm_model.h5')
print("\nFinal model saved as 'best_lstm_model.h5'")




Final Model Performance:
TRAINING SET:
  Train RMSE: 44.6697
  Train R2: 0.0311
  Train MAE: 41.7557
  Train MAPE: 7.5235

TEST SET:
  Test RMSE: 52.4711
  Test R2: -2.4919
  Test MAE: 48.3058
  Test MAPE: 8.5214

OVERFITTING ANALYSIS:
  RMSE Difference (Train - Test): -7.8014
  R2 Difference (Train - Test): 2.5230
  MAE Difference (Train - Test): -6.5501
  MAPE Difference (Train - Test): -0.9979

Final model saved as 'best_lstm_model.h5'
