In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import itertools

In [2]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')


In [3]:
import os

# Directory to save the best model
model_save_path = 'best_model.pth'

In [4]:
# Check if CUDA (GPU) is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the dataset
file_path = "../data/cleaned.csv"
df = pd.read_csv(file_path)

# Define features and target
X = df.drop(columns=['is_canceled'])
y = df['is_canceled']

# List of categorical columns including numerical categories
categorical_columns = [
    'hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
    'distribution_channel', 'reserved_room_type', 'assigned_room_type',
    'deposit_type', 'customer_type', 'arrival_date_year',
    'arrival_date_week_number', 'arrival_date_day_of_month',
    'company', 'agent'
]

# List of true numerical columns
numerical_columns = [
    'lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
    'adults', 'children', 'babies', 'is_repeated_guest',
    'previous_cancellations', 'previous_bookings_not_canceled',
    'booking_changes', 'days_in_waiting_list', 'adr',
    'required_car_parking_spaces', 'total_of_special_requests'
]

# Define the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns)
    ]
)

# Fit the preprocessor on the entire dataset to capture all categories
preprocessor.fit(X)

# Transform the entire dataset
X_preprocessed = preprocessor.transform(X)



Using device: cuda


In [5]:
# Split the dataset into training (60%), validation (10%), testing (10%), and incremental learning (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X_preprocessed, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)
X_test, X_incremental, y_test, y_incremental = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

# Apply SMOTE to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [6]:
# Function to create the model
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim, hidden_layer_sizes, dropout):
        super(NeuralNetwork, self).__init__()
        self.layers = nn.ModuleList()
        prev_size = input_dim
        for size in hidden_layer_sizes:
            self.layers.append(nn.Linear(prev_size, size))
            self.layers.append(nn.LeakyReLU())
            self.layers.append(nn.Dropout(dropout))
            prev_size = size
        self.output = nn.Linear(prev_size, 1)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        x = torch.sigmoid(self.output(x))
        return x

# Function to train the model
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

# Function to validate the model
def validate(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            preds = outputs.round()
            correct += preds.eq(labels).sum().item()
    accuracy = correct / len(val_loader.dataset)
    return running_loss / len(val_loader), accuracy


In [7]:
# Define hyperparameter grid
param_grid = {
    'hidden_layer_sizes': [(512, 256, 128), (1024, 512, 256), (512, 512)],
    'dropout': [0.3, 0.5, 0.7],
    'learning_rate': [0.0001, 0.001, 0.01],
    'batch_size': [32, 64, 128]
}
# Define Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5)

In [8]:
# Perform grid search for hyperparameter tuning
best_accuracy = 0.0
best_params = None

for hidden_layer_sizes, dropout, learning_rate, batch_size in itertools.product(
    param_grid['hidden_layer_sizes'], param_grid['dropout'], param_grid['learning_rate'], param_grid['batch_size']
):
    print(f"Training with params: {hidden_layer_sizes}, {dropout}, {learning_rate}, {batch_size}")

    fold_accuracies = []

    for fold, (train_index, val_index) in enumerate(skf.split(X_train_resampled, y_train_resampled)):
        print(f"  Fold {fold + 1}/{skf.n_splits}")

        X_train_fold, X_val_fold = X_train_resampled[train_index], X_train_resampled[val_index]
        y_train_fold, y_val_fold = y_train_resampled[train_index], y_train_resampled[val_index]

        # Convert to PyTorch tensors
        X_train_tensor = torch.tensor(X_train_fold, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1)
        X_val_tensor = torch.tensor(X_val_fold, dtype=torch.float32)
        y_val_tensor = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1)

        # Create DataLoader
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model, loss function, and optimizer
        input_dim = X_train_tensor.shape[1]
        model = NeuralNetwork(input_dim, hidden_layer_sizes, dropout).to(device)
        criterion = nn.BCELoss().to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)

        # Training loop
        num_epochs = 100
        patience = 5
        patience_counter = 0
        best_val_loss = float('inf')

        for epoch in range(num_epochs):
            train_loss = train(model, train_loader, criterion, optimizer, device)
            val_loss, val_accuracy = validate(model, val_loader, criterion, device)
            print(f"    Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

            scheduler.step(val_loss)  # Step the scheduler

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("    Early stopping triggered")
                    break

        fold_accuracies.append(val_accuracy)

    mean_fold_accuracy = np.mean(fold_accuracies)
    print(f"Mean fold accuracy for params {hidden_layer_sizes}, {dropout}, {learning_rate}, {batch_size}: {mean_fold_accuracy}")

    if mean_fold_accuracy > best_accuracy:
        best_accuracy = mean_fold_accuracy
        best_params = (hidden_layer_sizes, dropout, learning_rate, batch_size)
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')

print(f"Best accuracy: {best_accuracy}")
print(f"Best params: {best_params}")


Training with params: (512, 256, 128), 0.3, 0.0001, 32
  Fold 1/5
    Epoch 1/100, Train Loss: 0.3706, Val Loss: 0.3337, Val Accuracy: 0.8428
    Epoch 2/100, Train Loss: 0.2962, Val Loss: 0.3138, Val Accuracy: 0.8543
    Epoch 3/100, Train Loss: 0.2695, Val Loss: 0.3083, Val Accuracy: 0.8569
    Epoch 4/100, Train Loss: 0.2512, Val Loss: 0.3002, Val Accuracy: 0.8616
    Epoch 5/100, Train Loss: 0.2349, Val Loss: 0.3023, Val Accuracy: 0.8620
    Epoch 6/100, Train Loss: 0.2197, Val Loss: 0.2982, Val Accuracy: 0.8706
    Epoch 7/100, Train Loss: 0.2073, Val Loss: 0.2993, Val Accuracy: 0.8708
    Epoch 8/100, Train Loss: 0.1948, Val Loss: 0.3065, Val Accuracy: 0.8699
    Epoch 9/100, Train Loss: 0.1840, Val Loss: 0.3105, Val Accuracy: 0.8730
    Epoch 10/100, Train Loss: 0.1764, Val Loss: 0.3055, Val Accuracy: 0.8743
    Epoch 11/100, Train Loss: 0.1540, Val Loss: 0.3102, Val Accuracy: 0.8777
    Early stopping triggered
  Fold 2/5
    Epoch 1/100, Train Loss: 0.3747, Val Loss: 0.3221, V

In [20]:
best_params

((1024, 512, 256), 0.3, 0.001, 64)