In [15]:
import pandas as pd
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
#!pip install imbalanced-learn
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import NearMiss
#!pip install --upgrade imbalanced-learn
random_seed = 3

In [17]:
hotels = pd.read_csv('hotel_bookings.csv')

In [19]:
hotels = hotels.drop('is_canceled', axis=1)
hotels.head(5)

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,342,2015,July,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,737,2015,July,27,1,0,0,2,0.0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,7,2015,July,27,1,0,1,1,0.0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,13,2015,July,27,1,0,1,1,0.0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,14,2015,July,27,1,0,2,2,0.0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [21]:
drop_columns = ['arrival_date_month', 'hotel','country','reservation_status_date']

hotels = hotels.drop(labels=drop_columns, axis=1)

## Label Encoding

In [24]:
meal_mapping = {
    'Undefined': 0,
    'SC': 0,
    'BB': 1,
    'HB': 2,
    'FB': 3
}

# Apply the mapping to the 'meal' column
hotels['meal'] = hotels['meal'].map(meal_mapping)

# Check the updated DataFrame
print(hotels[['meal']].head(5))

   meal
0     1
1     1
2     1
3     1
4     1


In [26]:
status_mapping = {
    'Check-Out': 2,
    'Canceled': 1,
    'No-Show': 0
}

# Use the .replace() method to encode the reservation_status column
hotels['reservation_status_encoded'] = hotels['reservation_status'].replace(status_mapping)

# Check the result
print(hotels[['reservation_status', 'reservation_status_encoded']].head(10))


  reservation_status  reservation_status_encoded
0          Check-Out                           2
1          Check-Out                           2
2          Check-Out                           2
3          Check-Out                           2
4          Check-Out                           2
5          Check-Out                           2
6          Check-Out                           2
7          Check-Out                           2
8           Canceled                           1
9           Canceled                           1


## One-Hot Encoding

In [29]:
categorical_columns = hotels.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)

Index(['market_segment', 'distribution_channel', 'reserved_room_type',
       'assigned_room_type', 'deposit_type', 'customer_type',
       'reservation_status'],
      dtype='object')


In [31]:
hotels = pd.get_dummies(hotels, columns=categorical_columns, dtype=int)

In [33]:
# Remove target columns
remove_cols = ['is_canceled']

# Select training features
train_features = [x for x in hotels.columns if x not in remove_cols]


In [35]:
X = hotels[train_features].values  # Extract the feature columns
X = torch.tensor(X, dtype=torch.float32)  # Convert to tensor with float type
y = hotels['reservation_status_encoded'].values  # Extract the 'is_canceled' column
y = torch.tensor(y, dtype=torch.float32)  # Convert to tensor with float type
y = y.view(-1, 1)  # Reshape y to be of shape (n_samples, 1)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    X,  # Features
    y,  # Target labels
    test_size=0.2,  # 20% for testing
    random_state=random_seed  # Set random seed for reproducibility
)

In [39]:
print(X_train.shape)
print(y_train.shape)  # Should print something like (batch_size,)

torch.Size([95512, 66])
torch.Size([95512, 1])


In [75]:
# Step 1: Impute missing values (for training data)
X_train_cpu = X_train.cpu().numpy()
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_cpu)

# Step 2: Subset for testing purposes
X_train_subset = torch.tensor(X_train_imputed[:1000], dtype=torch.float32)
y_train_subset = y_train[:1000]

# Convert to NumPy arrays for NearMiss
X_train_subset_np = X_train_subset.numpy()
y_train_subset_np = y_train_subset.numpy()

# Step 3: Apply NearMiss (Undersampling)
near_miss = NearMiss(sampling_strategy='auto')
X_train_resampled, y_train_resampled = near_miss.fit_resample(X_train_subset_np, y_train_subset_np)

# Step 4: Convert the resampled data back to tensors
X_train_resampled_tensor = torch.tensor(X_train_resampled, dtype=torch.float32)
y_train_resampled_tensor = torch.tensor(y_train_resampled, dtype=torch.long)

# Step 5: Define the model
random_seed = 42
torch.manual_seed(random_seed)
model = nn.Sequential(
    nn.Linear(66, 36),  # Input layer to first hidden layer
    nn.ReLU(),
    nn.Linear(36, 18),  # First hidden layer to second hidden layer
    nn.ReLU(),
    nn.Linear(18, 3),  # Second hidden layer to output layer (3 classes)
)

# Step 6: Set the loss function and optimizer
loss = nn.CrossEntropyLoss()  # For multiclass classification
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Step 7: Training loop
num_epochs = 500
for epoch in range(num_epochs):
    y_pred = model(X_train_resampled_tensor)  # Forward pass
    loss_value = loss(y_pred, y_train_resampled_tensor)  # Compute loss

    optimizer.zero_grad()
    loss_value.backward()
    optimizer.step()

    # Calculate accuracy
    with torch.no_grad():
        y_pred_labels = torch.argmax(y_pred, dim=1)  # Get class with max probability
        correct_predictions = (y_pred_labels == y_train_resampled_tensor).float()
        accuracy = correct_predictions.sum() / len(correct_predictions)
    
    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss_value.item():.4f}, Accuracy: {accuracy.item()*100:.2f}%")


Epoch [100/500], Loss: 0.5633, Accuracy: 87.18%
Epoch [200/500], Loss: 0.3373, Accuracy: 94.87%
Epoch [300/500], Loss: 0.1613, Accuracy: 100.00%
Epoch [400/500], Loss: 0.0725, Accuracy: 100.00%
Epoch [500/500], Loss: 0.0371, Accuracy: 100.00%


In [79]:
model.eval()
with torch.no_grad():
    test_predictions = model(X_test) 
    test_predicted_labels = torch.argmax(test_predictions, dim=1).numpy()  # Shape [23878]
    accuracy = accuracy_score(y_test, test_predicted_labels)
    report = classification_report(y_test, test_predicted_labels)
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)


Accuracy: 0.0107
Classification Report:
              precision    recall  f1-score   support

         0.0       0.01      1.00      0.02       251
         1.0       0.00      0.00      0.00      8600
         2.0       0.83      0.00      0.00     15027

    accuracy                           0.01     23878
   macro avg       0.28      0.33      0.01     23878
weighted avg       0.52      0.01      0.00     23878



In [81]:
report = classification_report(y_test, test_predicted_labels)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.0107
Classification Report:
              precision    recall  f1-score   support

         0.0       0.01      1.00      0.02       251
         1.0       0.00      0.00      0.00      8600
         2.0       0.83      0.00      0.00     15027

    accuracy                           0.01     23878
   macro avg       0.28      0.33      0.01     23878
weighted avg       0.52      0.01      0.00     23878

