In [1]:
import pandas as pd
import numpy as np
import torch, torch.nn as nn, torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
#!pip install imbalanced-learn
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
random_seed = 3

In [3]:
hotels = pd.read_csv('hotel_bookings.csv')

In [4]:
print(hotels['is_canceled'].value_counts(1))

is_canceled
0    0.629584
1    0.370416
Name: proportion, dtype: float64


In [5]:
print(hotels['reservation_status'].value_counts(1))

reservation_status
Check-Out    0.629584
Canceled     0.360307
No-Show      0.010110
Name: proportion, dtype: float64


In [6]:
grouped_canc_per_month = hotels.groupby('arrival_date_month')['is_canceled'].mean()
grouped_canc_per_month_sorted = grouped_canc_per_month.sort_values()

In [11]:
grouped_per_month = hotels.groupby('arrival_date_month')['hotel'].count()
grouped_per_month_sorted = grouped_per_month.sort_values()

In [13]:
print((100*grouped_canc_per_month_sorted/grouped_per_month_sorted).sort_values())

arrival_date_month
August       0.002721
July         0.002958
March        0.003283
May          0.003364
October      0.003409
April        0.003679
September    0.003728
June         0.003790
February     0.004142
November     0.004597
January      0.005140
December     0.005158
dtype: float64


In [15]:
hotels = hotels.drop('reservation_status', axis=1)
hotels.head(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,3,No Deposit,,,0,Transient,0.0,0,0,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,4,No Deposit,,,0,Transient,0.0,0,0,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,0,No Deposit,,,0,Transient,75.0,0,0,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,0,No Deposit,304.0,,0,Transient,75.0,0,0,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,0,No Deposit,240.0,,0,Transient,98.0,0,1,2015-07-03


In [17]:
drop_columns = ['arrival_date_month', 'hotel','country','reservation_status_date']

hotels = hotels.drop(labels=drop_columns, axis=1)

## Label Encoding

In [20]:
meal_mapping = {
    'Undefined': 0,
    'SC': 0,
    'BB': 1,
    'HB': 2,
    'FB': 3
}

# Apply the mapping to the 'meal' column
hotels['meal'] = hotels['meal'].map(meal_mapping)

# Check the updated DataFrame
print(hotels[['meal']].head(5))

   meal
0     1
1     1
2     1
3     1
4     1


## One-Hot Encoding

In [23]:
categorical_columns = hotels.select_dtypes(include=['object', 'category'])
# Display the categorical columns
print(categorical_columns.columns)
print(hotels.columns)

Index(['market_segment', 'distribution_channel', 'reserved_room_type',
       'assigned_room_type', 'deposit_type', 'customer_type'],
      dtype='object')
Index(['is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_week_number', 'arrival_date_day_of_month',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children',
       'babies', 'meal', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests'],
      dtype='object')


In [25]:
# Check the data types of the columns in the hotels DataFrame
print(hotels.dtypes)
# Convert to categorical (if they are not already)
categorical_columns = hotels.select_dtypes(include=['object']).columns

is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                                int64
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             float64
company                           float64
days_in_waiting_list                int64
customer_type                     

In [27]:
hotels = pd.get_dummies(hotels, columns=categorical_columns, dtype=int)
# Display the updated DataFrame
print(hotels.head(5))

   is_canceled  lead_time  arrival_date_year  arrival_date_week_number  \
0            0        342               2015                        27   
1            0        737               2015                        27   
2            0          7               2015                        27   
3            0         13               2015                        27   
4            0         14               2015                        27   

   arrival_date_day_of_month  stays_in_weekend_nights  stays_in_week_nights  \
0                          1                        0                     0   
1                          1                        0                     0   
2                          1                        0                     1   
3                          1                        0                     1   
4                          1                        0                     2   

   adults  children  babies  ...  assigned_room_type_K  assigned_room_type_L  \


In [28]:
# Remove target columns
remove_cols = ['is_canceled']

# Select training features
train_features = [x for x in hotels.columns if x not in remove_cols]


In [29]:
X = hotels[train_features].values  # Extract the feature columns
X = torch.tensor(X, dtype=torch.float32)  # Convert to tensor with float type
y = hotels['is_canceled'].values  # Extract the 'is_canceled' column
y = torch.tensor(y, dtype=torch.float32)  # Convert to tensor with float type
y = y.view(-1, 1)  # Reshape y to be of shape (n_samples, 1)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X,  # Features
    y,  # Target labels
    test_size=0.2,  # 20% for testing
    random_state=random_seed  # Set random seed for reproducibility
)

In [34]:
print(X_train.shape) 

torch.Size([95512, 62])


In [37]:
# Set the random seed for reproducibility
torch.manual_seed(random_seed)
model = nn.Sequential(
    nn.Linear(62, 36),  # Input layer (65 nodes) to first hidden layer (36 nodes)
    nn.ReLU(),          # ReLU activation function
    nn.Linear(36, 18),  # First hidden layer (36 nodes) to second hidden layer (18 nodes)
    nn.ReLU(),          # ReLU activation function
    nn.Linear(18, 1),   # Second hidden layer (18 nodes) to output layer (1 node)
    nn.Sigmoid()        # Sigmoid activation function for binary classification
)

# Display the model architecture
print(model)
# Set the binary cross-entropy loss function
loss = nn.BCEWithLogitsLoss()

# Set the Adam optimizer with a learning rate of 0.005
optimizer = optim.Adam(model.parameters(), lr=0.005)


Sequential(
  (0): Linear(in_features=62, out_features=36, bias=True)
  (1): ReLU()
  (2): Linear(in_features=36, out_features=18, bias=True)
  (3): ReLU()
  (4): Linear(in_features=18, out_features=1, bias=True)
  (5): Sigmoid()
)


In [38]:
# Number of epochs
num_epochs = 1000

# Training loop
for epoch in range(num_epochs):
    y_pred = model(X_train)  # Forward pass through the model
    loss_value = loss(y_pred, y_train)  # Calculate binary cross-entropy loss
    optimizer.zero_grad()  # Zero the gradients before the backward pass
    loss_value.backward()  # Backpropagate the loss to compute gradients
    optimizer.step()  # Apply the gradients to update the model parameters
    with torch.no_grad():
        y_pred_labels = (torch.sigmoid(y_pred) >= 0.5).float()  # Threshold at 0.5 to get binary labels
        correct_predictions = (y_pred_labels == y_train).float()  # Compare predictions to true labels
        accuracy = correct_predictions.sum() / len(correct_predictions)  # Accuracy as percentage
    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss_value.item():.4f}, Accuracy: {accuracy.item()*100:.2f}%")


Epoch [100/1000], Loss: nan, Accuracy: 62.96%
Epoch [200/1000], Loss: nan, Accuracy: 62.96%
Epoch [300/1000], Loss: nan, Accuracy: 62.96%
Epoch [400/1000], Loss: nan, Accuracy: 62.96%
Epoch [500/1000], Loss: nan, Accuracy: 62.96%
Epoch [600/1000], Loss: nan, Accuracy: 62.96%
Epoch [700/1000], Loss: nan, Accuracy: 62.96%
Epoch [800/1000], Loss: nan, Accuracy: 62.96%
Epoch [900/1000], Loss: nan, Accuracy: 62.96%
Epoch [1000/1000], Loss: nan, Accuracy: 62.96%


In [39]:
# Set the model to evaluation mode
model.eval()

# Turn off gradient calculation (no need to calculate gradients during evaluation)
with torch.no_grad():
    # Generate predicted probabilities on X_test
    test_predictions = model(X_test)  # Forward pass on the test set
    
    # Convert predicted probabilities to binary labels using a threshold of 0.5
    test_predicted_labels = (torch.sigmoid(test_predictions) >= 0.5).float()


In [40]:
print(test_predicted_labels[:10])

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])


In [41]:
test_predicted_labels = test_predicted_labels.view(-1).numpy()  # Convert tensor to numpy array
y_test = y_test.view(-1).numpy()  # Convert tensor to numpy array (true labels)

# Compute the overall accuracy
accuracy = accuracy_score(y_test, test_predicted_labels)

# Compute the precision, recall, and F1 score
report = classification_report(y_test, test_predicted_labels)

# Print the accuracy and classification report
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.6293
Classification Report:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77     15027
         1.0       0.00      0.00      0.00      8851

    accuracy                           0.63     23878
   macro avg       0.31      0.50      0.39     23878
weighted avg       0.40      0.63      0.49     23878



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
