In [13]:
# Re-importing necessary libraries as the session state was reset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the raw bookings table from the CSV next to the notebook.
file_path = 'hotel_dataset.csv'
data = pd.read_csv(file_path)

# Handling missing values.
# Each filled column is assigned back to `data` instead of calling
# `data[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, which pandas 2.x warns about and which, under Copy-on-Write
# (the pandas 3.0 default), leaves `data` unchanged -- so the NaNs would
# survive and later break model fitting.
#   children -> column median
#   country  -> most frequent value (first entry returned by mode())
#   agent, company -> 0 used as the placeholder for a missing value
data['children'] = data['children'].fillna(data['children'].median())
data['country'] = data['country'].fillna(data['country'].mode()[0])
data['agent'] = data['agent'].fillna(0)
data['company'] = data['company'].fillna(0)

# Categorical columns to expand into indicator (one-hot) columns.
categorical_columns = [
    'hotel',
    'arrival_date_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
]

# Columns removed from the encoded frame before modelling.
columns_to_drop = ['reservation_status', 'reservation_status_date']

# Encode and drop in one chain: get_dummies omits the first level of every
# encoded column (drop_first=True), then the unwanted columns are removed.
data_encoded = (
    pd.get_dummies(data, columns=categorical_columns, drop_first=True)
    .drop(columns=columns_to_drop)
)

# Subset of relevant features for modeling (all taken as-is from the encoded frame)
selected_features = [
    'lead_time', 'adults', 'children', 'stays_in_week_nights',
    'stays_in_weekend_nights', 'total_of_special_requests', 'adr'
]

# Downsample the dataset for manageable size: at most 5000 rows.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so the request is capped at the frame
# length; for frames with 5000 rows or more the sample is unchanged.
sample_size = min(5000, len(data_encoded))
sampled_data = data_encoded.sample(n=sample_size, random_state=42)

# Redefine features and target for the sampled dataset
X_sampled = sampled_data[selected_features]
y_sampled = sampled_data['is_canceled']

# Splitting the downsampled data: 80% train / 20% test, fixed seed for
# a reproducible partition.
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42
)

# Fit a logistic-regression classifier on the training part of the sample.
# fit() hands back the estimator, so construction and training are chained.
logistic_model_sampled = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_sampled, y_train_sampled
)

# Score the held-out part: overall accuracy plus the per-class text report.
y_pred_sampled = logistic_model_sampled.predict(X_test_sampled)
sampled_accuracy = accuracy_score(y_true=y_test_sampled, y_pred=y_pred_sampled)
sampled_classification_report = classification_report(
    y_true=y_test_sampled, y_pred=y_pred_sampled
)

# Final expression: both results are shown together as the cell output.
(sampled_accuracy, sampled_classification_report)


(0.687,
 '              precision    recall  f1-score   support\n\n           0       0.71      0.84      0.77       618\n           1       0.63      0.44      0.52       382\n\n    accuracy                           0.69      1000\n   macro avg       0.67      0.64      0.64      1000\nweighted avg       0.68      0.69      0.67      1000\n')

In [14]:
import joblib

# Serialize the fitted logistic-regression model to disk. The value returned
# by joblib.dump (the list of written file names) is the cell output.
model_path = "dic_bonus.pkl"
joblib.dump(logistic_model_sampled, model_path)

['dic_bonus.pkl']