In [None]:
'''
This is an example of a Python script that processes hotel reservation data using src/hotel_reservations/data_preprocessor.py.
This was developed for Deliverable 1 of the MarvelousMLOps project.
'''
from loguru import logger
import pandas as pd
from pyspark.sql import SparkSession

from hotel_reservations.config import ProjectConfig
from hotel_reservations.data_preprocessor import DataProcessor
from marvelous.logging import setup_logging
from marvelous.timer import Timer

def main(config_path: str, data_path: str = "../data/data.csv", env: str = "dev") -> None:
    """Run the hotel reservation preprocessing pipeline end to end.

    Loads the project configuration and the raw CSV, hands both to
    ``DataProcessor`` together with a Spark session, then calls its
    ``preprocess``, ``split_data``, ``save_to_catalog`` and
    ``enable_change_data_feed`` steps in that order.

    Configuration and data loading failures are reported on stdout and end
    the run early without raising. Exceptions from the processing steps
    propagate to the caller. The Spark session is stopped in every case.

    :param config_path: Path to the YAML configuration file
    :param data_path: Path to the CSV file holding the raw reservation data
    :param env: Environment to use (dev, acc, prd)
    """
    # Set up logging
    setup_logging(log_file="logs/marvelous-1da.log")

    # Initialize Spark session
    spark = SparkSession.builder.getOrCreate()

    # Everything after session creation runs under try/finally so the session
    # is stopped on early returns and on unexpected exceptions alike.
    try:
        # Load and validate the configuration using the caller's arguments
        # (not hard-coded values), inside the error handling.
        try:
            config = ProjectConfig.from_yaml(config_path=config_path, env=env)
        except Exception as e:
            print(f"Error loading configuration: {e}")
            return
        print(f"Configuration loaded for environment: {env}")

        # Load the data
        try:
            raw_data = pd.read_csv(data_path)
        except Exception as e:
            print(f"Error loading data: {e}")
            return
        print(f"Data loaded from {data_path}, shape: {raw_data.shape}")

        # Initialize the DataProcessor class that handles the data processing
        processor = DataProcessor(raw_data, config, spark)

        # Preprocess the data
        print("Starting data preprocessing...")
        processor.preprocess()

        # Split the data into train and test sets
        print("Splitting data into train and test sets...")
        train_set, test_set = processor.split_data()
        # loguru formats messages with str.format-style "{}" placeholders;
        # "%s" would be emitted literally and the shape dropped.
        logger.info("Training set shape: {}", train_set.shape)
        logger.info("Test set shape: {}", test_set.shape)

        # Save the data to Databricks tables
        print("Saving data to Databricks catalog...")
        processor.save_to_catalog(train_set, test_set)

        # Enable change data feed
        print("Enabling Change Data Feed...")
        processor.enable_change_data_feed()

        print("Processing completed successfully!")
    finally:
        # Stop Spark session
        spark.stop()

# Run parameters for this cell: configuration file, raw data file and target
# environment passed to main().
config_path = "../project_config.yml"
data_path = "../data/data.csv"
env = "dev"

# Guard the entry point so the pipeline runs when this code is executed
# directly (script or notebook cell) but not when it is merely imported.
if __name__ == "__main__":
    main(config_path=config_path, data_path=data_path, env=env)

Configuration loaded for environment: dev
Data loaded from ../data/data.csv, shape: (36275, 19)
Starting data preprocessing...
Preprocessing complete. DataFrame shape: (36275, 27)
Numeric features: ['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights', 'required_car_parking_space', 'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 'no_of_special_requests', 'total_nights', 'total_guests', 'price_per_person', 'price_per_night', 'with_children', 'has_weekend_stay']
Categorical features: ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'booking_season']
Splitting data into train and test sets...
Data split complete. Train set: (29020, 27), Test set: (7255, 27)
[32m2025-05-21 11:19:38[0m | [1mINFO[0m | [36m865111206[0m:[36mmain[0m - [1mTraining set shape: %s[0m
[32m2025-05-21 11:19:38[0m | [1mINFO[0m | [36m86511

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df['no_of_children'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df['no_of_previous_cancellations'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

Data saved to tables: mlops_dev.dylanaus.train_set and mlops_dev.dylanaus.test_set
Enabling Change Data Feed...
Change Data Feed enabled for train and test set tables
Processing completed successfully!
