In [4]:

import argparse
import os

import pandas as pd
from pyspark.sql import SparkSession

from hotel_reservations.config import ProjectConfig
from hotel_reservations.data_preprocessor import DataProcessor
from marvelous.logging import setup_logging
from marvelous.timer import Timer

def main(config_path: str, data_path: str = "../data/data.csv", env: str = "dev") -> None:
    """Run the hotel cancellation data processing pipeline end to end.

    Loads the project configuration and the raw CSV, preprocesses the data,
    splits it into train and test sets, saves both sets to the catalog and
    enables the change data feed. A failure while loading the configuration
    or the data is reported on stdout and ends the run early; any other
    error propagates to the caller. The Spark session is stopped in every
    case.

    :param config_path: Path to the YAML configuration file
    :param data_path: Path to the CSV file holding the raw data
    :param env: Environment to use (dev, acc, prd)
    """
    # Set up logging
    setup_logging(log_file="logs/marvelous-1da.log")

    # Initialize Spark session
    # NOTE(review): under Databricks Connect this call raises
    # "RuntimeError: Only remote Spark sessions using Databricks Connect are
    # supported" and DatabricksSession.builder has to be used instead. That
    # class lives in a package this file does not import, so the switch is
    # left to be confirmed against the target runtime.
    spark = SparkSession.builder.getOrCreate()

    # Everything below runs under try/finally so the session is released on
    # early returns and on unexpected errors alike.
    try:
        # Load and validate the configuration, honouring the caller's
        # config_path and env rather than hard-coded values.
        try:
            config = ProjectConfig.from_yaml(config_path=config_path, env=env)
        except Exception as e:
            print(f"Error loading configuration: {e}")
            return
        print(f"Configuration loaded for environment: {env}")

        # Load the data
        try:
            raw_data = pd.read_csv(data_path)
        except Exception as e:
            print(f"Error loading data: {e}")
            return
        print(f"Data loaded from {data_path}, shape: {raw_data.shape}")

        # Build the processor and preprocess inside the timer context
        with Timer():
            processor = DataProcessor(raw_data, config, spark)

            print("Starting data preprocessing...")
            processor.preprocess()

        # Split the data into train and test sets
        print("Splitting data into train and test sets...")
        train_set, test_set = processor.split_data()
        # Reported with print, like every other message in this function;
        # no `logger` object is defined in this file.
        print(f"Training set shape: {train_set.shape}")
        print(f"Test set shape: {test_set.shape}")

        # Save the data to Databricks tables
        print("Saving data to Databricks catalog...")
        processor.save_to_catalog(train_set, test_set)

        # Enable change data feed
        print("Enabling Change Data Feed...")
        processor.enable_change_data_feed()

        print("Processing completed successfully!")
    finally:
        # Stop Spark session
        spark.stop()


def parse_cli_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the processing script.

    Every option is optional; the defaults are the values the script
    previously hard-coded, so running it without arguments behaves as before.
    Unrecognised arguments are ignored instead of aborting, so the script
    still starts when the launcher injects its own arguments (for example a
    notebook kernel's ``-f <connection file>``).

    :param argv: Argument list to parse; ``None`` means ``sys.argv[1:]``
    :return: Namespace with the attributes ``config``, ``data`` and ``env``
    """
    parser = argparse.ArgumentParser(description="Process hotel cancellation data")
    parser.add_argument(
        "--config",
        type=str,
        default="../project_config.yml",
        help="Path to configuration YAML file",
    )
    parser.add_argument(
        "--data",
        type=str,
        default="../data/data.csv",
        help="Path to data CSV file",
    )
    parser.add_argument(
        "--env",
        type=str,
        default="dev",
        choices=["dev", "acc", "prd"],
        help="Environment (dev, acc, prd)",
    )
    # parse_known_args tolerates arguments that belong to the launcher.
    args, _unknown = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":
    cli_args = parse_cli_args()
    # Keep the original module-level names so code that runs after this
    # block can still read them.
    config_path = cli_args.config
    data_path = cli_args.data
    env = cli_args.env

    main(config_path=config_path, data_path=data_path, env=env)

RuntimeError: Only remote Spark sessions using Databricks Connect are supported. Use DatabricksSession.builder to create a remote Spark session instead.
Refer to https://docs.databricks.com/dev-tools/databricks-connect.html on how to configure Databricks Connect.