The data ingestion stage verifies the connection to HDFS by reading sample data with Spark.

In [1]:
import os

In [2]:
%pwd

'/Users/macbookpro/Documents/sclable_ml_pipelines/scalable_ml_pipelines/research'

In [3]:
# Step up from research/ to the project root so relative paths such as
# config/config.yaml resolve. NOTE: not idempotent — re-running this cell
# moves up one more directory level.
os.chdir("../")

In [4]:
%pwd

'/Users/macbookpro/Documents/sclable_ml_pipelines/scalable_ml_pipelines'

The configuration holds the HDFS paths for our artifacts, root directory, and data file, plus the source URL of the dataset.

# Data Ingestion Configuration
# Base location for the project's artifacts.
artifacts_root: hdfs:///geekradius/used_cars_project/

data_ingestion:
    # NOTE(review): root_dir carries no hdfs:// scheme, unlike artifacts_root
    # and hdfs_data_file — confirm all three resolve to the same filesystem.
    root_dir: /geekradius/used_cars_project/data_ingestion/
    # CSV file that the ingestion stage reads through Spark.
    hdfs_data_file: hdfs://localhost:9000/geekradius/used_cars_project/used_cars_data.csv
    # Download link of the source dataset.
    source_URL: https://www.kaggle.com/datasets/ananaymital/us-used-cars-dataset/download?datasetVersionNumber=1

Entity

In [5]:
from dataclasses import dataclass

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable bundle of settings used by the data ingestion stage.

    The dataclass is frozen, so fields cannot be reassigned once an
    instance has been created.

    Attributes:
    - root_dir: Directory in which data ingestion artifacts are kept.
    - source_URL: URL the dataset is downloaded from.
    - hdfs_data_file: HDFS path of the ingested data file.
    """

    root_dir: str  # home of the data ingestion artifacts
    source_URL: str  # origin of the dataset
    hdfs_data_file: str  # HDFS location of the data file


ConfigurationManager

In [6]:
from us_used_cars_ml_pipeline.constants import *
from us_used_cars_ml_pipeline.utils.common import read_yaml
from us_used_cars_ml_pipeline import logger

from us_used_cars_ml_pipeline.entity.config_entity import DataIngestionConfig

class ConfigurationManager:
    """
    Reads the project's YAML files and turns them into configuration entities.

    Attributes:
        config: Result of ``read_yaml`` for the config file.
        params: Result of ``read_yaml`` for the params file.
        schema: Result of ``read_yaml`` for the schema file.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """
        Load the config, params and schema files, in that order.

        Args:
            config_filepath: Path of the config YAML file.
            params_filepath: Path of the params YAML file.
            schema_filepath: Path of the schema YAML file.

        Raises:
            Exception: Whatever ``read_yaml`` raised; it is logged first and
                then propagated unchanged.
        """
        self.config = self._load_yaml("config", config_filepath)
        self.params = self._load_yaml("params", params_filepath)
        self.schema = self._load_yaml("schema", schema_filepath)

    @staticmethod
    def _load_yaml(label, filepath):
        """Read one YAML file, logging which kind of file failed before re-raising."""
        try:
            return read_yaml(filepath)
        except Exception as err:
            logger.error(f"Error reading {label} file: {filepath}. Error: {err}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the data ingestion settings from the ``data_ingestion`` section
        of the loaded config.

        Returns:
            DataIngestionConfig: Entity carrying root_dir, source_URL and
            hdfs_data_file as found in the config.
        """
        section = self.config.data_ingestion
        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_URL=section.source_URL,
            hdfs_data_file=section.hdfs_data_file,
        )


Components

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
from us_used_cars_ml_pipeline.entity.config_entity import DataIngestionConfig
from us_used_cars_ml_pipeline import logger

class DataIngestion:
    """
    Component in charge of reading the project's data out of HDFS.

    It exposes a method that loads the configured HDFS file into a Spark
    DataFrame.

    Attributes:
        config (DataIngestionConfig): Settings that drive the ingestion.

    Note:
        More ingestion-related functionality is expected to be added to this
        class later on.
    """

    def __init__(self, config: DataIngestionConfig):
        """
        Store the configuration for later use.

        Args:
            config (DataIngestionConfig): Settings that drive the ingestion.
        """
        self.config = config

    def read_data_from_hdfs(self, spark: SparkSession) -> DataFrame:
        """
        Load the configured HDFS CSV file into a DataFrame and preview it.

        The file is read with a header row and schema inference enabled, and
        the first rows are printed via ``show()`` before the frame is returned.

        Args:
            spark (SparkSession): Session used to perform the read.

        Returns:
            DataFrame: The data read from ``config.hdfs_data_file``.

        Raises:
            Exception: Any failure while reading or showing the data; it is
                logged and then re-raised.
        """
        try:
            dataframe = spark.read.csv(
                self.config.hdfs_data_file,
                header=True,
                inferSchema=True,
            )
            dataframe.show()  # preview the leading rows
        except Exception as err:
            logger.error(f"Failed to read data from HDFS. Error: {err}")
            raise err
        else:
            return dataframe


Pipeline

In [9]:
from pyspark.sql import SparkSession
from us_used_cars_ml_pipeline.config.configuration import ConfigurationManager
from us_used_cars_ml_pipeline.components.data_ingestion import DataIngestion
from us_used_cars_ml_pipeline import logger
from us_used_cars_ml_pipeline.utils.common import get_spark_session


class DataIngestionTrainingPipeline:
    """
    Orchestrates the data ingestion stage of the training workflow: it loads
    the configuration, obtains a Spark session and reads the data from HDFS.
    """

    STAGE_NAME = "Data Ingestion State"

    def __init__(self):
        """Create the configuration manager used by this pipeline."""
        self.config_manager = ConfigurationManager()

    def initialize_spark_session(self) -> SparkSession:
        """Return the Spark session provided by ``get_spark_session``."""
        return get_spark_session()

    def run_data_ingestion(self):
        """
        Execute the ingestion steps: fetch config, build the component, read the data.

        Raises:
            Exception: Any failure in the steps above; it is logged with its
                traceback and then re-raised.
        """
        try:
            logger.info("Fetching data ingestion configuration...")
            ingestion_settings = self.config_manager.get_data_ingestion_config()

            logger.info("Initializing data ingestion process...")
            ingestion_step = DataIngestion(config=ingestion_settings)

            logger.info("Reading data file...")
            session = self.initialize_spark_session()
            ingestion_step.read_data_from_hdfs(session)
        except Exception as err:
            logger.exception("An error occurred during the data ingestion process.")
            raise err

    def run_pipeline(self):
        """
        Run the stage, logging a banner before it starts and after it completes.

        Raises:
            Exception: Propagated from ``run_data_ingestion``.
        """
        stage = DataIngestionTrainingPipeline.STAGE_NAME
        try:
            logger.info(f">>>>>> Stage: {stage} started <<<<<<")
            self.run_data_ingestion()
            logger.info(f">>>>>> Stage {stage} completed <<<<<< \n\nx==========x")
        except Exception as err:
            # run_data_ingestion has already logged the failure; just propagate it.
            raise err

# Entry point: build the pipeline and run the data ingestion stage.
if __name__ == '__main__':
    pipeline = DataIngestionTrainingPipeline()
    pipeline.run_pipeline()


[2023-09-24 14:30:30,919: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-09-24 14:30:30,927: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-09-24 14:30:30,928: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-09-24 14:30:30,930: 47: us_used_cars_ml_pipeline_logger: INFO: 4090757948:  >>>>>> Stage: Data Ingestion State started <<<<<<]
[2023-09-24 14:30:30,931: 29: us_used_cars_ml_pipeline_logger: INFO: 4090757948:  Fetching data ingestion configuration...]
[2023-09-24 14:30:30,931: 32: us_used_cars_ml_pipeline_logger: INFO: 4090757948:  Initializing data ingestion process...]
[2023-09-24 14:30:30,932: 35: us_used_cars_ml_pipeline_logger: INFO: 4090757948:  Reading data file...]


23/09/24 14:30:33 WARN Utils: Your hostname, Macbooks-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.100 instead (on interface en0)
23/09/24 14:30:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/24 14:30:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/24 14:31:34 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-----------------+------------+----+----------+----------+---------------+-----+--------+-----------------+--------------------+------------+----------+--------------------+----------------+-------------------+-----------+--------------------+-----+-------------+----------------+--------------+-------------+----------------+---------+-------------+-------+--------------------+----------+--------------------+-----+------------+------+------+---------+--------+--------+-----------+-------------+----------+---------+--------------------+--------------------+----------+---------------+-------+------------------+-----------+------------------+-------+-------+--------------+-------------+------+-------------------+-----------+--------------------+------------+--------------------+------+--------------------+-----------------------+------------+--------------------+---------+-------+----+
|              vin|back_legroom| bed|bed_height|bed_length|      body_type|cabin|    city|city_fuel_econ