In [1]:
import os

In [17]:
# Switch the working directory to the project root so the relative paths used
# later in this notebook (config/config.yaml, artifacts/...) resolve.
# The target is itself relative to the current directory, so re-running this
# cell in the same kernel resolves it against the new location, not the original one.
os.chdir("../semantic_preprocessor_model/semantic_preprocessor_model")


In [18]:
# Show the current working directory to confirm the chdir above took effect.
%pwd

'/Users/macbookpro/Documents/semantic_preprocessor_model/semantic_preprocessor_model'

# config/config.yaml


In [None]:
# Root directory for all artifacts (relative to the project root / working directory)
artifacts_root: artifacts

# Configuration related to data ingestion
data_ingestion:

  # Directory where data ingestion artifacts are stored
  root_dir: artifacts/data_ingestion

  # Path to the local file where the data is already saved.
  # Kept relative to the project root, like the other paths in this file, so the
  # configuration is not tied to one user's home directory.
  local_data_file: data/data.csv

# Entity

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Immutable bundle of settings consumed by the data ingestion stage.

    Instances are frozen: fields cannot be reassigned after construction.

    Attributes:
    - root_dir: Folder that receives the data ingestion artifacts.
    - local_data_file: Location of the data file that already exists on disk.
    """
    # Destination folder for data ingestion artifacts
    root_dir: Path
    # Source data file that is already available locally
    local_data_file: Path

# Constants

In [4]:
from pathlib import Path

# YAML files read by the pipeline. All three are relative paths, so they are
# looked up from the current working directory.
CONFIG_FILE_PATH = Path("config", "config.yaml")  # main configuration file
PARAMS_FILE_PATH = Path("params.yaml")            # parameters file
SCHEMA_FILE_PATH = Path("schema.yaml")            # schema definition file

# Configuration Manager

In [24]:
from src.semantic_preprocessor_model.constants import *
from src.semantic_preprocessor_model.utils.common import read_yaml, create_directories
from src.semantic_preprocessor_model import logger
from src.semantic_preprocessor_model.entity.config_entity import (DataIngestionConfig)


class ConfigurationManager:
    """
    ConfigurationManager manages configurations needed for the data pipeline.

    The class reads configuration, parameter, and schema settings from the given
    YAML files, creates the top-level artifacts directory, and exposes methods
    that turn sections of the configuration into typed config objects.

    Attributes:
    - config: Parsed contents of the configuration file.
    - params: Parsed contents of the parameters file.
    - schema: Parsed contents of the schema file.

    NOTE(review): the parsed objects are accessed by attribute (for example
    ``self.config.artifacts_root``), so ``read_yaml`` presumably returns an
    attribute-accessible mapping rather than a plain ``dict`` -- confirm
    against ``utils.common.read_yaml``.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH) -> None:
        """
        Initialize ConfigurationManager with configurations, parameters, and schema.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the schema file.

        Creates:
        - The ``artifacts_root`` directory named in the configuration file.

        Raises:
        - Exception: Whatever ``read_yaml`` raises when a file cannot be read
          (logged, then re-raised).
        """
        # The second argument is only a label used in the error log message.
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "initial_schema")

        # Create the directory for storing artifacts if it doesn't exist
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: Path, config_name: str):
        """
        Read a YAML file via ``read_yaml`` and return its parsed content.

        Args:
        - filepath (Path): Path to the file to read.
        - config_name (str): Name of the configuration (for logging purposes).

        Returns:
        - The object produced by ``read_yaml`` for this file.

        Raises:
        - Exception: If there's an error reading the file. The error is logged
          and the original exception is re-raised unchanged.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Extract and return data ingestion configurations as a DataIngestionConfig object.

        Reads the ``data_ingestion`` section of the configuration, creates its
        ``root_dir`` directory, and returns the section's settings as paths.

        Returns:
        - DataIngestionConfig: Object containing data ingestion configuration settings.

        Raises:
        - AttributeError: If the 'data_ingestion' section, or its 'root_dir' or
          'local_data_file' entry, does not exist in the config file.
        """
        # Each lookup gets its own try block so the logged message names what
        # is actually missing, and errors raised by create_directories are not
        # misreported as a missing section.
        try:
            section = self.config.data_ingestion
        except AttributeError:
            logger.error("The 'data_ingestion' attribute does not exist in the config file.")
            raise

        try:
            root_dir = section.root_dir
            local_data_file = section.local_data_file
        except AttributeError as e:
            logger.error(f"The 'data_ingestion' section of the config file is missing a required setting: {e}")
            raise

        # Create the root directory for data ingestion if it doesn't already exist
        create_directories([root_dir])

        return DataIngestionConfig(
            root_dir=Path(root_dir),
            local_data_file=Path(local_data_file),
        )

# Component

In [20]:
import os
import shutil
from src.semantic_preprocessor_model import logger
from src.semantic_preprocessor_model.utils.common import get_size
# from semantic_preprocessor_model.entity.config_entity import DataIngestionConfig

class DataIngestion:
    """
    DataIngestion handles the process of transferring data from a local path
    to the project's artifact directory.

    The class currently assumes that the data is already present locally,
    and focuses on copying this data to the configured directory.

    Attributes:
    - config (DataIngestionConfig): Configuration settings for data ingestion.
    """

    def __init__(self, config: DataIngestionConfig):
        """
        Initialize the DataIngestion component.

        Args:
        - config (DataIngestionConfig): Configuration settings for data ingestion.
        """
        self.config = config

    def download_data(self):
        """
        Placeholder for downloading data functionality.
        Currently, data is assumed to be locally available.
        """

    def extract_zip_file(self):
        """
        Placeholder for extracting zip files.
        If the data comes as a zip file, this method can be used to extract it.
        """

    def transfer_data(self) -> None:
        """
        Copy the local data file into the data ingestion artifact directory.

        The destination directory is created if needed, and the file is copied
        with ``shutil.copy2`` (the source file is left in place).

        Raises:
        - FileNotFoundError: If the configured local data path is not an existing file.
        """
        root_dir = Path(self.config.root_dir)
        local_data_path = Path(self.config.local_data_file)

        # is_file() rather than exists(): a directory at this path would pass an
        # exists() check and then fail inside shutil.copy2 with an unrelated error.
        if not local_data_path.is_file():
            logger.error(f"Local data file not found at {local_data_path}.")
            raise FileNotFoundError(f"No file found at {local_data_path}")

        # Get the file size using the utility function (used only in the log line below)
        file_size = get_size(local_data_path)

        # Ensure the transfer directory exists
        os.makedirs(root_dir, exist_ok=True)

        # Copy the file into root_dir, keeping its original file name
        shutil.copy2(local_data_path, root_dir)
        logger.info(f"Data transferred from {local_data_path} to {root_dir}. File size: {file_size}.")

# Pipeline

In [25]:
# ConfigurationManager and DataIngestion are defined in the cells above, so no import is needed here.
# (The commented-out imports previously on these lines pointed at an unrelated package, predicting_publications.)
from src.semantic_preprocessor_model import logger

class DataIngestionPipeline:
    """
    Runs the data ingestion stage end to end.

    Builds a ConfigurationManager on construction, then uses it to configure
    and run a DataIngestion component, logging the start and end of the stage.
    """

    STAGE_NAME = "Data Ingestion Stage"

    def __init__(self):
        # Configuration files are loaded here, at construction time.
        self.config_manager = ConfigurationManager()

    def run_data_ingestion(self):
        """
        Fetch the ingestion settings and copy the data file into the artifacts folder.

        Any exception is logged with its traceback and then re-raised.
        """
        try:
            logger.info("Fetching data ingestion configuration...")
            ingestion_cfg = self.config_manager.get_data_ingestion_config()

            logger.info("Initializing data ingestion process...")
            ingestion = DataIngestion(config=ingestion_cfg)

            source = ingestion_cfg.local_data_file
            destination = ingestion_cfg.root_dir
            logger.info(f"Copying training data from {source} to {destination}...")
            ingestion.transfer_data()

        except Exception:
            logger.exception("An error occurred during the data ingestion process.")
            raise

    def run_pipeline(self):
        """
        Run the data ingestion stage, logging banner lines before and after it.

        Errors from run_data_ingestion propagate unchanged; they are not logged
        again here because run_data_ingestion has already logged them.
        """
        stage = DataIngestionPipeline.STAGE_NAME
        logger.info(f">>>>>> Stage: {stage} started <<<<<<")
        self.run_data_ingestion()
        logger.info(f">>>>>> Stage {stage} completed <<<<<< \n\nx==========x")

# Entry point: build the pipeline (which loads the configuration files) and run
# the data ingestion stage. The log output below this cell shows the guard is
# satisfied when the cell is executed in the notebook.
if __name__ == '__main__':
    pipeline = DataIngestionPipeline()
    pipeline.run_pipeline()

[2023-10-22 18:04:12,700: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-10-22 18:04:12,702: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-10-22 18:04:12,703: 42: semantic_preprocessor_model_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-10-22 18:04:12,703: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts]
[2023-10-22 18:04:12,704: 35: semantic_preprocessor_model_logger: INFO: 3507771301:  >>>>>> Stage: Data Ingestion Stage started <<<<<<]
[2023-10-22 18:04:12,704: 17: semantic_preprocessor_model_logger: INFO: 3507771301:  Fetching data ingestion configuration...]
[2023-10-22 18:04:12,705: 65: semantic_preprocessor_model_logger: INFO: common:  Created directory at: artifacts/data_ingestion]
[2023-10-22 18:04:12,705: 20: semantic_preprocessor_model_logger: INFO: 3507771301:  Initializing data ingestion