In [1]:
import os

In [2]:
# Step one directory up so the relative paths used later in this notebook resolve
# (presumably to the project root — see the %pwd output).
# NOTE: this is a relative move, so re-running the cell climbs one more level each time.
os.chdir("../")
# Display the working directory that is now in effect.
%pwd

'/Users/macbookpro/Documents/predict_publications/publications_prediction'

# 1. Setup Config.yaml

In [None]:
# Configuration related to data validation (the `data_validation` section of the config file)
data_validation:
  # Directory where data validation results and artifacts are stored
  root_dir: artifacts/data_validation
  
  # Path to the ingested data file that will be used for validation
  data_source_file: artifacts/data_ingestion/train_data.csv
  
  # Path to the file that captures the validation status (e.g., success, errors encountered)
  # NOTE(review): this path is under artifacts/initial_data_validation, not under root_dir above — confirm the split is intended.
  status_file: artifacts/initial_data_validation/status.txt


# 2. Update schema.yaml

In [None]:
# Schema of the dataset as it looks before feature engineering.
# Each entry under `columns` is keyed by the column name and carries:
#   type        - the expected dtype, written as a dtype string (int64, float64, object)
#   description - a human-readable explanation of the column
schema_type: "initial"
description: "Schema of the initial data before feature engineering."

columns:
  # Time and location of the entry
  timestamp: 
    type: int64
    description: "Timestamp of the data entry."
  lon: 
    type: float64
    description: "Longitude value."
  lat: 
    type: float64
    description: "Latitude value."
  # Engagement counters
  likescount: 
    type: int64
    description: "Count of likes."
  commentscount: 
    type: int64
    description: "Count of comments."
  # Text-derived counters
  symbols_cnt: 
    type: int64
    description: "Count of symbols."
  words_cnt: 
    type: int64
    description: "Count of words."
  hashtags_cnt: 
    type: int64
    description: "Count of hashtags."
  mentions_cnt: 
    type: int64
    description: "Count of mentions."
  links_cnt: 
    type: int64
    description: "Count of links."
  emoji_cnt: 
    type: int64
    description: "Count of emojis."
  # Stored with dtype `object`, unlike the numeric columns above
  point: 
    type: object
    description: "Geographical point object."

# 3. Setup Params.yaml (Not Required at this Stage)

# 4. Setup Entity

In [3]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict

@dataclass(frozen=True)
class DataValidationConfig:
    """
    Immutable bundle of settings consumed by the data validation step.

    Instances are frozen, so fields cannot be reassigned after construction.

    Attributes:
    - root_dir: Directory where data validation results and artifacts are stored.
    - data_source_file: Path to the data file that will be validated.
    - status_file: Path to the file that captures the validation status (e.g., success, errors encountered).
    - initial_schema: Mapping from column name to that column's schema entry; each entry is
                  itself a string-to-string mapping (for example its `type` and `description`).
    """
    
    root_dir: Path  # Directory for storing validation results and related artifacts
    data_source_file: Path  # Path to the data file that is read for validation
    status_file: Path  # File that receives the validation status messages
    initial_schema: Dict[str, Dict[str, str]]  # Column name -> schema entry for that column


# 5. Setup Configuration Manager

In [7]:
from predicting_publications.constants import *
from predicting_publications.utils.common import read_yaml, create_directories
from predicting_publications import logger
from predicting_publications.entity.config_entity import DataIngestionConfig

class ConfigurationManager:
    """
    Loads the pipeline's YAML settings and builds typed config objects from them.

    On construction the configuration, parameter and both schema files are read
    through `read_yaml`, and the artifacts root directory named in the
    configuration is created. The `get_*_config` methods then turn individual
    configuration sections into their dataclass counterparts.

    Attributes:
    - config: Parsed configuration file (read by attribute, e.g. `config.data_validation`).
    - params: Parsed parameters file.
    - schema: Parsed schema file for the initial data.
    - feature_schema: Parsed schema file for the feature-engineered data.
    - feature_schema_filepath: Same object as `feature_schema`. Kept only for
      backward compatibility: despite the name it holds parsed content, not a path.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH,
                 feature_schema_filepath = FEATURE_SCHEMA_FILE_PATH) -> None:
        """
        Read every settings file and make sure the artifacts root exists.

        Args:
        - config_filepath (Path): Path to the configuration file.
        - params_filepath (Path): Path to the parameters file.
        - schema_filepath (Path): Path to the initial-data schema file.
        - feature_schema_filepath (Path): Path to the feature-engineered schema file.

        Creates:
        - The directory named by `artifacts_root` in the configuration.

        Raises:
        - Exception: Whatever `read_yaml` raises if one of the files cannot be read.
        """
        self.config = self._read_config_file(config_filepath, "config")
        self.params = self._read_config_file(params_filepath, "params")
        self.schema = self._read_config_file(schema_filepath, "initial_schema")
        self.feature_schema = self._read_config_file(feature_schema_filepath, "feature_engineered_schema")
        # Legacy alias: earlier code stored the parsed schema under this name.
        self.feature_schema_filepath = self.feature_schema

        # Create the directory for storing artifacts if it doesn't exist
        create_directories([self.config.artifacts_root])

    def _read_config_file(self, filepath: str, config_name: str) -> dict:
        """
        Read one YAML settings file via `read_yaml`.

        Args:
        - filepath (str): Path to the file to read.
        - config_name (str): Label used only in the error log message.

        Returns:
        - The object produced by `read_yaml` for that file.

        Raises:
        - Exception: Any error from `read_yaml` is logged and re-raised unchanged.
        """
        try:
            return read_yaml(filepath)
        except Exception as e:
            logger.error(f"Error reading {config_name} file: {filepath}. Error: {e}")
            raise

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Build the DataIngestionConfig from the `data_ingestion` configuration section.

        The ingestion root directory is created as a side effect.

        Returns:
        - DataIngestionConfig: Object containing data ingestion configuration settings.

        Raises:
        - AttributeError: If the `data_ingestion` section, or one of the attributes
          read from it here, cannot be looked up on the parsed configuration.
        """
        try:
            config = self.config.data_ingestion
            # Create the root directory for data ingestion if it doesn't already exist
            create_directories([config.root_dir])

            return DataIngestionConfig(
                root_dir=Path(config.root_dir),
                local_data_file=Path(config.local_data_file),
            )

        except AttributeError as e:
            # The failing lookup may be the section itself or any key inside it,
            # so report the actual error rather than guessing which one.
            logger.error(
                f"The 'data_ingestion' section or one of its attributes is missing from the config file: {e}"
            )
            raise

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the DataValidationConfig from the `data_validation` configuration
        section and the `columns` mapping of the initial schema.

        The parent directory of the status file is created as a side effect
        (skipped when the status file path has no directory component).

        Returns:
        - DataValidationConfig: Object containing data validation configuration settings.

        Raises:
        - AttributeError: If the `data_validation` section, one of the attributes
          read from it here, or `columns` in the schema cannot be looked up.
        """
        try:
            # Extract data validation configurations
            config = self.config.data_validation

            # Extract schema for data validation
            schema = self.schema.columns

            # Ensure the parent directory for the status file exists. A bare
            # filename has an empty dirname, which is not a creatable directory.
            status_dir = os.path.dirname(config.status_file)
            if status_dir:
                create_directories([status_dir])

            # Construct and return the DataValidationConfig object
            return DataValidationConfig(
                root_dir=Path(config.root_dir),
                data_source_file=Path(config.data_source_file),
                status_file=Path(config.status_file),
                initial_schema=schema
            )

        except AttributeError as e:
            # The lookup that failed may be in the config section or in the schema;
            # log the real error and let the caller handle it.
            logger.error(
                f"A required attribute is missing from the 'data_validation' config section or the schema file: {e}"
            )
            raise



# 6. Create Components

In [15]:
import pandas as pd
from predicting_publications import logger


class DataValidation:
    """
    Checks a CSV dataset against the column schema held in the configuration.

    Two checks are offered: column presence (`validate_all_features`) and
    column dtypes (`validate_data_types`). Each check logs its findings and
    records a one-line summary in the configured status file.
    """

    # The annotation is a string so the class can be defined without the
    # config type being importable at definition time.
    def __init__(self, config: "DataValidationConfig"):
        """
        Store the configuration and load the data source file into memory.

        Args:
        - config (DataValidationConfig): Configuration settings for data validation.

        Attributes:
        - config: The configuration passed in.
        - df (pd.DataFrame): The data to be validated, read with `pd.read_csv`.

        Raises:
        - FileNotFoundError: If the data source file does not exist (logged, then re-raised).
        """
        self.config = config
        try:
            self.df = pd.read_csv(self.config.data_source_file)
        except FileNotFoundError:
            logger.error(f"File not found: {self.config.data_source_file}")
            raise

    def validate_all_features(self) -> bool:
        """
        Check that the dataframe has exactly the columns named in the schema.

        Both missing and unexpected columns count as failures. The result
        line replaces any previous content of the status file.

        Returns:
        - bool: True if the column sets match, False otherwise.
        """
        validation_status = True
        status_message = "Validation status: "

        # Determine missing or extra columns
        all_columns = set(self.df.columns)
        expected_columns = set(self.config.initial_schema.keys())

        # Sets have no stable iteration order; sort so that log lines and the
        # status file read the same on every run.
        missing_columns = sorted(expected_columns - all_columns, key=str)
        extra_columns = sorted(all_columns - expected_columns, key=str)

        # Log and update the status message for any discrepancies
        if missing_columns:
            validation_status = False
            missing_text = ', '.join(map(str, missing_columns))
            logger.warning(f"Missing columns: {missing_text}")
            status_message += f"Missing columns: {missing_text}. "
        if extra_columns:
            validation_status = False
            extra_text = ', '.join(map(str, extra_columns))
            logger.warning(f"Extra columns found: {extra_text}")
            status_message += f"Extra columns found: {extra_text}. "
        if validation_status:
            logger.info("All expected columns are present in the dataframe.")
            status_message += "All expected columns are present."

        # Start the status file afresh with this result (overwrite, not append)
        self._write_status_to_file(status_message, overwrite=True)
        return validation_status

    def validate_data_types(self) -> bool:
        """
        Compare each schema column's expected dtype with the dataframe's dtype.

        A schema column that is absent from the dataframe is also a failure.
        The result line is appended to the status file.

        Returns:
        - bool: True if all data types match, False otherwise.
        """
        validation_status = True
        status_message = "Data type validation status: "

        expected_data_types = {col: self.config.initial_schema[col]['type'] for col in self.config.initial_schema}

        for column, dtype in expected_data_types.items():
            # Check if the column exists in the dataframe
            if column in self.df.columns:
                if not pd.api.types.is_dtype_equal(self.df[column].dtype, dtype):
                    validation_status = False
                    logger.warning(f"Data type mismatch for column '{column}': Expected {dtype} but got {self.df[column].dtype}")
                    status_message += f"Data type mismatch for column '{column}': Expected {dtype} but got {self.df[column].dtype}. "
            else:
                validation_status = False
                logger.warning(f"Column '{column}' not found in dataframe.")
                status_message += f"Column '{column}' not found in dataframe. "

        if validation_status:
            logger.info("All data types are as expected.")
            status_message += "All data types are as expected."

        # Append the validation status to the file
        self._write_status_to_file(status_message)
        return validation_status

    def _write_status_to_file(self, message: str, overwrite: bool = False):
        """
        Write one message line to the status file specified in the config.

        Args:
        - message (str): The message to write; a newline is added after it.
        - overwrite (bool): If True, overwrites the file. If False, appends to the file.

        Raises:
        - Exception: Any error while opening or writing the file (logged, then re-raised).
        """
        mode = 'w' if overwrite else 'a'
        try:
            # Explicit encoding so column names outside ASCII do not depend on
            # the platform's default text encoding.
            with open(self.config.status_file, mode, encoding="utf-8") as f:
                f.write(message + "\n")
        except Exception as e:
            logger.error(f"Error writing to status file: {e}")
            raise

    def run_all_validations(self):
        """
        Run the column check, then the dtype check, and append an overall
        verdict line to the status file.
        """
        feature_validation_status = self.validate_all_features()
        data_type_validation_status = self.validate_data_types()

        overall_status = "Overall Validation Status: "
        if feature_validation_status and data_type_validation_status:
            overall_status += "All validations passed."
        else:
            overall_status += "Some validations failed. Check the log for details."

        self._write_status_to_file(overall_status)

    def _save_dataframe(self):
        """
        Save the dataframe as CSV (without the index).

        The target is `config.validated_data_file` when the configuration
        provides it; otherwise the file is written into `config.root_dir`
        under the same file name as the data source. Missing parent
        directories are created.

        Raises:
        - Exception: Any error while creating the directory or writing the file
          (logged, then re-raised).
        """
        from pathlib import Path

        # The config object may not define `validated_data_file`; fall back to
        # a path derived from fields it does define instead of failing.
        target = getattr(self.config, "validated_data_file", None)
        if target is None:
            target = Path(self.config.root_dir) / Path(self.config.data_source_file).name
        target = Path(target)

        try:
            target.parent.mkdir(parents=True, exist_ok=True)
            self.df.to_csv(target, index=False)
            logger.info(f"Data saved successfully to {target}")
        except Exception as e:
            logger.error(f"Error while saving the dataframe: {e}")
            raise


# 7. Setup Pipeline

In [17]:
from predicting_publications import logger

class InitialDataValidationPipeline:
    """
    Pipeline stage that validates the ingested data before later stages use it.

    It obtains the validation settings from a ConfigurationManager, builds a
    DataValidation component from them and runs all of its checks.

    Attributes:
        STAGE_NAME (str): The name of this pipeline stage, used in log messages.
        config_manager (ConfigurationManager): Source of the validation settings.
    """

    STAGE_NAME = "Initial Data Validation Pipeline"

    def __init__(self):
        """
        Initializes the pipeline with a configuration manager.
        """
        self.config_manager = ConfigurationManager()

    def run_data_validation(self):
        """
        Fetch the validation config, build the component and run every check.

        Raises:
            Exception: Any error raised while fetching the configuration,
                loading the data or running the checks. The error is logged
                and then re-raised so the caller does not report success
                for a run that actually failed.
        """
        try:
            logger.info("Fetching initial data validation configuration...")
            data_validation_config = self.config_manager.get_data_validation_config()

            logger.info("Initializing data validation process...")
            data_validation = DataValidation(config=data_validation_config)

            logger.info("Executing Data Validations...")
            data_validation.run_all_validations()

            logger.info("Initial Data Validation Pipeline completed successfully.")

        except Exception as e:
            logger.error(f"Error encountered during the data validation: {e}")
            # Propagate: swallowing here would let run_pipeline log the stage
            # as completed even though it did not finish.
            raise

    def run_pipeline(self):
        """
        Run the whole stage, framed by start and completion log lines.

        Raises:
            Exception: Any error from `run_data_validation` (logged, then re-raised).
        """
        try:
            logger.info(f">>>>>> Stage: {InitialDataValidationPipeline.STAGE_NAME} started <<<<<<")
            self.run_data_validation()
            logger.info(f">>>>>> Stage {InitialDataValidationPipeline.STAGE_NAME} completed <<<<<< \n\nx==========x")
        except Exception as e:
            logger.error(f"Error encountered during the {InitialDataValidationPipeline.STAGE_NAME}: {e}")
            raise

# Entry point: build the pipeline (which constructs a ConfigurationManager)
# and run the validation stage. The `pipeline` name stays bound at module level.
if __name__ == '__main__':
    pipeline = InitialDataValidationPipeline()
    pipeline.run_pipeline()


[2023-10-16 15:04:33,154: 42: predict_publications_logger: INFO: common:  yaml file: config/config.yaml loaded successfully]
[2023-10-16 15:04:33,157: 42: predict_publications_logger: INFO: common:  yaml file: params.yaml loaded successfully]
[2023-10-16 15:04:33,161: 42: predict_publications_logger: INFO: common:  yaml file: schema.yaml loaded successfully]
[2023-10-16 15:04:33,162: 42: predict_publications_logger: INFO: common:  yaml file: feature_engineered_schema.yaml loaded successfully]
[2023-10-16 15:04:33,163: 65: predict_publications_logger: INFO: common:  Created directory at: artifacts]
[2023-10-16 15:04:33,163: 53: predict_publications_logger: INFO: 3638325286:  >>>>>> Stage: Initial Data Validation Pipeline started <<<<<<]
[2023-10-16 15:04:33,164: 31: predict_publications_logger: INFO: 3638325286:  Fetching initial data validation configuration...]
[2023-10-16 15:04:33,164: 65: predict_publications_logger: INFO: common:  Created directory at: artifacts/initial_data_valida