In [1]:
import os


In [2]:
%pwd

'c:\\end-to-end-wine-quality-project\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\end-to-end-wine-quality-project'

In [5]:
import pandas as pd


In [6]:
data = pd.read_csv(r"artifacts/data_ingestion/winequality-red.csv")
data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [8]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict

In [9]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories



In [10]:
import os
import pandas as pd
from mlProject import logger
#from mlProject.entity.config_entity import DataValidationConfig

class DataValidation:
    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_columns(self) -> bool:
        try:
            validation_status = True
            data = pd.read_csv(self.config.unzip_data_dir / "winequality-red.csv")
            all_schema = self.config.all_schema

            # Check if all required columns are present
            for column in all_schema.keys():
                if column not in data.columns:
                    validation_status = False
                    logger.error(f"Column {column} not found in the dataset")
                    
            # Check data types
            for column, dtype in all_schema.items():
                if column in data.columns:
                    column_type = data[column].dtype
                    schema_type = dtype.get('type', '')
                    
                    # Convert pandas dtypes to schema types for comparison
                    if 'float' in str(column_type) and 'float' in schema_type:
                        continue
                    elif 'int' in str(column_type) and 'int' in schema_type:
                        continue
                    else:
                        validation_status = False
                        logger.error(f"Column {column} has type {column_type}, expected {schema_type}")
            
            # Write validation status to file
            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")
                
            logger.info(f"Data validation completed with status: {validation_status}")
            return validation_status
            
        except Exception as e:
            logger.error(f"Error during data validation: {e}")
            raise e
        
    def initiate_data_validation(self) -> bool:
        logger.info("Starting data validation")
        try:
            # Create the validation directory if it doesn't exist
            os.makedirs(os.path.dirname(self.config.STATUS_FILE), exist_ok=True)
            
            # Validate columns against schema
            validation_status = self.validate_all_columns()
            
            logger.info("Data validation completed")
            return validation_status
        except Exception as e:
            logger.error(f"Data validation failed: {e}")
            raise e

In [12]:
from pathlib import Path
from mlProject.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from mlProject.utils.common import read_yaml, create_directories


class ConfigurationManager:
    """
    Handles loading configuration, parameters, and schema from YAML files
    and provides component-specific configurations.
    """
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        
        # Load configuration files
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        
        # Create root artifacts directory
        create_directories([self.config.artifacts_root])
        
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """
        Returns DataIngestionConfig object with parameters from config.yaml
        """
        config = self.config.data_ingestion
        
        # Create data ingestion directory
        create_directories([config.root_dir])
        
        # Create and return the data ingestion configuration
        data_ingestion_config = DataIngestionConfig(
            root_dir=config.root_dir,
            source_URL=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir
        )
        
        return data_ingestion_config
    
    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Returns DataValidationConfig object with parameters from config.yaml and schema.yaml
        Uses data from the already completed data ingestion stage for validation.
        """
        # Get validation configuration from config file
        validation_config = self.config.data_validation
        
        # Create validation directory
        create_directories([validation_config.root_dir])
        
        # Extract column schema information for validation
        schema = self.schema.columns if hasattr(self.schema, 'columns') else self.schema
        
        # Create the validation configuration
        data_validation_config = DataValidationConfig(
            root_dir=validation_config.root_dir,
            STATUS_FILE=validation_config.STATUS_FILE,
            unzip_data_dir=validation_config.unzip_data_dir,
            all_schema=schema
        )
        
        return data_validation_config

In [14]:
from mlProject import logger

STAGE_NAME = "Data Validation stage"

class DataValidationTrainingPipeline:
    def __init__(self):
        pass
    
    def main(self):
        try:
            logger.info(f">>>>>> Stage {STAGE_NAME} started <<<<<<")
            config = ConfigurationManager()
            data_validation_config = config.get_data_validation_config()
            data_validation = DataValidation(data_validation_config)
            validation_status = data_validation.initiate_data_validation()
            logger.info(f">>>>>> Stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
            return validation_status
        except Exception as e:
            logger.exception(e)
            raise e

if __name__ == "__main__":
    try:
        pipeline = DataValidationTrainingPipeline()
        pipeline.main()
    except Exception as e:
        logger.exception(e)
        raise e

[2025-05-12 18:52:39,234: INFO: 52000810: >>>>>> Stage Data Validation stage started <<<<<<]
[2025-05-12 18:52:39,241: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-12 18:52:39,248: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-12 18:52:39,256: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-12 18:52:39,258: INFO: common: created directory at: artifacts]
[2025-05-12 18:52:39,258: ERROR: 52000810: 'ConfigurationManager' object has no attribute 'get_data_validation_config']
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Temp\ipykernel_15520\52000810.py", line 13, in main
    data_validation_config = config.get_data_validation_config()
AttributeError: 'ConfigurationManager' object has no attribute 'get_data_validation_config'. Did you mean: 'get_data_ingestion_config'?
[2025-05-12 18:52:39,258: ERROR: 52000810: 'ConfigurationManager' object has no attribute 'get_data_validation_config']
Traceback (

AttributeError: 'ConfigurationManager' object has no attribute 'get_data_validation_config'

In [15]:
from mlProject import logger
from mlProject.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline


STAGE_NAME = "Data Ingestion stage"

try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_ingestion = DataIngestionTrainingPipeline()
    data_ingestion.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    raise e

STAGE_NAME = "Data Validation stage"

try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    data_validation = DataValidationTrainingPipeline()
    data_validation.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    raise e

logger.info("welcome to ML project")

[2025-05-12 18:52:45,745: INFO: 2021993117: >>>>>> stage Data Ingestion stage started <<<<<<]
[2025-05-12 18:52:45,745: INFO: stage_01_data_ingestion: >>>>>> Stage Data Ingestion stage started <<<<<<]
[2025-05-12 18:52:45,749: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-12 18:52:45,762: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-12 18:52:45,776: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-12 18:52:45,780: INFO: common: created directory at: artifacts]
[2025-05-12 18:52:45,784: INFO: common: created directory at: artifacts/data_ingestion]
[2025-05-12 18:52:45,786: INFO: data_ingestion: Starting data ingestion process]
[2025-05-12 18:52:45,788: INFO: data_ingestion: Files already exist in artifacts/data_ingestion, skipping download and extraction]
[2025-05-12 18:52:45,789: INFO: stage_01_data_ingestion: >>>>>> Stage Data Ingestion stage completed <<<<<<

[2025-05-12 18:52:45,792: INFO: 2021993117: >>>>>> stage D

AttributeError: 'ConfigurationManager' object has no attribute 'get_data_validation_config'