#### Add a new pipeline component: Data Validation

#### Update the config

Update config/config.yaml

data_validation:
    root_dir: artifacts/data_validation
    unzip_data_dir: artifacts/data_ingestion/winequality-red.csv
    STATUS_FILE: artifacts/data_validation/status.txt

#### Update the schema

Update schema.yaml

COLUMNS:
    fixed acidity: float64
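    # ... list every remaining CSV column here with its dtype;
    # a column missing from COLUMNS makes the validation fail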
    quality: int64

TARGET_COLUMN:
    name: quality

#### Update the entity
Update the entity in mlProject/entity/config_entity.py

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the Data Validation stage.

    The first three fields mirror the keys of the ``data_validation``
    section in config/config.yaml; ``all_schema`` carries the schema
    mapping whose keys are compared against the CSV column names.
    """

    # Output directory of this stage (artifacts/data_validation in the config).
    root_dir: Path
    # Path of the ingested CSV file that gets validated; it is passed
    # straight to pandas.read_csv when the columns are inspected.
    unzip_data_dir: Path
    # Location of the status file (artifacts/data_validation/status.txt in
    # the config). NOTE(review): presumably the validation result is written
    # here -- confirm against the DataValidation component.
    STATUS_FILE: str
    # Schema mapping loaded from schema.yaml; only its keys are used for the
    # column-name check.
    all_schema: dict

#### Update the ConfigurationManager in src/mlProject/config/configuration.py

#### Update the Pipeline

try:
    # Build the stage configuration, hand it to the DataValidation
    # component and run the column check.
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    data_validation.validate_all_columns()
except Exception as e:
    # Record the failure with its full traceback, then let it propagate.
    # A bare `raise` re-raises the active exception unchanged, which is the
    # idiomatic form and avoids adding an extra re-raise frame.
    logger.exception(e)
    raise

#### Test Case for Data Validation Stage

In [1]:
import os
from pathlib import Path
import pandas as pd

from mlProject.config.configuration import ConfigurationManager
from mlProject.utils.common import read_yaml, create_directories
from mlProject.entity.config_entity import DataValidationConfig

# Change the working directory to the parent of the notebook's directory.
# The move is guarded so the cell is safe to re-run: HOME only exists once
# this cell has already executed in the current kernel, and a kernel restart
# resets both the namespace and the working directory. Without the guard,
# every re-run would climb one more directory level.
if "HOME" not in globals():
    os.chdir("../")
HOME = os.getcwd()
print(f"Current working directory: {HOME}")

# Get Data Validation config
# (ConfigurationManager loads the YAML files relative to the working
# directory, so this must run after the chdir above.)
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()

Current working directory: D:\Crude_Oil_Price_Prediction____Color Sketch\Crude_Oil_Price_Prediction\crude_oil_price_prediction
[2024-04-09 22:12:41,725: INFO: common: yaml file: config\config.yaml loaded successfully.]
[2024-04-09 22:12:41,725: INFO: common: yaml file: params.yaml loaded successfully.]
[2024-04-09 22:12:41,733: INFO: common: yaml file: schema.yaml loaded successfully.]
[2024-04-09 22:12:41,733: INFO: common: created directory at: artifacts]
[2024-04-09 22:12:41,733: INFO: common: created directory at: artifacts/data_validation]


In [2]:
# Column names declared in the loaded schema. The keys view itself is kept
# in `all_schema` (it is used for membership tests), and a list copy is
# displayed as the cell output.
all_schema = data_validation_config.all_schema.keys()
[*all_schema]

['quality']

In [3]:
# Load the ingested CSV and collect its header names as a plain list,
# which is shown as the cell output.
data = pd.read_csv(data_validation_config.unzip_data_dir)
all_cols = data.columns.tolist()
all_cols

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [5]:
# Check that every CSV column is declared in the schema.
# The status stays None when there are no columns at all, becomes False at
# the first undeclared column (the scan stops there), and is True otherwise.
validation_status = None
for column_name in all_cols:
    print(column_name)
    validation_status = column_name in all_schema
    if not validation_status:
        break

print('\n',validation_status)
    

fixed acidity

 False


In [29]:
# Spot check: is the 'alcohol' column declared in the schema?
print('Yes' if 'alcohol' in all_schema else 'No')

No


In [10]:
# Run the project's main.py through the shell; the captured log shows it
# executing the Data Ingestion stage followed by the Data Validation stage.
# NOTE(review): main.py is resolved relative to the current working
# directory, so this presumably relies on the earlier os.chdir -- confirm.
!python main.py

[2024-04-09 22:19:00,295: INFO: main: >>>>>>>> stage Data Ingestion stage started <<<<<<<<]
[2024-04-09 22:19:00,295: INFO: common: yaml file: config\config.yaml loaded successfully.]
[2024-04-09 22:19:00,295: INFO: common: yaml file: params.yaml loaded successfully.]
[2024-04-09 22:19:00,297: INFO: common: yaml file: schema.yaml loaded successfully.]
[2024-04-09 22:19:00,297: INFO: common: created directory at: artifacts]
[2024-04-09 22:19:00,297: INFO: common: created directory at: artifacts/data_ingestion]
[2024-04-09 22:19:00,297: INFO: data_ingestion: File already exists of size: ~26 KB]
[2024-04-09 22:19:00,297: INFO: main: >>>>>>>> stage Data Ingestion stage completed <<<<<<<<

[2024-04-09 22:19:00,297: INFO: main: >>>>>>>> stage Data Validation stage started <<<<<<<<]
[2024-04-09 22:19:00,297: INFO: common: yaml file: config\config.yaml loaded successfully.]
[2024-04-09 22:19:00,297: INFO: common: yaml file: params.yaml loaded successfully.]
[2024-04-09 22:19:00,297: INFO: comm