In [1]:
# Move the working directory up one level so the relative paths used in later cells resolve.
# NOTE(review): not idempotent - re-running this cell climbs one more directory level.
%cd ..


c:\Users\ernes\Documents\ML Projects\ds-project-test\ds-project-test


In [2]:
import pandas as pd

# Read the Titanic CSV from the data_ingestion artifact folder and preview the first rows.
data = pd.read_csv(filepath_or_buffer='artifact/data_ingestion/raw_data/Titanic.csv')
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
from dataclasses import dataclass
from typing import Dict
from pathlib import Path

@dataclass
class DataValidationConfig:
    """
    Settings consumed by the data validation step.

    Field order matters: the dataclass-generated constructor accepts these
    positionally in the order declared below.
    """
    # Directory for validation artifacts; the validator creates it when it does not exist.
    validation_directory: Path
    # Location of the CSV file that is loaded with pandas and checked against the schema.
    data_path: Path
    # Path of the text file to which per-column validation results are appended.
    STATUS_FILE: str
    # Parsed schema mapping with 'COLUMNS' and 'TARGET' sections; each entry maps a
    # column name to a spec whose 'type' key holds the expected dtype name.
    schema_file: Dict

In [4]:
from src.ds_project.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH
from src.ds_project.utils.utils import read_yaml, create_directories

class ValidationConfigurationManager:
    """
    Load the project configuration and schema YAML files and expose the
    settings needed by the data validation step.
    """

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH,
                 ):
        """
        Read both YAML files once, up front.

        Args:
            config_filepath: Location of the project configuration YAML.
            schema_filepath: Location of the dataset schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)

    def get_data_validation_config(self) -> DataValidationConfig:
        """
        Build the validation settings from the 'data_validation' config section.

        Returns:
            DataValidationConfig populated from the 'directory',
            'local_csv_file' and 'status' entries plus the loaded schema.

        Raises:
            KeyError: If the section or one of its entries is missing
                (assuming the loaded config behaves like a mapping).
        """
        section = self.config['data_validation']

        # The dataclass declares these two fields as Path, while the YAML values
        # are plain strings; convert them so the object matches its annotations.
        return DataValidationConfig(
            validation_directory=Path(section['directory']),
            data_path=Path(section['local_csv_file']),
            STATUS_FILE=section['status'],
            schema_file=self.schema,
        )

2025-05-02 17:42:50 - INFO: __init__ - Logging setup complete.


In [5]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# (rows, columns) of the loaded frame.
data.shape

(891, 12)

# Data validation

In [30]:
import os
from src.ds_project import logger
from src.ds_project.utils.utils import read_yaml
import datetime

In [49]:
class DataValidation:
    """
    Check a CSV file against the column/dtype schema and append a
    human-readable report to the status file.
    """

    def __init__(self, config: "DataValidationConfig"):
        """
        Args:
            config: Validation settings (directory, data path, status file, schema).
        """
        # The annotation is quoted so defining this class does not require the
        # config class to be in scope at definition time.
        self.config = config

    @staticmethod
    def _dtype_matches(schema_dtype, data_dtype) -> bool:
        """Return True when the dataset dtype satisfies the schema's declared type."""
        if schema_dtype == 'string':
            # Text columns are reported as `object` by default and as a pandas
            # StringDtype when the dedicated string dtype is in use; accept both.
            return bool(data_dtype == 'object' or isinstance(data_dtype, pd.StringDtype))
        return bool(schema_dtype == data_dtype)

    def validate_data_schema(self) -> bool:
        """
        Validate the data against the schema.

        Every column listed under the schema's 'COLUMNS' and 'TARGET' sections
        must be present in the CSV with the declared dtype. One line per column
        is appended to the status file under a timestamp.

        Returns:
            The error flag: True when at least one schema column is missing or
            has a mismatching dtype, False when every column matches.

        Raises:
            Any exception from reading the CSV, reading the schema entries or
            writing the status file propagates unchanged.
        """
        config = self.config

        if not os.path.exists(config.validation_directory):
            create_directories([config.validation_directory])
        if not os.path.exists(config.STATUS_FILE):
            # First run: start the report with a header.
            with open(config.STATUS_FILE, 'a') as f:
                f.write("Data Schema Validation Status\n")
                f.write("====================================\n")

        column_specs = config.schema_file['COLUMNS']
        target_specs = config.schema_file['TARGET']
        expected_columns = list(column_specs.keys()) + list(target_specs.keys())

        data = pd.read_csv(config.data_path)
        data_columns = data.columns.tolist()

        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        report = [f"\n{date}\n"]

        # Sticky flag: once any column fails it must stay True. Resetting it on
        # a later matching column would hide earlier failures.
        validation_error = False
        for column in expected_columns:
            if column not in data_columns:
                status = False
                validation_error = True
                report.append(f"Column {column} - Status {status}:  {column} is not in the dataset\n")
                continue

            spec = column_specs[column] if column in column_specs else target_specs[column]
            schema_dtype = spec['type']
            data_dtype = data[column].dtype

            status = self._dtype_matches(schema_dtype, data_dtype)
            if status:
                report.append(f"Column {column} - Status {status}: is matching the schema\n")
            else:
                validation_error = True
                report.append(f"Column {column} - Status {status}: {column} dtype in the schema ({schema_dtype}) is not matching to dtype in the dataset ({data_dtype})\n")

        # Single append instead of reopening the file for every line.
        with open(config.STATUS_FILE, 'a') as f:
            f.writelines(report)

        return validation_error
                    

In [50]:
# Build the validation settings from the YAML config and schema, then run the check.
manager=ValidationConfigurationManager()
data_validation_config = manager.get_data_validation_config()
validator = DataValidation(config=data_validation_config)

# validate_data_schema returns its `validation_error` flag, so the value shown
# below is an error indicator rather than a "passed" status.
validation = validator.validate_data_schema()
validation


2025-05-02 18:01:11 - INFO: utils - YAML file config\config.yaml loaded successfully.
2025-05-02 18:01:11 - INFO: utils - YAML file schema.yaml loaded successfully.


False

In [33]:
# Load the schema file directly to inspect its structure; show its top-level sections.
schema_test=read_yaml(Path('schema.yaml'))
schema_test.keys()

2025-05-02 17:52:35 - INFO: utils - YAML file schema.yaml loaded successfully.


dict_keys(['COLUMNS', 'TARGET'])

In [None]:
# Column names listed under the schema's COLUMNS section.
schema_test['COLUMNS'].keys()

dict_keys(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'])

In [None]:
# Full COLUMNS section: one spec per column name.
schema_test['COLUMNS']

ConfigBox({'PassengerId': {'type': 'int64', 'description': 'Unique identifier for each passenger.'}, 'Pclass': {'type': 'int64', 'description': 'Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd).'}, 'Name': {'type': 'string', 'description': 'Name of the passenger.'}, 'Sex': {'type': 'string', 'description': 'Gender of the passenger'}, 'Age': {'type': 'float64', 'description': 'Age of the passenger in years.'}, 'SibSp': {'type': 'int64', 'description': 'Number of siblings or spouses aboard the Titanic.'}, 'Parch': {'type': 'int64', 'description': 'Number of parents or children aboard the Titanic.'}, 'Ticket': {'type': 'string', 'description': 'Ticket number.'}, 'Fare': {'type': 'float64', 'description': 'Passenger fare.'}, 'Cabin': {'type': 'string', 'description': 'Cabin number.'}, 'Embarked': {'type': 'string', 'description': 'Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton).'}})

In [None]:
# Compare the column's dtype with the schema's declared type string. Comparing
# against the whole column spec (a mapping holding 'type' and 'description')
# can never be equal to a dtype.
# NOTE(review): this is presumably still False for text columns, because the
# schema declares 'string' while pandas typically reports `object` - confirm.
data['Name'].dtype == schema_test['COLUMNS']['Name']['type']

False