In [None]:
import os

In [None]:
%pwd

In [None]:
os.chdir("../")

In [None]:
%pwd

In [None]:
import pandas as pd

In [None]:
# Location of the raw CSV, relative to the working directory selected by the
# earlier os.chdir("../") cell.
DATA_PATH = 'data/Sandblasting-Condition.csv'
# NOTE(review): the data.info() output recorded in this notebook reports a
# single column named "<<<<<<< HEAD" and a final index entry starting with
# ">>>>>>>", i.e. the CSV appears to contain unresolved git merge-conflict
# markers. Confirm and clean the data file before relying on this frame.
data = pd.read_csv(DATA_PATH)

In [None]:
data.head()

In [None]:
data.columns

In [None]:
data.info()

In [1]:
from pathlib import Path
from dataclasses import dataclass
from Dental_Implant_Sandblasting import logger
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation step."""

    # Directory created for this step by get_data_validation_config().
    root_dir: Path
    # Path of the text file that validate_all_columns() overwrites with the result.
    STATUS_FILE: str
    # Path handed to pd.read_csv() when the dataset is loaded for validation.
    unzip_data_dir: Path
    # Merged COLUMNS + TARGET_COLUMNS schema mapping; its keys are the allowed column names.
    all_schema: dict

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Files are read in the same order as before: config, params, schema.
        self.config, self.params, self.schema = (
            read_yaml(yaml_path)
            for yaml_path in (config_filepath, params_filepath, schema_filepath)
        )
        create_directories([self.config.artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation step.

        The schema's COLUMNS and TARGET_COLUMNS sections are merged into one
        mapping, and the step's root directory is created before returning.

        Returns:
            A DataValidationConfig populated from the ``data_validation``
            section of the configuration file.
        """
        section = self.config.data_validation
        merged_schema = {**self.schema.COLUMNS, **self.schema.TARGET_COLUMNS}
        create_directories([section.root_dir])
        return DataValidationConfig(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            unzip_data_dir=section.unzip_data_dir,
            all_schema=merged_schema,
        )

class DataValidation:
    """Check that the dataset's columns agree with the configured schema."""

    def __init__(self, config: DataValidationConfig):
        """Store the validation settings.

        Args:
            config: Dataset path, status-file path and schema mapping.
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the CSV's column names with the schema and record the result.

        Validation passes only when the two sets of names match exactly:
        every column in the CSV must be declared in the schema, and every
        schema column must be present in the CSV. All mismatches are logged
        (not just the first), and the outcome is written to the status file.

        Returns:
            True when the CSV columns and the schema keys match, else False.

        Raises:
            Exception: Any error raised while reading the CSV or writing the
                status file is logged and re-raised unchanged.
        """
        try:
            data = pd.read_csv(self.config.unzip_data_dir)
            data_columns = list(data.columns)
            schema_columns = list(self.config.all_schema.keys())

            # Sets give O(1) membership tests; the lists keep a stable log order.
            known = set(schema_columns)
            present = set(data_columns)

            unexpected = [col for col in data_columns if col not in known]
            missing = [col for col in schema_columns if col not in present]

            for col in data_columns:
                if col in known:
                    logger.info(f"Column {col} is valid")
                else:
                    logger.error(f"Column {col} not found in schema")

            # A dataset that lacks declared columns must not pass validation.
            for col in missing:
                logger.error(f"Schema column {col} not found in data")

            validation_status = not unexpected and not missing

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status

        except Exception as e:
            logger.exception(e)
            # Bare raise keeps the original traceback intact.
            raise

# Pipeline execution: load the configuration, build the data-validation
# settings, run the column check and log its outcome.
try:
    # Reads the YAML files and creates the artifacts root directory.
    config = ConfigurationManager()
    # Also creates the data-validation step's root directory.
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    # Writes the result to the configured status file as a side effect.
    validation_status = data_validation.validate_all_columns()
    logger.info(f"Validation status: {validation_status}")
except Exception as e:
    # Log the failure with its traceback, then re-raise so the cell still fails.
    logger.exception(e)
    raise e


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 140 entries, ('Angle of Sandblasting', 'Pressure of Sandblasting (bar)', 'Temperture of Acid Etching', 'Time of Acid Etching (min)', 'Voltage of Anodizing (v)', 'Time of  Anodizing (min)', '(Sa) Average of Surface roughness (micrometer)', 'Cell Viability (%)') to ('>>>>>>> f9a302bdff19857456477a25f0b49431f1a09e9d', nan, nan, nan, nan, nan, nan, nan)
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   <<<<<<< HEAD  138 non-null    object
dtypes: object(1)
memory usage: 6.8+ KB
[2024-07-05 16:49:29,837: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-05 16:49:29,884: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-05 16:49:29,892: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-05 16:49:29,894: INFO: common: created directory at: artifacts]
[2024-07-05 16:49:29,897: INFO: common: created directory at: artifa