In [1]:
import os

In [2]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
# Step up from research/ to the project root so the relative paths used below
# (e.g. data/...) resolve. The guard keeps this cell idempotent: re-running it
# after the move no longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [5]:
import pandas as pd

In [6]:
# Load the raw sandblasting dataset. The path is relative, so it depends on the
# working directory having been moved to the project root by the earlier cell.
DATA_PATH = 'data/Sandblasting-Condition.csv'
data = pd.read_csv(DATA_PATH)

In [7]:
data.head()

Unnamed: 0,Angle of Sandblasting,Pressure of Sandblasting (bar),Temperture of Acid Etching,Time of Acid Etching (min),Voltage of Anodizing (v),Time of Anodizing (min),(Sa) Average of Surface roughness (micrometer),Cell Viability (%),"Result (1=Passed, 0=Failed)"
0,30,3,25,3,80,1,0.746,0,0
1,40,3,25,3,80,1,0.813,0,0
2,50,3,25,3,80,1,0.952,0,0
3,30,4,25,3,80,1,1.207,0,0
4,40,4,25,3,80,1,1.298,0,0


In [8]:
data.columns


Index(['Angle of Sandblasting', 'Pressure of Sandblasting (bar)',
       'Temperture of Acid Etching', 'Time of Acid Etching (min)',
       'Voltage of Anodizing (v)', 'Time of  Anodizing (min)',
       '(Sa) Average of Surface roughness (micrometer)', 'Cell Viability (%)',
       'Result (1=Passed, 0=Failed)'],
      dtype='object')

In [9]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Angle of Sandblasting                           68 non-null     int64  
 1   Pressure of Sandblasting (bar)                  68 non-null     int64  
 2   Temperture of Acid Etching                      68 non-null     int64  
 3   Time of Acid Etching (min)                      68 non-null     int64  
 4   Voltage of Anodizing (v)                        68 non-null     int64  
 5   Time of  Anodizing (min)                        68 non-null     int64  
 6   (Sa) Average of Surface roughness (micrometer)  68 non-null     float64
 7   Cell Viability (%)                              68 non-null     int64  
 8   Result (1=Passed, 0=Failed)                     68 non-null     int64  
dtypes: float64(1), int64(8)
memory usage: 4.9 KB


### Define the data class
   - Update the entity



In [10]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation stage.

    Attributes:
        root_dir: Working directory of the stage; ``ConfigurationManager``
            creates it before building this object.
        STATUS_FILE: Path of the text file that ``DataValidation`` overwrites
            with the validation result.
        unzip_data_dir: Path that ``DataValidation`` passes to ``pd.read_csv``
            (so it is used as a CSV file path, despite the ``_dir`` suffix).
        all_schema: Mapping whose keys are the expected column names.
    """
    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict  

### Configuration Manager in src

In [11]:
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories
from Dental_Implant_Sandblasting.entity.config_entity import DataValidationConfig
from Dental_Implant_Sandblasting import logger

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects.

    Constructing an instance loads the config, params and schema YAML files
    and creates the top-level artifacts directory.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load the three YAML files and create ``artifacts_root``.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the data-validation settings and create the stage directory.

        Returns:
            A ``DataValidationConfig`` filled from the ``data_validation``
            section of the config file and the ``features`` section of the
            schema file.
        """
        stage_cfg = self.config.data_validation
        # NOTE(review): only the ``features`` section of the schema is used.
        # The recorded run logged the three non-feature columns as
        # "Extra columns" -- presumably they are declared in another schema
        # section; confirm whether that section should be merged in here.
        expected_columns = self.schema.features

        create_directories([stage_cfg.root_dir])

        return DataValidationConfig(
            root_dir=stage_cfg.root_dir,
            STATUS_FILE=stage_cfg.STATUS_FILE,
            unzip_data_dir=stage_cfg.unzip_data_dir,
            all_schema=expected_columns,
        )

### Update the components

In [12]:
import os
from Dental_Implant_Sandblasting import logger

In [13]:
class DataValidation:
    """Checks that a CSV file's columns match the expected schema exactly."""

    def __init__(self, config: DataValidationConfig):
        """Keep the stage configuration for later use.

        Args:
            config: Supplies ``unzip_data_dir`` (CSV to read), ``all_schema``
                (mapping keyed by expected column names) and ``STATUS_FILE``
                (where the result is written).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the CSV's columns with the schema keys and record the result.

        Columns present in the schema but not in the CSV are reported as
        missing; columns present in the CSV but not in the schema are reported
        as extra. Each non-empty list is logged as an error, and the status
        file is overwritten with the overall status followed by those lists.

        Returns:
            True when there are neither missing nor extra columns, else False.

        Raises:
            Exception: Any error from reading the CSV or writing the status
                file propagates to the caller unchanged.
        """
        frame = pd.read_csv(self.config.unzip_data_dir)
        actual = list(frame.columns)
        expected = list(self.config.all_schema.keys())

        missing_cols = [name for name in expected if name not in actual]
        extra_cols = [name for name in actual if name not in expected]
        validation_status = not (missing_cols or extra_cols)

        # One message per kind of mismatch, reused for the log and the file.
        report = []
        if missing_cols:
            report.append(f"Missing columns: {missing_cols}")
        if extra_cols:
            report.append(f"Extra columns: {extra_cols}")

        for message in report:
            logger.error(message)

        with open(self.config.STATUS_FILE, 'w') as status_file:
            status_file.write(f"Validation status: {validation_status}\n")
            for message in report:
                status_file.write(f"{message}\n")

        return validation_status


### Update the pipeline


[2024-06-30 23:56:50,397: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-30 23:56:50,436: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-30 23:56:50,453: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-30 23:56:50,457: INFO: common: created directory at: artifacts]
[2024-06-30 23:56:50,480: INFO: common: created directory at: artifacts/data_validation]
[2024-06-30 23:56:50,491: ERROR: 3284769109: Extra columns: ['(Sa) Average of Surface roughness (micrometer)', 'Cell Viability (%)', 'Result (1=Passed, 0=Failed)']]


: 