In [1]:
import os

In [2]:
# Show the kernel's current working directory before changing it.
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
# Move from the notebook's `research/` folder up to the project root so the
# relative paths used below (data/, config/, artifacts/) resolve.
# Guarded on the folder name so that re-running this cell does not climb one
# directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory is now the project root.
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [5]:
import pandas as pd 

In [6]:
# Relative path: it only resolves when the working directory is the project root.
DATA_PATH = 'data/Sandblasting-Condition.csv'
# Load the CSV into a DataFrame with pandas' default parsing options.
data = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first five rows to eyeball the column names and values.
data.head()

Unnamed: 0,Angle of Sandblasting,Pressure of Sandblasting (bar),Temperture of Acid Etching,Time of Acid Etching (min),Voltage of Anodizing (v),Time of Anodizing (min),(Sa) Average of Surface roughness (micrometer),Cell Viability (%),"Result (1=Passed, 0=Failed)"
0,30,3,25,3,80,1,0.746,0,0
1,40,3,25,3,80,1,0.813,0,0
2,50,3,25,3,80,1,0.952,0,0
3,30,4,25,3,80,1,1.207,0,0
4,40,4,25,3,80,1,1.298,0,0


In [8]:
# List the exact column labels. The validation below compares names by exact
# string match, so quirks in the header (e.g. the double space in
# 'Time of  Anodizing (min)' and the spelling 'Temperture') must be mirrored
# in the schema.
data.columns

Index(['Angle of Sandblasting', 'Pressure of Sandblasting (bar)',
       'Temperture of Acid Etching', 'Time of Acid Etching (min)',
       'Voltage of Anodizing (v)', 'Time of  Anodizing (min)',
       '(Sa) Average of Surface roughness (micrometer)', 'Cell Viability (%)',
       'Result (1=Passed, 0=Failed)'],
      dtype='object')

In [9]:
# Inspect dtypes and null counts.
# NOTE(review): every column is reported as `object`, and all but the first have
# 137 non-null values out of 139 rows — presumably the CSV contains a couple of
# non-numeric / partially empty rows; confirm and clean before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype 
---  ------                                          --------------  ----- 
 0   Angle of Sandblasting                           139 non-null    object
 1   Pressure of Sandblasting (bar)                  137 non-null    object
 2   Temperture of Acid Etching                      137 non-null    object
 3   Time of Acid Etching (min)                      137 non-null    object
 4   Voltage of Anodizing (v)                        137 non-null    object
 5   Time of  Anodizing (min)                        137 non-null    object
 6   (Sa) Average of Surface roughness (micrometer)  137 non-null    object
 7   Cell Viability (%)                              137 non-null    object
 8   Result (1=Passed, 0=Failed)                     137 non-null    object
dtypes: object(9)
memory usage: 9.9+ KB


### Data validation

In [10]:
from pathlib import Path
from dataclasses import dataclass
from Dental_Implant_Sandblasting import logger
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation step.

    Instances are built by ``ConfigurationManager.get_data_validation_config``.
    """
    # Directory that is created for this stage before the config is returned.
    root_dir: Path
    # File that receives the "Validation status: ..." line.
    STATUS_FILE: str
    # Passed straight to ``pd.read_csv``; despite the name it is used as the CSV file path.
    unzip_data_dir: Path
    # Mapping whose keys are the accepted column names
    # (schema COLUMNS merged with TARGET_COLUMNS).
    all_schema: dict

class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Assemble the settings for the data-validation stage.

        Merges the schema's COLUMNS and TARGET_COLUMNS mappings (target entries
        win on key clashes) and creates the stage's root directory.

        Returns:
            DataValidationConfig populated from the ``data_validation`` section.
        """
        validation_section = self.config.data_validation
        combined_schema = {**self.schema.COLUMNS, **self.schema.TARGET_COLUMNS}

        # The stage directory must exist before anything is written into it.
        create_directories([validation_section.root_dir])

        return DataValidationConfig(
            root_dir=validation_section.root_dir,
            STATUS_FILE=validation_section.STATUS_FILE,
            unzip_data_dir=validation_section.unzip_data_dir,
            all_schema=combined_schema,
        )

class DataValidation:
    """Checks that a dataset's column names agree with the configured schema."""

    def __init__(self, config: DataValidationConfig):
        """Store the validation settings.

        Args:
            config: Paths and schema mapping used by ``validate_all_columns``.
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the CSV's column names with the schema in both directions.

        Validation fails when the data contains a column the schema does not
        list, or when a column listed in the schema is absent from the data.
        Every mismatch is logged (the check does not stop at the first one),
        and the outcome is written to the status file.

        Returns:
            True when the data columns and the schema keys match, else False.

        Raises:
            Exception: Any error from reading the CSV or writing the status
                file is logged and re-raised unchanged.
        """
        try:
            data = pd.read_csv(self.config.unzip_data_dir)
            data_columns = list(data.columns)

            # Schema keys already combine the feature and target columns.
            schema_columns = list(self.config.all_schema.keys())

            validation_status = True  # Stays True only if no mismatch is found

            # Direction 1: every column present in the data must be known to the schema.
            for col in data_columns:
                if col not in schema_columns:
                    validation_status = False
                    logger.error(f"Column {col} not found in schema")
                else:
                    logger.info(f"Column {col} is valid")

            # Direction 2: every schema column must be present in the data; otherwise
            # a file that lacks a required feature/target column would wrongly pass.
            for col in schema_columns:
                if col not in data_columns:
                    validation_status = False
                    logger.error(f"Column {col} required by schema is missing from data")

            # Write the validation status to the status file
            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status

        except Exception as e:
            logger.exception(e)
            # Bare raise keeps the original traceback intact.
            raise

# Pipeline execution
# Build the configuration, run the column check and log the outcome. The names
# bound here (config, data_validation_config, data_validation, validation_status)
# remain in the notebook namespace after the cell runs.
try:
    config = ConfigurationManager()  # reads the YAML files and creates the artifacts root
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)
    validation_status = data_validation.validate_all_columns()  # also writes the status file
    logger.info(f"Validation status: {validation_status}")
except Exception as e:
    # Log with traceback, then re-raise so the cell visibly fails.
    # Note: validate_all_columns logs its own exceptions too, so a failure there
    # appears twice in the log.
    logger.exception(e)
    raise e


[2024-07-06 23:52:01,655: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-07-06 23:52:01,674: INFO: common: yaml file: params.yaml loaded successfully]
[2024-07-06 23:52:01,687: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-07-06 23:52:01,690: INFO: common: created directory at: artifacts]
[2024-07-06 23:52:01,697: INFO: common: created directory at: artifacts/data_validation]
[2024-07-06 23:52:01,708: INFO: 363709866: Column Angle of Sandblasting is valid]
[2024-07-06 23:52:01,711: INFO: 363709866: Column Pressure of Sandblasting (bar) is valid]
[2024-07-06 23:52:01,717: INFO: 363709866: Column Temperture of Acid Etching is valid]
[2024-07-06 23:52:01,718: INFO: 363709866: Column Time of Acid Etching (min) is valid]
[2024-07-06 23:52:01,720: INFO: 363709866: Column Voltage of Anodizing (v) is valid]
[2024-07-06 23:52:01,722: INFO: 363709866: Column Time of  Anodizing (min) is valid]
[2024-07-06 23:52:01,724: INFO: 363709866: Column (Sa) Average