In [1]:
import os

In [2]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
# Step up one directory so the relative paths used in later cells resolve from the parent folder.
# NOTE(review): this is not idempotent — re-running the cell moves up one more level each time.
os.chdir("../")

In [4]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [5]:
import pandas as pd 

In [6]:
# Relative path: resolved against the current working directory (changed by the os.chdir cell above).
DATA_PATH = 'data/Sandblasting-Condition.csv'
# Load the raw CSV into a DataFrame for a quick look at its contents.
data = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0,angle_sandblasting,pressure_sandblasting_bar,temperature_acid_etching,time_acid_etching_min,voltage_anodizing_v,time_anodizing_min,sa_surface_roughness_micrometer,cell_viability_percent,Result_Passed_1_Failed_0
0,30,3,25,3,80,1,0.746,75,0
1,40,3,25,3,80,1,0.813,70,0
2,50,3,25,3,80,1,0.952,65,0
3,30,3,25,6,80,1,0.95,77,0
4,30,3,25,9,80,1,1.02,75,0


In [8]:
# List the column labels; the schema check further down compares against these names.
data.columns

Index(['angle_sandblasting', 'pressure_sandblasting_bar',
       'temperature_acid_etching', 'time_acid_etching_min',
       'voltage_anodizing_v', 'time_anodizing_min',
       'sa_surface_roughness_micrometer', 'cell_viability_percent',
       'Result_Passed_1_Failed_0'],
      dtype='object')

In [9]:
# Summarise dtypes and non-null counts per column (printed to stdout by pandas).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   angle_sandblasting               198 non-null    int64  
 1   pressure_sandblasting_bar        198 non-null    int64  
 2   temperature_acid_etching         198 non-null    int64  
 3   time_acid_etching_min            198 non-null    int64  
 4   voltage_anodizing_v              198 non-null    int64  
 5   time_anodizing_min               198 non-null    int64  
 6   sa_surface_roughness_micrometer  198 non-null    float64
 7   cell_viability_percent           198 non-null    int64  
 8   Result_Passed_1_Failed_0         198 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 14.0 KB


In [10]:
# Import necessary libraries
from pathlib import Path
from dataclasses import dataclass
from Dental_Implant_Sandblasting import logger
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt

### Entity: frozen dataclass holding the data-validation settings
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings for the data-validation / preprocessing stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    from the config, params and schema YAML files and are read by
    ``DataValidation``.
    """

    # Directory for this stage; ConfigurationManager creates it before building the entity.
    root_dir: Path
    # File the validation status line is written to.
    # NOTE(review): annotated as str, but ConfigurationManager passes a Path — confirm intended type.
    STATUS_FILE: str
    # Despite the name, this is handed straight to pd.read_csv as the CSV file path.
    unzip_data_dir: Path
    # Expected columns: schema 'COLUMNS' merged with schema 'TARGET_COLUMNS'; only the keys are checked.
    all_schema: dict
    # Columns coerced to numeric (invalid entries become NaN) before imputation.
    columns_to_convert: list
    # n_neighbors passed to KNNImputer.
    knn_n_neighbors: int
    # Exclusive lower / upper limits applied to 'sa_surface_roughness_micrometer'.
    sa_lower_bound: float
    sa_upper_bound: float
    # Columns selected as the feature matrix X.
    feature_columns: list
    # Names of the two target columns selected from the imputed frame.
    target_column_sa: str
    target_column_cv: str
    # Forwarded to train_test_split.
    test_size: float
    random_state: int

### Configuration Manager (in src config): loads the YAML files and builds the stage config
class ConfigurationManager:
    """Reads the project's YAML files and assembles per-stage config entities."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Load config, params and schema (in that order) and create the artifacts root.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
            schema_filepath: Path of the schema YAML.
        """
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the ``DataValidationConfig`` for the data-validation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataValidationConfig: settings taken from the 'data_validation'
            sections of config and params, the merged schema, and the
            'columns_to_convert' list from the 'data_ingestion' config section.
        """
        stage_cfg = self.config['data_validation']

        # Feature columns first, then target columns (targets win on duplicate keys).
        merged_schema = dict(self.schema['COLUMNS'])
        merged_schema.update(self.schema['TARGET_COLUMNS'])

        create_directories([stage_cfg['root_dir']])

        # Keys are looked up in the same order as the entity's keyword arguments.
        settings = {
            'root_dir': Path(stage_cfg['root_dir']),
            'unzip_data_dir': Path(stage_cfg['unzip_data_dir']),  # CSV file path from config.yaml
            'STATUS_FILE': Path(stage_cfg['STATUS_FILE']),
            'all_schema': merged_schema,
            'columns_to_convert': self.config['data_ingestion']['columns_to_convert'],
            'knn_n_neighbors': stage_cfg['knn_n_neighbors'],
            'sa_lower_bound': stage_cfg['sa_lower_bound'],
            'sa_upper_bound': stage_cfg['sa_upper_bound'],
            'feature_columns': stage_cfg['feature_columns'],
            'target_column_sa': stage_cfg['target_column_sa'],
            'target_column_cv': stage_cfg['target_column_cv'],
            'test_size': self.params['data_validation']['test_size'],
            'random_state': self.params['data_validation']['random_state'],
        }
        return DataValidationConfig(**settings)

### Class for validating and preprocessing the data:
class DataValidation:
    """Checks the ingested CSV against the schema and prepares train/test data."""

    def __init__(self, config: DataValidationConfig):
        """Store the stage configuration.

        Args:
            config: Settings for this stage (paths, schema, bounds, split parameters).
        """
        self.config = config

    def validate_all_columns(self) -> bool:
        """Compare the CSV's column names with the schema keys.

        The status is ``True`` only when no schema column is missing from the
        data and no data column is absent from the schema. The result is also
        written to ``config.STATUS_FILE``.

        Returns:
            bool: Whether the dataset's columns match the schema exactly.

        Raises:
            Exception: Any error while reading the CSV or writing the status
                file is logged and re-raised.
        """
        try:
            validation_status = True

            csv_path = str(self.config.unzip_data_dir)  # Convert the Path to a string
            logger.info(f"Reading CSV file from: {csv_path}")

            data = pd.read_csv(csv_path)
            all_cols = list(data.columns)
            all_schema = list(self.config.all_schema.keys())

            logger.info(f"Checking columns in the dataset: {all_cols}")
            logger.info(f"Expected schema columns: {all_schema}")

            # Schema columns that the dataset does not provide.
            missing_cols = [col for col in all_schema if col not in all_cols]
            if missing_cols:
                validation_status = False
                logger.error(f"Missing columns in data: {missing_cols}")

            # Dataset columns that the schema does not know about.
            for col in all_cols:
                if col not in all_schema:
                    validation_status = False
                    logger.error(f"Unexpected column {col} not found in schema")
                else:
                    logger.info(f"Column {col} is valid")

            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status

        except Exception as e:
            logger.exception(f"Error during column validation: {e}")
            raise

    def preprocess_data(self):
        """Impute, flag out-of-range samples, split, and scale the dataset.

        Steps:
            1. Load the CSV and coerce ``config.columns_to_convert`` to numeric.
            2. Fill missing values with ``KNNImputer``.
            3. Set 'cell_viability_percent' to 0 for rows whose
               'sa_surface_roughness_micrometer' lies outside the exclusive
               (sa_lower_bound, sa_upper_bound) range.
            4. Split features and both targets into train/test sets.
            5. Fit ``RobustScaler`` on the training features only and apply it
               to both splits, so test-set statistics do not leak into training.

        Returns:
            tuple: ``(X_train, X_test, y_sa_train, y_sa_test, y_cv_train,
            y_cv_test)``; the X entries are scaled NumPy arrays, the y entries
            are pandas Series.

        Raises:
            ValueError: If the dataset has no rows, or if feature/target
                sample counts disagree after the split.
            Exception: Any other error is logged and re-raised.
        """
        import io  # local import: only needed to capture DataFrame.info() output

        try:
            # Load the dataset
            csv_path = str(self.config.unzip_data_dir)  # Convert the Path to a string
            logger.info(f"Loading dataset from {csv_path}")
            data = pd.read_csv(csv_path)

            # Fail early with a clear message: imputation never changes the row
            # count, so an emptiness check after it could not catch anything new.
            if data.empty:
                raise ValueError("Dataset is empty; nothing to preprocess.")

            # Check for missing values
            missing_values = data.isnull().sum()
            logger.info(f"Missing values:\n{missing_values}")

            # Convert columns to numeric, forcing errors to NaN
            cols_to_convert = self.config.columns_to_convert
            logger.info(f"Converting columns {cols_to_convert} to numeric.")
            data[cols_to_convert] = data[cols_to_convert].apply(pd.to_numeric, errors='coerce')

            # Advanced Imputation using KNN Imputer
            # NOTE(review): the imputer sees every column (targets included) and is
            # fitted before the split; by default it also drops columns that are
            # entirely NaN, which would break the column re-labelling below.
            logger.info(f"Imputing missing values using KNN with {self.config.knn_n_neighbors} neighbors.")
            imputer = KNNImputer(n_neighbors=self.config.knn_n_neighbors)
            data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

            # DataFrame.info() prints to stdout and returns None, so capture its
            # text in a buffer instead of interpolating the return value.
            info_buffer = io.StringIO()
            data_imputed.info(buf=info_buffer)
            logger.info(f"Data after imputing missing values:\n{info_buffer.getvalue()}")

            # Rows whose surface roughness (Sa) lies strictly inside the accepted range.
            sa_values = data_imputed['sa_surface_roughness_micrometer']
            sa_in_range = (sa_values > self.config.sa_lower_bound) & (sa_values < self.config.sa_upper_bound)

            # Samples outside the Sa range get a cell viability of 0.
            data_imputed.loc[~sa_in_range, 'cell_viability_percent'] = 0

            # Separate features and target variables
            feature_columns = self.config.feature_columns
            target_column_sa = self.config.target_column_sa
            target_column_cv = self.config.target_column_cv

            X = data_imputed[feature_columns]
            targets = data_imputed[[target_column_sa, target_column_cv]]

            # Split first so the scaler can be fitted on the training rows only.
            X_train_raw, X_test_raw, y_train, y_test = train_test_split(
                X,
                targets,
                test_size=self.config.test_size,
                random_state=self.config.random_state,
            )

            # Scale with statistics learned from the training split, then reuse
            # them for the test split.
            scaler = RobustScaler()
            X_train = scaler.fit_transform(X_train_raw)
            X_test = scaler.transform(X_test_raw)

            y_sa_train = y_train[target_column_sa]
            y_sa_test = y_test[target_column_sa]
            y_cv_train = y_train[target_column_cv]
            y_cv_test = y_test[target_column_cv]

            logger.info(f"Training set size for Surface Roughness (Sa): {X_train.shape}")
            logger.info(f"Testing set size for Surface Roughness (Sa): {X_test.shape}")
            logger.info(f"Training set size for Cell Viability (CV): {y_cv_train.shape}")
            logger.info(f"Testing set size for Cell Viability (CV): {y_cv_test.shape}")

            if X_train.shape[0] != y_sa_train.shape[0] or X_train.shape[0] != y_cv_train.shape[0]:
                raise ValueError("Mismatch in the number of training samples between features and targets.")
            if X_test.shape[0] != y_sa_test.shape[0] or X_test.shape[0] != y_cv_test.shape[0]:
                raise ValueError("Mismatch in the number of testing samples between features and targets.")

            # Hand the prepared splits back instead of discarding them.
            return X_train, X_test, y_sa_train, y_sa_test, y_cv_train, y_cv_test

        except Exception as e:
            logger.exception(f"Error during data preprocessing: {e}")
            raise

### Pipeline:
# Run the data-validation stage end to end: configure, validate columns, then preprocess.
try:
    # Build the stage settings from the YAML files (default file paths come from the constants module).
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)

    # Validate the dataset's column names against the schema; the status is also written to STATUS_FILE.
    validation_status = data_validation.validate_all_columns()
    logger.info(f"Validation status: {validation_status}")

    # If validation passes, proceed to preprocessing.
    # When it fails, preprocessing is skipped; the status log above is the only record here.
    # The value returned by preprocess_data() is not captured by this cell.
    if validation_status:
        data_validation.preprocess_data()

except Exception as e:
    # Log with traceback, then re-raise so the failure is visible in the notebook output.
    logger.exception(f"Pipeline execution failed: {e}")
    raise e


[2024-09-13 15:23:45,205: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-09-13 15:23:45,213: INFO: common: yaml file: params.yaml loaded successfully]
[2024-09-13 15:23:45,217: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-09-13 15:23:45,221: INFO: common: created directory at: artifacts]
[2024-09-13 15:23:45,224: INFO: common: created directory at: artifacts/data_validation]
[2024-09-13 15:23:45,227: INFO: 2771888681: Reading CSV file from: artifacts\data_ingestion\Sandblasting-Condition.csv]
[2024-09-13 15:23:45,236: INFO: 2771888681: Checking columns in the dataset: ['angle_sandblasting', 'pressure_sandblasting_bar', 'temperature_acid_etching', 'time_acid_etching_min', 'voltage_anodizing_v', 'time_anodizing_min', 'sa_surface_roughness_micrometer', 'cell_viability_percent', 'Result_Passed_1_Failed_0']]
[2024-09-13 15:23:45,241: INFO: 2771888681: Expected schema columns: ['angle_sandblasting', 'pressure_sandblasting_bar', 'temperature_acid_etc