In [1]:
import os

In [2]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
# Move from research/ up to the project root so that relative paths used
# below (e.g. data/...) resolve. The guard makes the cell idempotent:
# re-running it after the first chdir no longer climbs another level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [5]:
import pandas as pd 

In [6]:
# Relative path: this only resolves because an earlier cell changed the working
# directory to the project root.
DATA_PATH = 'data/Sandblasting-Condition.csv'
# Raw dataset used for the exploratory checks in the next cells.
data = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first rows to confirm the CSV parsed as expected.
data.head()

Unnamed: 0,Angle of Sandblasting,Pressure of Sandblasting (bar),Temperture of Acid Etching,Time of Acid Etching (min),Voltage of Anodizing (v),Time of Anodizing (min),(Sa) Average of Surface roughness (micrometer),Cell Viability (%),"Result (1=Passed, 0=Failed)"
0,30,3,25,3,80,1,0.746,75,0
1,40,3,25,3,80,1,0.813,70,0
2,50,3,25,3,80,1,0.952,65,0
3,30,3,25,6,80,1,0.95,77,0
4,30,3,25,9,80,1,1.02,75,0


In [8]:
# Show the exact column labels. Code that selects columns by name must match
# them verbatim, including the "Temperture" spelling and the double space in
# 'Time of  Anodizing (min)'.
data.columns

Index(['Angle of Sandblasting', 'Pressure of Sandblasting (bar)',
       'Temperture of Acid Etching', 'Time of Acid Etching (min)',
       'Voltage of Anodizing (v)', 'Time of  Anodizing (min)',
       '(Sa) Average of Surface roughness (micrometer)', 'Cell Viability (%)',
       'Result (1=Passed, 0=Failed)'],
      dtype='object')

In [9]:
# Summarise dtypes and non-null counts per column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 9 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Angle of Sandblasting                           102 non-null    int64  
 1   Pressure of Sandblasting (bar)                  102 non-null    int64  
 2   Temperture of Acid Etching                      102 non-null    int64  
 3   Time of Acid Etching (min)                      102 non-null    int64  
 4   Voltage of Anodizing (v)                        102 non-null    int64  
 5   Time of  Anodizing (min)                        102 non-null    int64  
 6   (Sa) Average of Surface roughness (micrometer)  102 non-null    float64
 7   Cell Viability (%)                              102 non-null    int64  
 8   Result (1=Passed, 0=Failed)                     102 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 7.3 KB

In [10]:
from pathlib import Path
from dataclasses import dataclass
from Dental_Implant_Sandblasting import logger
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

### Make data class (entity):
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the data-validation stage.

    Instances are built by ``ConfigurationManager.get_data_validation_config``
    from the ``data_validation`` section of the config YAML and the schema YAML.
    Field order is part of the interface (positional construction).
    """

    # Directory for this stage; ConfigurationManager creates it before building
    # the config object.
    root_dir: Path
    # File that receives the "Validation status: ..." line. Annotated as Path
    # (previously ``str``) to match what ConfigurationManager actually passes.
    STATUS_FILE: Path
    # Path handed to pd.read_csv by DataValidation, i.e. the CSV to validate.
    unzip_data_dir: Path
    # Expected columns: the schema's COLUMNS and TARGET_COLUMNS mappings merged.
    # Only the keys (column names) are used by the validation code in this file.
    all_schema: dict

### Configuration Manager in src config:
class ConfigurationManager:
    """Reads the project's YAML files and assembles per-stage config objects.

    Constructing the manager loads the config, params and schema files and
    creates the directory named by the config's ``artifacts_root`` entry.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the settings object for the data-validation stage.

        Merges the schema's ``COLUMNS`` and ``TARGET_COLUMNS`` mappings into
        one dict, creates the stage's ``root_dir`` and wraps the configured
        locations in ``Path`` objects.

        Returns:
            A populated ``DataValidationConfig``.
        """
        stage_cfg = self.config['data_validation']

        # On a key clash the TARGET_COLUMNS entry wins (it is unpacked last).
        expected_columns = {
            **self.schema['COLUMNS'],
            **self.schema['TARGET_COLUMNS'],
        }

        # The stage directory must exist before anything is written into it.
        create_directories([stage_cfg['root_dir']])

        return DataValidationConfig(
            root_dir=Path(stage_cfg['root_dir']),
            unzip_data_dir=Path(stage_cfg['unzip_data_dir']),
            STATUS_FILE=Path(stage_cfg['STATUS_FILE']),
            all_schema=expected_columns,
        )

###  Components:
###  Class for validating and preprocessing the data:
class DataValidation:
    """Validates the raw CSV against the schema and prepares train/test splits.

    Reads three fields of the supplied config: ``unzip_data_dir`` (CSV to
    load), ``all_schema`` (expected column names as keys) and ``STATUS_FILE``
    (where the validation outcome is written).
    """

    # Column labels are copied verbatim from the CSV header, including the
    # "Temperture" spelling and the double space in "Time of  Anodizing (min)".
    FEATURE_COLUMNS = (
        'Angle of Sandblasting',
        'Pressure of Sandblasting (bar)',
        'Temperture of Acid Etching',
        'Time of Acid Etching (min)',
        'Voltage of Anodizing (v)',
        'Time of  Anodizing (min)',
    )
    SA_COLUMN = '(Sa) Average of Surface roughness (micrometer)'
    CV_COLUMN = 'Cell Viability (%)'
    # Open interval of Sa values treated as valid; rows outside it get their
    # Cell Viability forced to 0.
    SA_VALID_RANGE = (1.5, 2.5)

    def __init__(self, config: DataValidationConfig):
        """Store the stage configuration; no I/O happens here."""
        self.config = config

    def validate_all_columns(self) -> bool:
        """Check that every column of the CSV is declared in the schema.

        The outcome is written to ``config.STATUS_FILE`` as
        ``Validation status: <bool>`` and also returned.

        Returns:
            True when no CSV column is missing from the schema, else False.

        Raises:
            Exception: anything raised while reading the CSV or writing the
                status file is logged and re-raised unchanged.
        """
        try:
            data = pd.read_csv(self.config.unzip_data_dir)
            data_columns = list(data.columns)
            schema_columns = set(self.config.all_schema.keys())

            validation_status = True  # flipped as soon as an unknown column is seen
            for col in data_columns:
                if col in schema_columns:
                    logger.info(f"Column {col} is valid")
                else:
                    # Keep scanning so every offending column is reported,
                    # not only the first one.
                    validation_status = False
                    logger.error(f"Column {col} not found in schema")

            # Schema columns absent from the data are reported but do not
            # change the status, so existing pass/fail outcomes are preserved.
            absent = [col for col in self.config.all_schema if col not in data_columns]
            if absent:
                logger.warning(f"Schema columns missing from data: {absent}")

            # Persist the outcome for later stages / inspection.
            with open(self.config.STATUS_FILE, 'w') as f:
                f.write(f"Validation status: {validation_status}")

            return validation_status

        except Exception as e:
            logger.exception(e)
            raise

    @staticmethod
    def _cap_outliers(series):
        """Clip ``series`` to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] (Tukey fences)."""
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return np.clip(series, q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    def preprocess_data(self, test_size: float = 0.2, random_state: int = 42) -> dict:
        """Clean the CSV and build aligned train/test splits for both targets.

        Steps: coerce the feature/target columns to numeric, mean-impute
        numeric NaNs, zero out Cell Viability where Sa lies outside
        ``SA_VALID_RANGE``, cap Cell Viability outliers, split once, then
        standardise the features with a scaler fitted on the training rows
        only (fitting on all rows would leak test statistics into training).

        Args:
            test_size: Fraction of rows held out for testing.
            random_state: Seed for the shuffle performed by the split.

        Returns:
            dict with keys ``X_train``, ``X_test``, ``y_sa_train``,
            ``y_sa_test``, ``y_cv_train``, ``y_cv_test`` and ``scaler`` (the
            fitted ``StandardScaler``). Earlier versions returned nothing, so
            callers that ignore the return value are unaffected.

        Raises:
            ValueError: if the dataset is empty or the split sizes disagree.
            Exception: any other error is logged and re-raised unchanged.
        """
        import io  # local import: only needed to capture DataFrame.info() text

        try:
            data = pd.read_csv(self.config.unzip_data_dir)

            # Force the modelling columns to numeric; unparsable cells become NaN.
            feature_columns = list(self.FEATURE_COLUMNS)
            cols_to_convert = feature_columns + [self.SA_COLUMN, self.CV_COLUMN]
            data[cols_to_convert] = data[cols_to_convert].apply(pd.to_numeric, errors='coerce')

            # Report missing values after coercion so NaNs introduced by
            # unparsable cells are included in the count.
            missing_values = data.isnull().sum()
            logger.info(f"Missing values:\n{missing_values}")

            # Mean-impute every numeric column.
            data_imputed = data.copy()
            numeric_cols = data_imputed.select_dtypes(include=[np.number]).columns
            data_imputed[numeric_cols] = data_imputed[numeric_cols].fillna(data_imputed[numeric_cols].mean())

            # DataFrame.info() writes to a buffer and returns None, so capture
            # its text explicitly instead of interpolating the return value.
            info_buffer = io.StringIO()
            data_imputed.info(buf=info_buffer)
            logger.info(f"Data after imputing missing values:\n{info_buffer.getvalue()}")

            if data_imputed.empty:
                raise ValueError("Dataset is empty after imputing missing values.")

            # Rows whose Sa is outside the valid open interval get CV = 0.
            sa_low, sa_high = self.SA_VALID_RANGE
            sa_values = data_imputed[self.SA_COLUMN]
            sa_in_range = (sa_values > sa_low) & (sa_values < sa_high)
            data_imputed.loc[~sa_in_range, self.CV_COLUMN] = 0

            # NOTE(review): if most rows were zeroed above, the IQR collapses
            # to 0 and this capping clips every value to 0 - confirm intended.
            y_cv_capped = self._cap_outliers(data_imputed[self.CV_COLUMN])

            X = data_imputed[feature_columns]
            y_sa = data_imputed[self.SA_COLUMN]

            # One call keeps features and both targets aligned on identical rows.
            X_train_raw, X_test_raw, y_sa_train, y_sa_test, y_cv_train, y_cv_test = train_test_split(
                X, y_sa, y_cv_capped, test_size=test_size, random_state=random_state
            )

            # Fit on the training rows only, then apply the same transform to test.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train_raw)
            X_test = scaler.transform(X_test_raw)

            logger.info(f"Training set size for Surface Roughness (Sa): {X_train.shape}")
            logger.info(f"Testing set size for Surface Roughness (Sa): {X_test.shape}")
            logger.info(f"Training set size for Cell Viability (CV): {y_cv_train.shape}")
            logger.info(f"Testing set size for Cell Viability (CV): {y_cv_test.shape}")

            # Cheap invariants on the split sizes.
            if X_train.shape[0] != y_sa_train.shape[0] or X_train.shape[0] != y_cv_train.shape[0]:
                raise ValueError("Mismatch in the number of training samples between features and targets.")
            if X_test.shape[0] != y_sa_test.shape[0] or X_test.shape[0] != y_cv_test.shape[0]:
                raise ValueError("Mismatch in the number of testing samples between features and targets.")

            return {
                "X_train": X_train,
                "X_test": X_test,
                "y_sa_train": y_sa_train,
                "y_sa_test": y_sa_test,
                "y_cv_train": y_cv_train,
                "y_cv_test": y_cv_test,
                "scaler": scaler,
            }

        except Exception as e:
            logger.exception(e)
            raise

### Pipeline:
# Run the validation stage end to end: load config, check the CSV's columns
# against the schema, and preprocess only when that check passes.
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation = DataValidation(config=data_validation_config)

    # Validate the dataset
    validation_status = data_validation.validate_all_columns()
    logger.info(f"Validation status: {validation_status}")

    if validation_status:
        data_validation.preprocess_data()
    else:
        # Make the skip explicit instead of ending the cell silently.
        logger.warning("Column validation failed; skipping preprocessing.")

except Exception as e:
    logger.exception(e)
    # Bare raise re-raises the active exception with its original traceback.
    raise


[2024-08-26 23:56:02,549: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-26 23:56:02,613: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-26 23:56:02,639: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-08-26 23:56:02,643: INFO: common: created directory at: artifacts]
[2024-08-26 23:56:02,647: INFO: common: created directory at: artifacts/data_validation]
[2024-08-26 23:56:02,669: INFO: 4022467435: Column Angle of Sandblasting is valid]
[2024-08-26 23:56:02,671: INFO: 4022467435: Column Pressure of Sandblasting (bar) is valid]
[2024-08-26 23:56:02,673: INFO: 4022467435: Column Temperture of Acid Etching is valid]
[2024-08-26 23:56:02,677: INFO: 4022467435: Column Time of Acid Etching (min) is valid]
[2024-08-26 23:56:02,680: INFO: 4022467435: Column Voltage of Anodizing (v) is valid]
[2024-08-26 23:56:02,684: INFO: 4022467435: Column Time of  Anodizing (min) is valid]
[2024-08-26 23:56:02,687: INFO: 4022467435: Column (Sa) 