In [1]:
import os

In [2]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting\\research'

In [3]:
# Step up from the research/ notebook folder to the project root so that the
# relative paths used below (config, params, artifacts) resolve from there.
os.chdir("../")

In [4]:
%pwd

'd:\\iNeuron_Projects\\End_to_End_ML_Dental_Implant_Sandblasting'

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from Dental_Implant_Sandblasting import logger
from Dental_Implant_Sandblasting.utils.common import read_yaml, create_directories
from Dental_Implant_Sandblasting.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Path values come from the ``data_transformation`` section of the config
    YAML and the remaining values from the same section of the params YAML
    (see ``ConfigurationManager.get_data_transformation_config``).
    """

    # Stage working directory; it is created when the config is built and the
    # status.txt file is written inside it.
    root_dir: Path
    # CSV file that is read as the raw input data.
    data_path: Path
    # Output CSV *file* path for the training split (passed to ``to_csv``),
    # despite the ``_dir`` suffix in the name.
    transformed_train_dir: Path
    # Output CSV *file* path for the test split (passed to ``to_csv``).
    transformed_test_dir: Path
    # Forwarded to ``train_test_split(test_size=...)``.
    test_size: float
    # Forwarded to ``train_test_split(random_state=...)``.
    random_state: int
    # Forwarded to ``PolynomialFeatures(degree=...)``.
    polynomial_features_degree: int
    # NOTE(review): stored but not read by any code visible in this file;
    # scaling always uses StandardScaler.
    scaling_method: str
    # Forwarded to ``LassoCV(max_iter=...)``.
    lasso_max_iter: int

class ConfigurationManager:
    """Load the project YAML files and build per-stage configuration objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH, schema_filepath=SCHEMA_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Accepted for interface compatibility; it is not
                read by this class.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config['artifacts_root']
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the ``DataTransformationConfig`` for the transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` sections of the config and params files.
        """
        stage_paths = self.config['data_transformation']
        stage_params = self.params['data_transformation']
        create_directories([stage_paths['root_dir']])

        # (dataclass field, key in the config YAML) for every path-valued field.
        path_fields = (
            ('root_dir', 'root_dir'),
            ('data_path', 'data_path'),
            ('transformed_train_dir', 'transformed_train_path'),
            ('transformed_test_dir', 'transformed_test_path'),
        )
        # Parameter fields share their name with the key in the params YAML.
        param_fields = (
            'test_size',
            'random_state',
            'polynomial_features_degree',
            'scaling_method',
            'lasso_max_iter',
        )

        kwargs = {field: Path(stage_paths[key]) for field, key in path_fields}
        kwargs.update({field: stage_params[field] for field in param_fields})
        return DataTransformationConfig(**kwargs)

class DataTransformation:
    """Run the data-transformation stage: load, clean, engineer features, split.

    The stage reads a CSV, coerces every column to numeric and mean-imputes
    missing values, expands the non-target columns into scaled polynomial
    features, selects features with a cross-validated Lasso, and finally
    writes train/test CSV files.
    """

    # Columns that are carried through as targets rather than used as inputs.
    _ROUGHNESS_COL = '(Sa) Average of Surface roughness (micrometer)'
    _VIABILITY_COL = 'Cell Viability (%)'
    _RESULT_COL = 'Result (1=Passed, 0=Failed)'
    _TARGET_COLUMNS = [_ROUGHNESS_COL, _VIABILITY_COL, _RESULT_COL]

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration."""
        self.config = config

    def load_data(self):
        """Read the input CSV and log a short exploratory summary.

        Returns:
            The loaded ``pandas.DataFrame``.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import cell.
        import io

        data = pd.read_csv(self.config.data_path)
        logger.info(f"Data loaded from {self.config.data_path}")

        # Basic Data Exploration
        logger.info(f"Data Head: \n{data.head()}")
        # DataFrame.info() writes to a buffer (stdout by default) and returns
        # None, so capture its text explicitly instead of logging "None".
        info_buffer = io.StringIO()
        data.info(buf=info_buffer)
        logger.info(f"Data Info: \n{info_buffer.getvalue()}")
        logger.info(f"Data Description: \n{data.describe()}")

        return data

    def preprocess_data(self, data):
        """Coerce all columns to numeric and mean-impute missing values.

        The input frame is not modified.

        Args:
            data: Raw ``pandas.DataFrame``.

        Returns:
            A new DataFrame with the same column names, a fresh 0..n-1 index,
            and no missing values.

        Raises:
            ValueError: If a column has no numeric value at all, because a
                mean cannot be computed for it.
        """
        # Convert columns to numeric, forcing any errors to NaN. ``apply``
        # builds a new frame, so the caller's data is left untouched.
        numeric = data.apply(pd.to_numeric, errors='coerce')

        # SimpleImputer silently drops columns that are entirely missing, which
        # would otherwise surface as an obscure column-count mismatch below.
        all_missing = numeric.columns[numeric.isna().all(axis=0)].tolist()
        if all_missing:
            raise ValueError(
                f"Columns contain no numeric values and cannot be mean-imputed: {all_missing}"
            )

        # Handle missing values by imputing
        imputer = SimpleImputer(strategy='mean')
        data_imputed = pd.DataFrame(imputer.fit_transform(numeric), columns=numeric.columns)

        logger.info("Missing values handled")
        return data_imputed

    def feature_engineering(self, data):
        """Create scaled polynomial features and select a subset with Lasso.

        NOTE(review): the scaler and the Lasso selector are fitted on the full
        dataset, before the train/test split performed later in ``execute``,
        so test rows influence scaling and feature selection. Left unchanged
        here because fixing it would alter the produced artifacts.

        Args:
            data: Preprocessed DataFrame containing the three target columns
                plus the input columns.

        Returns:
            A tuple ``(features, targets)``: the selected, scaled polynomial
            features and the three target columns taken from ``data``.
        """
        target_columns = self._TARGET_COLUMNS

        # Generating polynomial features
        features_before_poly = data.drop(columns=target_columns)

        # Print to check columns before transformation
        print("Columns before Polynomial Features Transformation:", features_before_poly.columns)

        poly = PolynomialFeatures(degree=self.config.polynomial_features_degree, include_bias=False)
        poly_features = poly.fit_transform(features_before_poly)

        # Scaling features
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(poly_features)

        # Create a DataFrame with the new features
        feature_columns = poly.get_feature_names_out(features_before_poly.columns)
        engineered_data = pd.DataFrame(scaled_features, columns=feature_columns)

        # Add target columns back to the DataFrame (positionally, via .values)
        for column in target_columns:
            engineered_data[column] = data[column].values

        logger.info("Feature engineering completed")

        # Check for multicollinearity using VIF. The matrix is materialised
        # once instead of once per column.
        design_matrix = engineered_data.values
        vif_data = pd.DataFrame({
            "feature": engineered_data.columns,
            "VIF": [
                variance_inflation_factor(design_matrix, position)
                for position in range(design_matrix.shape[1])
            ],
        })

        logger.info(f"\nVIF before feature selection:\n{vif_data}")

        # Feature selection using Lasso with increased iterations; surface
        # roughness is the target the selector is fitted against.
        lasso = LassoCV(max_iter=self.config.lasso_max_iter)
        lasso.fit(scaled_features, data[self._ROUGHNESS_COL])
        model = SelectFromModel(lasso, prefit=True)
        X_selected = model.transform(scaled_features)

        selected_features = np.array(feature_columns)[model.get_support()]

        logger.info(f"Selected Features:\n{selected_features}")

        return pd.DataFrame(X_selected, columns=selected_features), data[target_columns]

    def train_test_splitting(self, features, targets):
        """Split features/targets and write the train and test CSV files.

        Args:
            features: DataFrame of model inputs.
            targets: DataFrame of target columns, row-aligned with ``features``.
        """
        train_features, test_features, train_targets, test_targets = train_test_split(
            features, targets, test_size=self.config.test_size, random_state=self.config.random_state)

        # train_test_split keeps the original (shuffled) index labels on every
        # piece. Both sides must be reset before a column-wise concat;
        # otherwise pandas aligns on mismatched labels, pairing rows with the
        # wrong targets and introducing NaN rows.
        train_data = pd.concat(
            [train_features.reset_index(drop=True), train_targets.reset_index(drop=True)], axis=1)
        test_data = pd.concat(
            [test_features.reset_index(drop=True), test_targets.reset_index(drop=True)], axis=1)

        train_data.to_csv(self.config.transformed_train_dir, index=False)
        test_data.to_csv(self.config.transformed_test_dir, index=False)

        logger.info(f"Train-test split completed with train shape: {train_data.shape} and test shape: {test_data.shape}")
        print(train_data.shape)
        print(test_data.shape)

    def _write_status(self, succeeded):
        """Record the stage outcome in ``status.txt`` under the stage root."""
        with open(self.config.root_dir / "status.txt", "w") as f:
            f.write(f"Validation status: {bool(succeeded)}")

    def execute(self):
        """Run the whole stage and record success or failure in status.txt.

        Raises:
            Exception: Any error raised by a step is logged and re-raised
                after the failure status has been written.
        """
        try:
            data = self.load_data()
            preprocessed_data = self.preprocess_data(data)
            features, targets = self.feature_engineering(preprocessed_data)
            self.train_test_splitting(features, targets)

            # Create status file
            self._write_status(True)

            logger.info("Data transformation and splitting completed successfully.")
        except Exception as e:
            # Create status file with failure status
            self._write_status(False)

            logger.exception(e)
            # Bare raise re-raises the active exception with its traceback intact.
            raise

# Pipeline execution: build the stage configuration from the YAML files, then
# run the full transformation stage (load -> preprocess -> feature
# engineering -> train/test split).
# NOTE(review): DataTransformation.execute() already logs any exception before
# re-raising it, so a failure inside execute() is logged a second time here.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.execute()
except Exception as e:
    logger.exception(e)
    raise e


[2024-08-09 21:41:24,085: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-09 21:41:24,124: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-09 21:41:24,132: INFO: common: created directory at: artifacts]
[2024-08-09 21:41:24,135: INFO: common: created directory at: artifacts/data_transformation]
[2024-08-09 21:41:24,148: INFO: 3761015650: Data loaded from artifacts\data_ingestion\Sandblasting-Condition.csv]
[2024-08-09 21:41:24,164: INFO: 3761015650: Data Head: 
  Angle of Sandblasting Pressure of Sandblasting (bar)  \
0                    30                              3   
1                    40                              3   
2                    50                              3   
3                    30                              4   
4                    40                              4   

  Temperture of Acid Etching Time of Acid Etching (min)  \
0                         25                          3   
1                     