In [5]:
from japan_ha.constant.training_pipeline import TARGET_COLUMN
from japan_ha.constant.training_pipeline import DATA_TRANSFORMATION_IMPUTER_PARAMS

from japan_ha.entity.artifacts_entity import (
   DataTransformationArtifact, DataValidationArtifact
)
import sys
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline  # Use imblearn's Pipeline for SMOTE
from imblearn.over_sampling import SMOTE
from japan_ha.entity.config_entity import DataTransformationConfig
from japan_ha.exception.exception import JapanHeartAttackException
from japan_ha.logging.logger import logging
from japan_ha.utils.main_utils.utils import save_numpy_array_data,save_object
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline  # Use imblearn's Pipeline for SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

In [9]:
import pandas as pd

# Read the raw dataset from the notebook's working directory into `df`;
# every later cell derives its train/test frames from this one.
df = pd.read_csv(filepath_or_buffer="japan_heart_attack_dataset.csv")

def get_data_transformer_object(random_state=None):
    '''
    Build the (unfitted) preprocessing + oversampling pipeline.

    Numerical columns are imputed with KNNImputer and standardised.
    Categorical columns are imputed with their most frequent value,
    one-hot encoded and then scaled without centering (with_mean=False).
    Both branches are combined in a ColumnTransformer, which is followed by
    a SMOTE step inside an imblearn Pipeline.

    Because the final step is a sampler, the returned object is meant to be
    used with ``fit_resample`` on training data. To transform held-out data
    without resampling, use the fitted ``named_steps["preprocessor"]`` step.

    Args:
        random_state: Optional seed forwarded to SMOTE. The default (None)
            keeps the previous unseeded behaviour.

    Returns:
        imblearn.pipeline.Pipeline with steps "preprocessor" and "smote".

    Raises:
        JapanHeartAttackException: wraps any exception raised while the
            pipeline is being assembled.
    '''
    try:
        numerical_columns = [
            'Age', 'Cholesterol_Level', 'Stress_Levels', 'BMI',
            'Heart_Rate', 'Systolic_BP', 'Diastolic_BP',
        ]
        categorical_columns = [
            'Gender', 'Region', 'Smoking_History', 'Diabetes_History',
            'Hypertension_History', 'Diet_Quality', 'Alcohol_Consumption',
            'Family_History', "Physical_Activity",
        ]

        # Numerical branch: KNN imputation, then standard scaling.
        num_pipeline = Pipeline(
            steps=[
                ("imputer", KNNImputer()),
                ("scaler", StandardScaler()),
            ]
        )

        # Categorical branch: most-frequent imputation (KNNImputer cannot
        # handle string categories), one-hot encoding, then scaling.
        cat_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy="most_frequent")),
                # handle_unknown="ignore": a category that was not present
                # when the encoder was fitted is encoded as all zeros instead
                # of raising when held-out data is transformed.
                ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
                # with_mean=False: centering is skipped for the one-hot output.
                ("scaler", StandardScaler(with_mean=False)),
            ]
        )

        logging.info(f"Categorical Columns: {categorical_columns}")
        logging.info(f"Numerical Columns: {numerical_columns}")

        # Route each column group to its own branch.
        preprocessor = ColumnTransformer(
            transformers=[
                ("num_pipeline", num_pipeline, numerical_columns),
                ("cat_pipeline", cat_pipeline, categorical_columns),
            ]
        )

        # imblearn's Pipeline is required here: sklearn's Pipeline does not
        # accept a sampler such as SMOTE as a step.
        processor = ImbPipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("smote", SMOTE(sampling_strategy="minority", random_state=random_state)),
            ]
        )

        return processor

    except Exception as e:
        raise JapanHeartAttackException(e, sys) from e

from sklearn.model_selection import train_test_split

# Fixed seed so the split is identical on every Restart & Run All.
SPLIT_RANDOM_STATE = 42
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SPLIT_RANDOM_STATE)

# Separate features from the target for each split.
input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN])
target_feature_train_df = train_df[TARGET_COLUMN]

input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN])
target_feature_test_df = test_df[TARGET_COLUMN]

preprocessor = get_data_transformer_object()

# Fit the preprocessing steps and oversample the minority class on the
# TRAINING split only. The returned target is the resampled training target.
transformed_input_train_feature, target_feature_train_df = preprocessor.fit_resample(
    input_feature_train_df, target_feature_train_df
)

# The test split must be neither refitted on nor resampled: refitting leaks
# test statistics into the transform, and SMOTE would add synthetic rows to
# the evaluation set. Only apply the preprocessing step that was fitted on
# the training data; target_feature_test_df stays the untouched test target.
transformed_input_test_feature = preprocessor.named_steps["preprocessor"].transform(
    input_feature_test_df
)