In [1]:
import os

In [2]:
%pwd

'c:\\Users\\SIR\\Credit-Card-Default-Prediction-Project-Pwskills\\research'

In [3]:
# Step up from the notebook folder to the project root so that the relative
# config and artifact paths used below resolve. The move is guarded on the
# notebook folder name so that re-running this cell does not keep climbing
# further up the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\SIR\\Credit-Card-Default-Prediction-Project-Pwskills'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable set of filesystem paths consumed by the data-transformation stage.

    ``frozen=True`` makes instances read-only after construction.
    """
    # Directory that receives the transformed training CSV.
    transformed_train_dir: Path
    # Directory that receives the transformed test CSV.
    transformed_test_dir: Path
    # Destination file for the pickled scaler.
    scaler_file: Path
    # CSV file the training split is read from.
    train_file: Path
    # CSV file the test split is read from.
    test_file: Path


In [6]:
from ccdp.constants import *
from ccdp.utils.common import read_yaml, create_directories
#from pathlib import Path
#from src.ccdp.entity.data_transformation import DataTransformationConfig

class ConfigurationManager:
    """Load the project YAML files and hand out per-stage configuration objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read the config and params YAML files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation configuration.

        Creates the train/test output directories named in the
        ``data_transformation`` section, then wraps every path of that
        section in a ``Path``.

        Returns:
            A populated ``DataTransformationConfig``.
        """
        section = self.config.data_transformation

        output_dirs = [section.transformed_train_dir, section.transformed_test_dir]
        create_directories(output_dirs)

        # Every field of the dataclass is a path read from the same-named key.
        path_fields = (
            "transformed_train_dir",
            "transformed_test_dir",
            "scaler_file",
            "train_file",
            "test_file",
        )
        return DataTransformationConfig(
            **{field: Path(getattr(section, field)) for field in path_fields}
        )


In [7]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from pathlib import Path
from ccdp.logging import logger
import pickle


In [8]:
class DataTransformation:
    """Scale numeric features and label-encode categorical features of the train/test splits.

    All preprocessing state (the scaler statistics, the per-column label
    encoders and the column lists they apply to) is learned from the training
    split and then re-applied unchanged to the test split. Both splits therefore
    share one feature space, no test-set statistics leak into the transformation,
    and the scaler that gets pickled is the one fitted on the training data.

    Note: only the scaler is persisted, because the configuration provides a
    path for the scaler alone. The fitted label encoders are kept in
    ``self.encoders`` for the lifetime of the instance.
    """

    # Code assigned to categorical values that were not seen while fitting.
    UNKNOWN_CATEGORY_CODE = -1

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration and create the (unfitted) transformers.

        Args:
            config: Paths for the input splits, output directories and scaler file.
        """
        self.config = config
        self.scaler = StandardScaler()
        # Fitted LabelEncoder per categorical column, filled by transform_data(fit=True).
        self.encoders = {}
        # Column lists captured at fit time; None until the first fit.
        self.numerical_columns = None
        self.categorical_columns = None

    def transform_data(self, X: pd.DataFrame, fit: bool = True) -> pd.DataFrame:
        """Return a transformed copy of ``X``; the input frame is left untouched.

        Args:
            X: Feature frame to transform.
            fit: When True (the default) the scaler and the label encoders are
                fitted on ``X`` before being applied. When False the
                previously fitted transformers and column lists are reused, which
                is what must happen for test / inference data.

        Returns:
            A new DataFrame with int64/float64 columns standard-scaled and object
            columns replaced by integer codes.

        Raises:
            ValueError: If ``fit`` is False and nothing has been fitted yet.
        """
        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()

        if fit:
            self.numerical_columns = list(X.select_dtypes(include=['int64', 'float64']).columns)
            self.categorical_columns = list(X.select_dtypes(include=['object']).columns)
            self.encoders = {}
        elif self.numerical_columns is None or self.categorical_columns is None:
            raise ValueError(
                "transform_data(fit=False) was called before the transformers were fitted; "
                "call transform_data(X_train) first."
            )

        # Apply standard scaling to numerical features (skipped when there are
        # none, because the scaler rejects an input with zero features).
        if self.numerical_columns:
            numeric_values = X[self.numerical_columns]
            if fit:
                scaled = self.scaler.fit_transform(numeric_values)
            else:
                scaled = self.scaler.transform(numeric_values)
            X[self.numerical_columns] = scaled

        # Apply label encoding to categorical features.
        for column in self.categorical_columns:
            if fit:
                encoder = LabelEncoder()
                X[column] = encoder.fit_transform(X[column])
                self.encoders[column] = encoder
            else:
                X[column] = self._encode_with_fitted(column, X[column])

        return X

    def _encode_with_fitted(self, column: str, values: pd.Series) -> pd.Series:
        """Encode ``values`` with the encoder fitted for ``column``; unseen labels get UNKNOWN_CATEGORY_CODE."""
        encoder = self.encoders[column]
        # A label's code is its position in the encoder's classes_ array.
        code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
        encoded = values.map(code_by_label)

        unseen_count = int(encoded.isna().sum())
        if unseen_count:
            logger.warning(
                f"Column '{column}': {unseen_count} value(s) not seen during fitting "
                f"were encoded as {self.UNKNOWN_CATEGORY_CODE}"
            )
        return encoded.fillna(self.UNKNOWN_CATEGORY_CODE).astype("int64")

    def save_transformed_data(self, X: pd.DataFrame, y: pd.Series, file_name: str, is_train: bool = True):
        """Write features and target side by side to a CSV in the train or test output directory.

        Args:
            X: Transformed feature frame.
            y: Target column; concatenated to the right of ``X``.
            file_name: Name of the CSV file to create inside the output directory.
            is_train: Selects ``transformed_train_dir`` when True, otherwise
                ``transformed_test_dir``.
        """
        # Wrapping in Path keeps the "/" join working even if a plain string is configured.
        output_dir = Path(self.config.transformed_train_dir if is_train else self.config.transformed_test_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        transformed_df = pd.concat([X, y], axis=1)
        file_path = output_dir / file_name
        transformed_df.to_csv(file_path, index=False)
        logger.info(f"Transformed data saved to {file_path}")

    def transform_and_save(self, target_column: str = "default"):
        """Run the full stage: load both splits, transform them, write the CSVs and the scaler.

        Args:
            target_column: Name of the label column that is split off before the
                features are transformed and re-attached when saving.
        """
        # Load train and test datasets
        train_df = pd.read_csv(self.config.train_file)
        test_df = pd.read_csv(self.config.test_file)

        # Separate features and target
        X_train = train_df.drop(columns=[target_column])
        y_train = train_df[target_column]

        X_test = test_df.drop(columns=[target_column])
        y_test = test_df[target_column]

        # Fit the transformers on the training split only ...
        logger.info("Starting data transformation for training data...")
        X_train_transformed = self.transform_data(X_train, fit=True)

        # ... and re-apply them, unchanged, to the test split.
        logger.info("Starting data transformation for test data...")
        X_test_transformed = self.transform_data(X_test, fit=False)

        # Save the transformed data
        self.save_transformed_data(X_train_transformed, y_train, "transformed_train.csv", is_train=True)
        self.save_transformed_data(X_test_transformed, y_test, "transformed_test.csv", is_train=False)

        # Save the scaler used during the transformation
        self.save_scaler()

    def save_scaler(self):
        """Pickle the scaler to ``config.scaler_file``, creating its parent directory if needed."""
        scaler_path = Path(self.config.scaler_file)
        # Path.parent is "." for a bare file name, so this also works when the
        # configured path has no directory component.
        scaler_path.parent.mkdir(parents=True, exist_ok=True)
        with open(scaler_path, "wb") as scaler_file:
            pickle.dump(self.scaler, scaler_file)
        logger.info(f"Scaler saved to {self.config.scaler_file}")


In [9]:
try:
    # Configuration for Data Transformation
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()

    # Initialize the DataTransformation component
    data_transformation = DataTransformation(config=data_transformation_config)

    # Perform the data transformation and save the results
    data_transformation.transform_and_save()

    logger.info("Data transformation pipeline completed successfully.")

except Exception as e:
    # Log the full traceback, then re-raise so a failed run is visible as a
    # failed cell instead of being reduced to a single log line.
    logger.exception(f"Pipeline failed due to: {e}")
    raise


[2024-08-20 23:24:15,937: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-20 23:24:15,945: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-20 23:24:15,949: INFO: common: created directory at: artifacts]
[2024-08-20 23:24:15,953: INFO: common: created directory at: artifacts/data_transformation/train]
[2024-08-20 23:24:15,955: INFO: common: created directory at: artifacts/data_transformation/test]
[2024-08-20 23:24:16,175: INFO: 409266770: Starting data transformation for training data...]
[2024-08-20 23:24:16,242: INFO: 409266770: Starting data transformation for test data...]
[2024-08-20 23:24:17,757: INFO: 409266770: Transformed data saved to artifacts\data_transformation\train\transformed_train.csv]
[2024-08-20 23:24:18,018: INFO: 409266770: Transformed data saved to artifacts\data_transformation\test\transformed_test.csv]
[2024-08-20 23:24:18,023: INFO: 409266770: Scaler saved to artifacts\data_transformation\scaler.pkl]
[2024-08-20 23:2