In [3]:
import os

In [4]:
# Show the kernel's working directory before it is changed.
%pwd

'f:\\Faseeh\\New folder\\4_projects\\telco-customer-churn-project\\research'

In [5]:
# Step up from research/ to the project root so that the relative paths used
# later (config files, artifacts directory) resolve.
# The guard makes the cell idempotent: re-running it must not climb a further
# directory level once the kernel is already at the project root.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [6]:
# Confirm the working directory after the change.
%pwd

'f:\\Faseeh\\New folder\\4_projects\\telco-customer-churn-project'

In [7]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory into which the stage writes its artifacts
    # (preprocessor, label encoder, train/test CSV files).
    root_dir: Path
    # Location of the input CSV file that the stage reads.
    data_path: Path

In [8]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories

In [9]:
class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Constructing an instance reads the config, params and schema files and
    creates the top-level artifacts directory.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> tuple:
        """Build the data-transformation settings and fetch the target column.

        Creates the stage's output directory as a side effect.

        Returns:
            tuple: ``(data_transformation_config, target_column)``. The first
            item is a ``DataTransformationConfig``; the second is the schema's
            ``TARGET_COLUMN`` entry, returned exactly as stored in the schema.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        # The dataclass declares both fields as Path, so convert the raw YAML
        # values instead of passing them through untyped.
        data_transformation_config = DataTransformationConfig(
            root_dir=Path(stage_cfg.root_dir),
            data_path=Path(stage_cfg.data_path),
        )
        target_column = self.schema.TARGET_COLUMN
        return data_transformation_config, target_column

In [10]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [14]:
class DataTransformation:
    """Preprocess the input dataset and write train/test CSV splits.

    Numeric feature columns are standardized and categorical (object) feature
    columns are one-hot encoded. The fitted preprocessor, and the label
    encoder when the target is categorical, are saved next to the splits so
    they can be reused at prediction time.
    """

    def __init__(self, config, target_column):
        """Store the stage settings.

        Args:
            config: Object exposing ``data_path`` (input CSV) and ``root_dir``
                (output directory).
            target_column: Name of the column to predict.
        """
        self.config = config
        self.target_column = target_column

    def train_test_splitting(self, test_size=0.25, random_state=42):
        """Split the dataset, fit the preprocessing on the training part and save everything.

        The split is performed on the raw rows first; the preprocessor is then
        fitted on the training rows only and merely applied to the test rows,
        so no statistics from the test set leak into the scaler or encoder.

        Args:
            test_size: Passed to ``train_test_split``; defaults to 0.25.
            random_state: Seed passed to ``train_test_split``; defaults to 42.

        Raises:
            ValueError: If the target column is not among the int64/float64 or
                object columns of the dataset.

        Side effects:
            Writes ``preprocessor.pkl``, ``train.csv`` and ``test.csv`` (and
            ``label_encoder.pkl`` for a categorical target) into
            ``config.root_dir``.
        """
        # Load the dataset
        data = pd.read_csv(self.config.data_path)

        # Identify numerical and categorical columns
        num_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
        cat_features = data.select_dtypes(include=['object']).columns.tolist()

        # Ensure target column is removed from feature lists
        if self.target_column in num_features:
            num_features.remove(self.target_column)
            target_is_numeric = True
        elif self.target_column in cat_features:
            cat_features.remove(self.target_column)
            target_is_numeric = False
        else:
            raise ValueError(f"Target column {self.target_column} not found in dataset.")

        # Split the raw rows BEFORE fitting anything, so the preprocessing
        # statistics are learned from the training rows only.
        raw_features = data.drop(columns=[self.target_column])
        raw_target = data[self.target_column]
        train_features, test_features, train_target_raw, test_target_raw = train_test_split(
            raw_features,
            raw_target,
            test_size=test_size,
            random_state=random_state,
        )

        # Define transformation pipelines
        num_pipeline = Pipeline([
            ('scaler', StandardScaler())  # Standardize numerical features
        ])

        cat_pipeline = Pipeline([
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # One-hot encode categorical features
        ])

        # Create preprocessing pipeline
        preprocessor = ColumnTransformer([
            ('num', num_pipeline, num_features),
            ('cat', cat_pipeline, cat_features)
        ])

        # Fit on the training rows, then apply the fitted transform to the test rows
        train_matrix = preprocessor.fit_transform(train_features)
        test_matrix = preprocessor.transform(test_features)

        # Save the preprocessor for use during prediction
        preprocessor_path = os.path.join(self.config.root_dir, "preprocessor.pkl")
        joblib.dump(preprocessor, preprocessor_path)
        logger.info(f"Saved preprocessor at {preprocessor_path}")

        # Extract correct feature names. With no categorical columns the 'cat'
        # encoder is never fitted, so asking it for names would raise.
        if cat_features:
            fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
            cat_feature_names = fitted_encoder.get_feature_names_out().tolist()
        else:
            cat_feature_names = []
        feature_names = num_features + cat_feature_names

        # Transform target variable
        if target_is_numeric:
            # Keep as-is for regression
            train_target = train_target_raw.values
            test_target = test_target_raw.values
        else:
            # Learn the class vocabulary from the full column so that a class
            # present only in the test rows can still be encoded.
            label_encoder = LabelEncoder()
            label_encoder.fit(raw_target)
            train_target = label_encoder.transform(train_target_raw)
            test_target = label_encoder.transform(test_target_raw)

            # Save LabelEncoder to use in inference
            label_encoder_path = os.path.join(self.config.root_dir, "label_encoder.pkl")
            joblib.dump(label_encoder, label_encoder_path)
            logger.info(f"Saved LabelEncoder at {label_encoder_path}")

        # Convert to DataFrames and append the transformed target column
        train = pd.DataFrame(train_matrix, columns=feature_names)
        train[self.target_column] = train_target
        test = pd.DataFrame(test_matrix, columns=feature_names)
        test[self.target_column] = test_target

        # Save transformed datasets
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info(f"Data transformed and split into training ({train.shape}) and test ({test.shape}) sets.")

In [None]:
# Run the data-transformation stage end to end: load the settings, build the
# stage object and write the preprocessed train/test splits.
try:
    config = ConfigurationManager()
    data_transformation_config, target_column = config.get_data_transformation_config()
    # The schema's TARGET_COLUMN entry is indexed by key; "name" holds the column name.
    data_transformation = DataTransformation(
        config=data_transformation_config,
        target_column=target_column["name"],
    )
    data_transformation.train_test_splitting()
except Exception:
    # Record the failure in the project log, then re-raise with the original
    # traceback intact (a bare `raise` adds no extra frame).
    logger.exception("Data transformation stage failed")
    raise

[2025-03-14 15:14:49,735: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-14 15:14:49,746: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-14 15:14:49,756: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-14 15:14:49,762: INFO: common: created directory at: artifacts]
[2025-03-14 15:14:49,766: INFO: common: created directory at: artifacts/data_transformation]
[2025-03-14 15:14:53,461: INFO: 812645982: Saved preprocessor at artifacts/data_transformation\preprocessor.pkl]
[2025-03-14 15:14:53,548: INFO: 812645982: Saved LabelEncoder at artifacts/data_transformation\label_encoder.pkl]


In [None]:
# Print a completion marker.
print("DOne")

DOne
