In [5]:
import os
# IPython magic: display the kernel's current working directory.
%pwd

'c:\\Users\\MEDHA TRUST\\Documents\\Govardhan\\ML\\github\\CustomerChurn\\CustomerChurn\\research'

In [6]:
## command should be in CustomerChurn directory
os.chdir('../')
%pwd

'c:\\Users\\MEDHA TRUST\\Documents\\Govardhan\\ML\\github\\CustomerChurn\\CustomerChurn'

In [26]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only set of filesystem locations for the data-transformation stage.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Folder belonging to this stage.
    root_dir: Path
    # CSV file the stage reads its input from.
    data_path: Path
    # Destination CSV for the training split.
    train_data_path: Path
    # Destination CSV for the test split.
    test_data_path: Path


In [13]:
from ChurnPrediction.constants import *
from ChurnPrediction.utils.common import read_yaml, create_directory

In [22]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects.

    On construction the config, params and schema files are read with
    ``read_yaml`` and the directory named by ``config.artifacts_root`` is
    passed to ``create_directory``.
    """

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH,
            schema_filepath = SCHEMA_FILE_PATH
    ):
        """Read the three YAML files and prepare the artifacts root folder.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directory([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Passes the stage's ``root_dir`` to ``create_directory`` and returns the
        stage's locations as a ``DataTransformationConfig``.

        Returns:
            DataTransformationConfig: the ``data_transformation`` section of
            the loaded config, with every entry converted to ``pathlib.Path``.
        """
        stage = self.config.data_transformation

        create_directory([stage.root_dir])

        # The dataclass fields are annotated as Path, and dataclasses do not
        # convert or validate types at runtime. The loaded values are
        # presumably plain strings (TODO confirm against read_yaml), so they
        # are wrapped here to make the objects match their annotations.
        return DataTransformationConfig(
            root_dir = Path(stage.root_dir),
            data_path = Path(stage.data_path),
            train_data_path = Path(stage.train_data_path),
            test_data_path = Path(stage.test_data_path)
        )
    

In [99]:
import os
import pandas as pd
import numpy as np

from pathlib import Path
from dataclasses import dataclass
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

from ChurnPrediction import logger


In [109]:

class DataTransformation:
    """Cleans, splits and preprocesses the churn dataset.

    Reads the CSV at ``config.data_path``, tidies a few columns, splits the
    rows into train and test sets, fits the preprocessing pipeline on the
    training rows only, applies it to both sets and writes each set (features
    plus the ``Churn`` target) to ``config.train_data_path`` and
    ``config.test_data_path``.
    """

    # Columns routed to the categorical pipeline (impute + ordinal encode).
    CATEGORICAL_COLUMNS = [
        'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
        'Contract', 'PaperlessBilling', 'PaymentMethod'
    ]
    # Columns routed to the numeric pipeline (impute + standardise).
    NUMERIC_COLUMNS = ['tenure', 'MonthlyCharges', 'TotalCharges']
    # Identifier column, dropped before modelling.
    ID_COLUMN = 'customerID'
    # Target column and the numeric code assigned to each of its labels.
    TARGET_COLUMN = 'Churn'
    TARGET_MAPPING = {'Yes': 1, 'No': 0}

    def __init__(self, config: DataTransformationConfig):
        """Store the stage configuration (input and output file locations)."""
        self.config = config

    def get_transformer_object(self) -> ColumnTransformer:
        """Build the (unfitted) preprocessing transformer.

        Returns:
            ColumnTransformer: numeric columns are mean-imputed and
            standardised; categorical columns are imputed with the most
            frequent value and ordinal-encoded. Columns not listed in either
            group are not passed to any pipeline.
        """
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            # The encoder is fitted on training rows only, so a category may
            # show up later that it has never seen; encode it as -1 rather
            # than raising. Known categories are encoded as before.
            ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ])

        # Copies of the class-level lists are handed over so the transformer
        # never shares (or mutates) the class attributes.
        return ColumnTransformer([
            ('numerical_pipeline', numeric_transformer, list(self.NUMERIC_COLUMNS)),
            ('categorical_pipeline', categorical_transformer, list(self.CATEGORICAL_COLUMNS))
        ])

    def _assemble_split(self, values, feature_names, target: pd.Series) -> pd.DataFrame:
        """Combine a transformed feature array with its target column."""
        # Reuse the target's index so the feature rows and the labels are
        # matched explicitly instead of relying on implicit alignment.
        frame = pd.DataFrame(values, columns=feature_names, index=target.index)
        frame[self.TARGET_COLUMN] = target
        return frame

    def transform_data(self, test_size: float = 0.2, random_state: int = 42) -> None:
        """Run the full transformation and write the train/test CSV files.

        The split is stratified on the target. The preprocessing pipeline is
        fitted on the training rows only and then applied to the test rows, so
        no statistics from the test set (means, most frequent values, scaling)
        leak into the training data. With the default arguments the same rows
        land in each split as with a split performed after preprocessing; only
        the fitted statistics differ.

        Args:
            test_size: Fraction of rows placed in the test set.
            random_state: Seed for the shuffling done by the split.

        Raises:
            ValueError: if the target column contains anything other than the
                labels in ``TARGET_MAPPING`` (missing values included).
        """
        data = pd.read_csv(self.config.data_path)

        # The customer identifier is unique per row and carries no signal.
        data = data.drop(columns=[self.ID_COLUMN])

        # 'SeniorCitizen' is treated as a categorical feature; 'TotalCharges'
        # is parsed as numeric and unparseable entries become NaN, which the
        # numeric imputer fills in later.
        data['SeniorCitizen'] = data['SeniorCitizen'].astype('object')
        data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

        # Fail early and clearly on labels the mapping does not cover.
        unexpected = set(data[self.TARGET_COLUMN].unique()) - set(self.TARGET_MAPPING)
        if unexpected:
            raise ValueError(
                f"Unexpected values in '{self.TARGET_COLUMN}': {sorted(unexpected, key=str)}"
            )
        target = data[self.TARGET_COLUMN].map(self.TARGET_MAPPING).astype(int)
        features = data.drop(columns=[self.TARGET_COLUMN])

        # Split BEFORE fitting any transformer.
        X_train, X_test, y_train, y_test = train_test_split(
            features, target,
            test_size=test_size, random_state=random_state, stratify=target
        )

        preprocessor = self.get_transformer_object()
        train_values = preprocessor.fit_transform(X_train)
        test_values = preprocessor.transform(X_test)
        feature_names = preprocessor.get_feature_names_out()

        train_data = self._assemble_split(train_values, feature_names, y_train)
        test_data = self._assemble_split(test_values, feature_names, y_test)

        # Make sure the destination folders exist before writing.
        for destination in (self.config.train_data_path, self.config.test_data_path):
            Path(destination).parent.mkdir(parents=True, exist_ok=True)

        train_data.to_csv(self.config.train_data_path, index=False)
        test_data.to_csv(self.config.test_data_path, index=False)

        logger.info(
            f"Transformed data saved: train {train_data.shape} -> {self.config.train_data_path}, "
            f"test {test_data.shape} -> {self.config.test_data_path}"
        )





In [110]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage config, then transform the dataset and save the splits.
# No try/except wrapper is used: a handler that only re-raises the same
# exception adds nothing, and the notebook already shows the full traceback.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config = data_transformation_config)
data_transformation.transform_data()


[2023-10-15 15:37:59,092: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-10-15 15:37:59,095: INFO: common: yaml file: params.yaml loaded successfully]
[2023-10-15 15:37:59,097: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-10-15 15:37:59,097: INFO: common: directory: artifacts created successfully]
[2023-10-15 15:37:59,097: INFO: common: directory: artifacts/data_transformation created successfully]
