In [1]:
import os

In [2]:
%pwd

'd:\\Repositories\\ml-churn\\research'

In [3]:
os.chdir("d://Repositories/ml-churn/")

In [4]:
%pwd

'd:\\Repositories\\ml-churn'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    transformation_path: Path

In [6]:
from Churn_analysis.constants import *
from Churn_analysis.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])


    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            transformation_path= config.transformation_path
            )
        return data_transformation_config

In [8]:
import os 
from Churn_analysis import logger
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from Churn_analysis.utils.common import save_object



In [11]:
class DataTransformation():
    def __init__(self, config: DataTransformationConfig):
        self.config = config


    def get_train_test_data(self):
        """
        Function that get the data, apply 
        get_dummies to categorical data, StandardScaler to numeric
        and LabelEncoder for choice data.
        It also drop unnecesary columns
        """
        # Reading files
        data = pd.read_csv(self.config.data_path)



        Ordinal_variables = ["gender", "SeniorCitizen","Partner","Dependents","Churn"]
        Numeric_variables = ["tenure","MonthlyCharges","TotalCharges"]
        One_hot_variables = ["PhoneService","MultipleLines", "InternetService","OnlineSecurity","OnlineBackup",
                            "DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract","PaperlessBilling",
                            "PaymentMethod"]

        #------------Complete data transformation-----------------------------------------------------------
        preprocessor = ColumnTransformer(transformers=[
                                        ("Ordinal", OrdinalEncoder(), Ordinal_variables),
                                        ("Standard", StandardScaler(), Numeric_variables),
                                        ("Onehot", OneHotEncoder(), One_hot_variables)
                                        ])

        input_encoder = preprocessor.fit(data.drop(columns=["customerID"]))

        Encoded_data = input_encoder.transform(data.drop(columns=["customerID"]))
        Encoded_df = pd.DataFrame(Encoded_data , columns= preprocessor.get_feature_names_out())

        Encoded_df.drop(columns= ["Onehot__OnlineSecurity_No internet service",
                                  "Onehot__OnlineBackup_No internet service",
                                  "Onehot__DeviceProtection_No internet service",
                                  "Onehot__TechSupport_No internet service",
                                  "Onehot__StreamingTV_No internet service",
                                  "Onehot__StreamingMovies_No internet service"], inplace= True)
        
        # --------------------------------------------------------------------------------------------------
        save_object(self.config.transformation_path, preprocessor)

        # Splitting the data
        train, test = train_test_split(Encoded_df,test_size=0.3, random_state=42, stratify=data["Churn"] )
        
        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index= False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index= False)

        strat_test  = test[test["Ordinal__Churn"]==1].sum()/test[test["Ordinal__Churn"]==1].count()
        strat_train = train[train["Ordinal__Churn"]==1].sum()/train[train["Ordinal__Churn"]==1].count()

        logger.info(f"Transformed and splitted data, stratification of train data {strat_train} stratification of test data {strat_test}")
        logger.info(train.shape)
        logger.info(test.shape)
        

In [12]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config = data_transformation_config)
    data_transformation.get_train_test_data()

except Exception as e:
    raise e

[2023-12-16 12:59:49,157: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-16 12:59:49,160: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-16 12:59:49,168: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-16 12:59:49,169: INFO: common: created directory at: artifacts]
[2023-12-16 12:59:49,170: INFO: common: created directory at: artifacts/data_transformation]


[2023-12-16 12:59:49,423: INFO: 1438513477: Transformed and splitted data, stratification of train data Ordinal__gender                                    0.503058
Ordinal__SeniorCitizen                             0.256116
Ordinal__Partner                                   0.361621
Ordinal__Dependents                                0.167431
Ordinal__Churn                                     1.000000
Standard__tenure                                  -0.579689
Standard__MonthlyCharges                           0.335271
Standard__TotalCharges                            -0.318485
Onehot__PhoneService_No                            0.091743
Onehot__PhoneService_Yes                           0.908257
Onehot__MultipleLines_No                           0.442661
Onehot__MultipleLines_No phone service             0.091743
Onehot__MultipleLines_Yes                          0.465596
Onehot__InternetService_DSL                        0.246177
Onehot__InternetService_Fiber optic                0.697