In [1]:
import os

In [2]:
%pwd

'd:\\Repositories\\ml-churn\\research'

In [3]:
# Move from the notebook folder to the repository root so that the relative
# paths used below (config/config.yaml, artifacts/...) resolve.
# The check makes the cell portable (no machine-specific absolute path) and
# idempotent: re-running it does not climb further up the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir(os.pardir)

In [4]:
%pwd

'd:\\Repositories\\ml-churn'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the project configuration.
    """
    # Directory created for this stage; train.csv and test.csv are written here.
    root_dir: Path
    # CSV file that is read as the input dataset.
    data_path: Path
    # Not used by the code visible in this notebook -- presumably the location
    # for a persisted transformer; TODO confirm against the config file.
    transformation_path: Path

In [6]:
from Churn_analysis.constants import *
from Churn_analysis.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(self,
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH,
                 schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: location of the main configuration YAML.
            params_filepath: location of the parameters YAML.
            schema_filepath: location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)
        create_directories([self.config.artifacts_root])


    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the main configuration.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The dataclass declares Path fields, so convert the raw YAML values
        # instead of passing them through unchanged.
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            transformation_path=Path(section.transformation_path),
        )

In [12]:
import os 
from Churn_analysis import logger
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split


In [13]:
class DataTransformation():
    """Encode, scale and split the churn dataset into train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        """Keep the stage configuration (input CSV path and output directory)."""
        self.config = config


    def get_train_test_data(self):
        """
        Transform the dataset and write a stratified train/test split to disk.

        Steps:
          1. Read the CSV at ``config.data_path``.
          2. One-hot encode the columns at positions 6 to 17 (``get_dummies``).
          3. Drop ``customerID`` and the "No internet service" dummy columns.
          4. Label-encode gender, SeniorCitizen, Partner, Dependents and Churn.
          5. Split 70/30, stratified on ``Churn`` (``random_state=42``).
          6. Standardise tenure, MonthlyCharges and TotalCharges with a scaler
             fitted on the training rows only.
          7. Write ``train.csv`` and ``test.csv`` into ``config.root_dir``.

        Returns:
            None. The results are the two CSV files and the log messages.
        """
        # Reading files
        data = pd.read_csv(self.config.data_path)

        # The columns to one-hot encode are picked by position, so this relies
        # on the column order of the input file. Fancy indexing (rather than a
        # slice) raises IndexError if the file has fewer columns than expected.
        categorical_columns = list(data.columns[list(range(6, 18))])

        # One call yields the same column order as encoding one column at a
        # time: untouched columns first, then the dummies in the given order.
        data = pd.get_dummies(data, columns=categorical_columns, dtype=int)

        # Dropping unnecessary columns
        data = data.drop(columns=["customerID","OnlineSecurity_No internet service","OnlineBackup_No internet service",
                                  "DeviceProtection_No internet service","TechSupport_No internet service","StreamingTV_No internet service",
                                  "StreamingMovies_No internet service"])

        label_columns = ["gender", "SeniorCitizen","Partner","Dependents","Churn"]
        numeric_columns = ["tenure","MonthlyCharges","TotalCharges"]

        # fit_transform refits the encoder for every column, so one instance
        # can be reused across columns.
        label_encoder = LabelEncoder()
        data[label_columns] = data[label_columns].apply(label_encoder.fit_transform)

        # Splitting the data (done before scaling so the scaler never sees test rows)
        train, test = train_test_split(data, test_size=0.3, random_state=42, stratify=data["Churn"])
        # Work on copies so the column assignments below do not write into
        # views of `data`.
        train = train.copy()
        test = test.copy()

        # Fit the scaler on the training rows only and reuse its statistics for
        # the test rows; fitting on the full dataset leaks test information.
        scaler = StandardScaler()
        train[numeric_columns] = scaler.fit_transform(train[numeric_columns])
        test[numeric_columns] = scaler.transform(test[numeric_columns])

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        # Churn is 0/1 after label encoding, so its mean is the share of
        # churned customers; the two values should match for a stratified split.
        strat_train = train["Churn"].mean()
        strat_test = test["Churn"].mean()

        logger.info(f"Transformed and splitted data, stratification of train data {strat_train:.4f}, "
                    f"stratification of test data {strat_test:.4f}")
        logger.info(train.shape)
        logger.info(test.shape)
        

In [15]:
# Run the data-transformation stage end to end. Any exception propagates
# unchanged; a handler that only re-raises adds nothing but a traceback frame.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config = data_transformation_config)
data_transformation.get_train_test_data()

[2023-12-15 12:02:15,857: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-15 12:02:15,859: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-15 12:02:15,861: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-12-15 12:02:15,863: INFO: common: created directory at: artifacts]
[2023-12-15 12:02:15,864: INFO: common: created directory at: artifacts/data_transformation]
[2023-12-15 12:02:16,001: INFO: 3677636830: Transformed and splitted data]
[2023-12-15 12:02:16,002: INFO: 3677636830: (4922, 37)]
[2023-12-15 12:02:16,002: INFO: 3677636830: (2110, 37)]
