In [27]:
from src import logger
from src.data.make_dataset import DataIngestion
from src.config.configuration import ConfigurationManager
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from src.utils.common import save_pickle, read_pickle, read_yaml
from src.constants import *
from dataclasses import dataclass
from sklearn.preprocessing import StandardScaler
from box import ConfigBox
import numpy as np
# Tag log records emitted from this notebook with the stage name by renaming
# the logger object imported from `src`. The spelling is kept as-is because the
# string appears verbatim in the emitted log lines.
logger.name = "Feature Enginnering"

In [23]:
@dataclass(frozen=True)
class FeatureEngineeringConfig:
    """Read-only bundle of settings consumed by the feature-engineering stage.

    The dataclass is frozen, so attributes cannot be reassigned once an
    instance has been built. Field order is part of the constructor signature.
    """

    # Filled from `config.data_directory.interim_train` by ConfigurationManager.
    train_dir: str
    # Filled from `config.data_directory.interim_test` by ConfigurationManager.
    test_dir: str
    # Filled from `config.artifacts.encoder` by ConfigurationManager.
    encoder_dir: str
    # Filled from `params.data.target_col` by ConfigurationManager.
    target_col: str
    # Filled from `params.data.drop_columns` by ConfigurationManager.
    drop_cols: list


class ConfigurationManager:
    """Load the project's YAML config and params files and build stage configs.

    NOTE: this notebook-local definition shadows the ``ConfigurationManager``
    imported from ``src.config.configuration`` in the imports cell; after this
    cell runs, the name refers to this class.
    """

    def __init__(self,
        config_file_path=CONFIG_FILE_PATH,
        params_file_path=PARAMS_FILE_PATH):
        """Read both YAML files once and keep the parsed results.

        Args:
            config_file_path: Path of the config YAML (defaults to CONFIG_FILE_PATH).
            params_file_path: Path of the params YAML (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

    def get_feature_engineering_config(self) -> FeatureEngineeringConfig:
        """Assemble the feature-engineering settings from the parsed YAML files.

        Returns:
            FeatureEngineeringConfig: paths taken from ``self.config`` and the
            target / drop-column settings taken from ``self.params``.
        """
        # The original call was missing the comma after `target_col=...`,
        # which made the whole cell a SyntaxError.
        return FeatureEngineeringConfig(
            train_dir=self.config.data_directory.interim_train,
            test_dir=self.config.data_directory.interim_test,
            encoder_dir=self.config.artifacts.encoder,
            target_col=self.params.data.target_col,
            drop_cols=self.params.data.drop_columns,
        )
    
    
class FeatureEnginnering:
    """Helpers for the feature-engineering stage.

    Covers null checks, per-column label encoding, standard scaling and column
    dropping. The methods are best-effort: failures are reported through
    ``logger`` rather than raised, and the methods documented as returning
    ``None`` on failure should have their result checked before use.
    """

    def __init__(self):
        pass

    def check_null(self, data: pd.DataFrame) -> None:
        """Log whether ``data`` contains any missing value.

        Emits a debug record when the frame has no nulls and a warning
        otherwise. Nothing is returned; any exception is logged as an error.
        """
        try:
            if data.isnull().sum().sum() == 0:
                logger.debug("Data has no missing values")
            else:
                logger.warning("Data has missing values")
        except Exception as e:
            logger.error(e)

    def fit_label_encoder(self, data: pd.DataFrame) -> dict:
        """Fit one ``LabelEncoder`` per object-dtype column of ``data``.

        Columns that contain missing values are skipped with a warning, and
        columns whose fit raises are skipped with an error record.

        Returns:
            dict: column name -> fitted ``LabelEncoder`` for every column that
            was fitted successfully (possibly empty).
        """
        fitter = {}
        for col in data.select_dtypes(object).columns:
            try:
                # Explicit check instead of `assert cond, logger.warning(...)`:
                # asserts vanish under `python -O`, and the assertion message
                # was always None because logger.warning() returns None.
                if data[col].isnull().any():
                    logger.warning(f"Missing values occured in {col}")
                    continue
                fitter[col] = LabelEncoder().fit(data[col])
            except Exception as e:
                logger.error(e)
        return fitter

    def encoder_transform(self, encoder, data):
        """Replace the columns named in ``encoder`` with their encoded values.

        Args:
            encoder: mapping of column name -> fitted object exposing
                ``transform`` (as produced by ``fit_label_encoder``).
            data: frame to encode. On success it is modified in place.

        Returns:
            The same ``data`` object with the encoded columns, or ``None`` when
            a column is absent, contains missing values, or a transform fails.
            On failure ``data`` is left untouched.
        """
        try:
            # Transform everything first and assign afterwards, so a failure on
            # a later column cannot leave the caller's frame half-encoded.
            encoded = {}
            for col, fitted in encoder.items():
                if data[col].isnull().any():
                    logger.warning(f"Missing values occured in {col}")
                    return None
                encoded[col] = fitted.transform(data[col])
            for col, values in encoded.items():
                data[col] = values
            return data
        except Exception as e:
            logger.error(e)
            return None

    def fit_scalar(self, data: pd.DataFrame) -> "StandardScaler | None":
        """Fit a ``StandardScaler`` on ``data``.

        Returns:
            The fitted scaler, or ``None`` when fitting raises (the error is
            logged). The annotation is a string so it is not evaluated at
            definition time.
        """
        ss = StandardScaler()
        try:
            ss.fit(data)
            logger.info("Standard Scalar fitted sucessfully")
            return ss
        except Exception as e:
            logger.error("Error occured while fitting standard scalar %s", e)
            return None

    def drop_columns(self, data: pd.DataFrame, columns: list) -> "pd.DataFrame | None":
        """Return a copy of ``data`` without the given columns.

        Args:
            data: source frame; it is not modified.
            columns: column labels to remove (a single string is accepted too).

        Returns:
            A new frame without ``columns``, or ``None`` when dropping fails
            (for example because a label is not present); the error is logged.
        """
        try:
            to_drop = [columns] if isinstance(columns, str) else list(columns)
            # Use the `columns=` keyword: labels passed positionally are
            # matched against the row index (axis 0), not the columns.
            remaining = data.drop(columns=to_drop)
            logger.info("Dropped columns: %s", columns)
            return remaining
        except Exception as e:
            # Narrower than the previous bare `except:` and keeps the cause.
            logger.error("Error occured while dropping columns: %s (%s)", columns, e)
            return None
        
            


In [24]:
data_ingestion = DataIngestion()
feature_eng = FeatureEnginnering()
config_manager = ConfigurationManager()
config_params = config_manager.get_feature_engineering_config()

# Reading training data
train = data_ingestion.get_data(config_params.train_dir)
x_train, y_train = data_ingestion.split_x_y(train, config_params.target_col)

# Fit one label encoder per object column and save the mapping.
# FeatureEngineeringConfig is flat: the encoder path lives in `encoder_dir`,
# and there is no `artifacts` attribute on it.
fitter = feature_eng.fit_label_encoder(x_train)
save_pickle(fitter, config_params.encoder_dir)

# Fit the standard scaler and save it.
# The scaler is fitted on an encoded copy: the raw frame still holds the object
# columns the encoders were fitted on, and encoder_transform writes into the
# frame it receives, so x_train itself is kept unencoded here.
x_train_encoded = feature_eng.encoder_transform(fitter, x_train.copy())
ss_fitter = feature_eng.fit_scalar(x_train_encoded)
# FeatureEngineeringConfig carries no scaler path, so read it from the parsed
# YAML directly. NOTE(review): assumes config.yaml defines `artifacts.scalar` - confirm.
save_pickle(ss_fitter, config_manager.config.artifacts.scalar)


2025-01-08 14:51:35,527 - Feature Enginnering - INFO - Yaml read successfully from /Users/goldyrana/mess/deep_learning/projects/telco_customer_churn/config/config.yaml
2025-01-08 14:51:35,529 - Feature Enginnering - INFO - Yaml read successfully from /Users/goldyrana/mess/deep_learning/projects/telco_customer_churn/params.yaml


In [9]:
# Load the saved encoder mapping from the path stored under `artifacts.encoder`
# in the parsed YAML config and rebind `fitter` to it.
# NOTE(review): read_pickle presumably unpickles the file - only point it at
# artifacts written by this pipeline, since unpickling untrusted data can run code.
fitter = read_pickle(config_manager.config.artifacts.encoder)

/Users/goldyrana/mess/deep_learning/projects/telco_customer_churn/artifacts/encoder.pkl


In [14]:
# Apply the loaded encoders to the full training frame and display the result.
# encoder_transform writes the encoded columns back onto the frame it is given,
# so `train` itself is changed by this call; re-running the cell passes the
# already-encoded values to the encoders again.
feature_eng.encoder_transform(fitter, train)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,289,0,0,1,1,57,1,0,2,1,...,1,1,1,1,2,1,0,20.10,134,0
1,2140,1,0,1,1,70,1,2,1,2,...,2,2,2,2,2,1,1,116.55,4925,0
2,85,1,0,1,1,47,1,2,1,0,...,0,0,2,0,0,0,2,87.20,2774,0
3,4070,1,0,0,0,23,1,0,2,1,...,1,1,1,1,1,0,0,20.15,2798,0
4,2074,1,1,0,0,16,1,2,1,0,...,0,0,0,0,0,1,0,74.45,378,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,4158,1,0,1,1,69,1,0,0,2,...,2,0,0,0,2,1,0,59.75,2811,0
5630,387,1,0,0,0,4,1,2,1,0,...,0,0,0,2,0,1,2,85.65,2370,1
5631,3029,1,0,0,0,35,1,2,1,0,...,2,2,2,0,0,0,2,95.50,2395,0
5632,3824,0,0,0,0,36,1,0,1,0,...,2,2,0,2,0,1,2,87.55,2158,1
