In [1]:
import pandas as pd 
import numpy as np
from pathlib import Path
from dataclasses import dataclass
import os
#os.chdir("./stores_sales_prediction")

In [2]:
# Change into the parent of the notebook's starting directory.
# The target is computed only once per kernel session, so re-running this cell
# stays in the same directory instead of climbing one level higher each time
# (which is what a bare relative chdir("../") does on every execution).
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = Path.cwd().parent
os.chdir(PROJECT_ROOT)
os.listdir()

['.git',
 '.github',
 '.gitignore',
 'artifacts',
 'configs',
 'dvc.yaml',
 'env',
 'init_setup.sh',
 'LICENSE',
 'logs',
 'params.yaml',
 'pyproject.toml',
 'README.md',
 'requirements.txt',
 'requirements_dev.txt',
 'research',
 'setup.cfg',
 'setup.py',
 'src',
 'template.py',
 'tests',
 'tox.ini']

In [13]:
from operator import index
from saleStorePredictor.utils import read_yaml,save_numpy_array_data,load_bin,save_bin,create_directories
import os
from saleStorePredictor.entity import DataTransformationConfig
from saleStorePredictor.config import ConfigurationManager
from saleStorePredictor import logging
from sklearn import preprocessing
import sys,os
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd





class FeatureGenerator(BaseEstimator, TransformerMixin):
    """Transformer that adds an ``Item_type_combined`` column.

    The new column holds the first two characters of each value in the
    ``Item_Identifier`` column of the input.
    """

    def __init__(self, columns=None):
        """
        FeatureGenerator Initialization

        Args:
            columns: stored on the instance as ``self.columns``. It is not
                read by ``fit`` or ``transform`` in this class.
        """
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``Item_type_combined`` column added.

        Args:
            X: object supporting ``.copy()`` and ``X['Item_Identifier']``
                returning a Series with a ``.str`` accessor
                (presumably a pandas DataFrame with string identifiers).
            y: ignored.

        Returns:
            A copy of ``X`` with the extra column; ``X`` itself is not modified.

        Raises:
            Exception: any error raised while building the column is logged
                and then re-raised, instead of silently returning ``None``.
        """
        try:
            # Work on a copy so the caller's frame (possibly a slice of a
            # larger frame) is never written to.
            output = X.copy()
            # Vectorised prefix extraction; missing identifiers stay missing
            # rather than failing on a non-subscriptable NaN.
            output["Item_type_combined"] = output["Item_Identifier"].str[:2]
            return output
        except Exception as e:
            logging.error(f"Error generating feature {e}.")
            raise

      





class MultiColumnLabelEncoder:
    """Label-encode several columns at once using ``LabelEncoder``.

    NOTE(review): a fresh ``LabelEncoder`` is fitted on every ``transform``
    call, so the integer assigned to a label depends on the data passed to
    that call; nothing is remembered from ``fit``. Confirm this is acceptable
    when the same instance is applied to separate train and test data.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """No state is learned here; returns ``self``."""
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Returns a new DataFrame built from X (restricted to / labelled with
        self.columns when given) whose columns hold the encoded values.
        '''
        # copy=True so that assigning encoded columns below cannot write
        # through to the data the caller passed in.
        output = pd.DataFrame(X,columns=self.columns,copy=True)
        logging.info(output.columns)
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that works on both old and new versions.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X,y).transform(X)



class DataTransformation:
    """Builds a preprocessing ``ColumnTransformer``, applies it to the train
    and test CSV files named in the config, and saves the transformed arrays
    and the fitted preprocessing object.
    """

    def __init__(self,data_transformation_config:DataTransformationConfig):
        """Store the config and load the train/test CSVs it points to.

        Args:
            data_transformation_config: object providing ``training_dataset``,
                ``test_dataset``, ``schema_path``, ``transformed_train_path_file``,
                ``transformed_test_path_file`` and ``preprocessed_object_path_file``.
        """
        self.data_transformation_config = data_transformation_config
        # These frames are kept as attributes; the methods below do not read
        # them (initiate_data_transformation loads the CSVs again itself).
        self.train = pd.read_csv(self.data_transformation_config.training_dataset,index_col=False)
        self.test = pd.read_csv(self.data_transformation_config.test_dataset,index_col=False)


    def get_data_transformer_object(self)->ColumnTransformer:
        """Assemble the (unfitted) ``ColumnTransformer`` described by the schema.

        The schema file supplies ``numerical_columns`` and
        ``categorical_columns``. Three branches are registered:

        * ``num_pipeline``: mean imputation on the numerical columns.
        * ``cat_pipeline``: ``FeatureGenerator`` followed by most-frequent
          imputation on the categorical columns.
        * ``encoding_scaling``: ``MultiColumnLabelEncoder`` over the
          categorical columns followed by ``StandardScaler(with_mean=False)``,
          fed with numerical + categorical columns.

        Returns:
            The configured ``ColumnTransformer``.

        Raises:
            Exception: any error is logged and then re-raised unchanged.
        """
        try:
            schema_file_path = Path(self.data_transformation_config.schema_path)

            dataset_schema = read_yaml(schema_file_path)

            numerical_columns = dataset_schema.numerical_columns
            categorical_columns = dataset_schema.categorical_columns

            num_pipeline = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy="mean")),
               # ('scaler', StandardScaler())
            ]
            )

            cat_pipeline = Pipeline(steps=[
                ('feature_generator', FeatureGenerator(columns=categorical_columns)),
                ('impute', SimpleImputer(strategy="most_frequent")),
            ]
            )

            encoding_scaling = Pipeline(steps=[
                ('encoding',MultiColumnLabelEncoder(columns=categorical_columns)),
                ('scaler', StandardScaler(with_mean=False))
            ]
            )

            logging.info(f"Categorical columns preprocess: {categorical_columns}")
            logging.info(f"Numerical columns preprocess: {numerical_columns}")

            # Named `preprocessor` so the local does not shadow the imported
            # `sklearn.preprocessing` module.
            preprocessor = ColumnTransformer([
                ('num_pipeline', num_pipeline, numerical_columns),
                ('cat_pipeline', cat_pipeline, categorical_columns),
                ('encoding_scaling', encoding_scaling, numerical_columns + categorical_columns )
            ])
            return preprocessor

        except Exception as e:
            # Log first, then re-raise: with `raise` placed before the log
            # call the message could never be emitted.
            logging.error(f"Error in preprocessing: {e}")
            raise


    def initiate_data_transformation(self):
        """Run the full transformation and persist its outputs.

        Steps: build the preprocessing object, read the train/test CSVs,
        split off the target column named by ``schema.target_column``, fit the
        preprocessor on the training inputs and apply it to both sets, append
        the target as the last column, then save both arrays (``.npz`` file
        names derived from the CSV names) and the fitted preprocessing object.

        Returns:
            None.

        Raises:
            Exception: any error is logged and then re-raised unchanged.
        """
        try:
            logging.info(f"Obtaining preprocessing object.")
            preprocessing_obj = self.get_data_transformer_object()

            logging.info(f"Obtaining training and test file path.")
            train_file_path = Path(self.data_transformation_config.training_dataset)
            test_file_path =  Path(self.data_transformation_config.test_dataset)

            schema_file_path = Path(self.data_transformation_config.schema_path)

            logging.info(f"Loading training and test data as pandas dataframe.")
            train_df = pd.read_csv(train_file_path)

            test_df = pd.read_csv(test_file_path)

            schema = read_yaml(schema_file_path)

            target_column_name = schema.target_column

            logging.info(f"Splitting input and target feature from training and testing dataframe.")
            input_feature_train_df = train_df.drop(columns=[target_column_name],axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name],axis=1)
            target_feature_test_df = test_df[target_column_name]

            logging.info(f"Applying preprocessing object on training dataframe and testing dataframe")
            # Fit on the training inputs only; the test inputs are transformed
            # with the already-fitted object.
            input_feature_train_arr= preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Append the target as the final column of each array.
            train_arr = np.c_[ input_feature_train_arr, np.array(target_feature_train_df)]

            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            transformed_train_dir = Path(self.data_transformation_config.transformed_train_path_file)
            transformed_test_dir = Path(self.data_transformation_config.transformed_test_path_file)

            train_file_name = os.path.basename(train_file_path).replace(".csv",".npz")
            test_file_name = os.path.basename(test_file_path).replace(".csv",".npz")

            transformed_train_file_path = os.path.join(transformed_train_dir, train_file_name)
            transformed_test_file_path = os.path.join(transformed_test_dir, test_file_name)

            logging.info(f"Saving transformed training and testing array.")

            save_numpy_array_data(file_path=transformed_train_file_path,array=train_arr)
            save_numpy_array_data(file_path=transformed_test_file_path,array=test_arr)

            preprocessing_obj_file_path = Path(self.data_transformation_config.preprocessed_object_path_file)

            create_directories([os.path.dirname(preprocessing_obj_file_path)])

            logging.info(f"Saving preprocessing object at {preprocessing_obj_file_path} type {type(preprocessing_obj)}")
            save_bin(preprocessing_obj,preprocessing_obj_file_path)

            logging.info(f"Data transformationa completed successfully.")

        except Exception as e:
            # Log the exception itself (not the bound method
            # `e.with_traceback`) and do it before re-raising so the line is
            # actually reached.
            logging.error(f"Error in data transformation: {e}")
            raise

    def __del__(self):
        # Emits a closing banner line when the instance is finalised.
        logging.info(f"{'>>'*30}Data Transformation log completed.{'<<'*30} \n\n")

    

In [14]:
# Build the configuration, then run the data transformation end to end.
# The manager gets its own name so `config` refers to one kind of object only
# (the data-transformation config), not first the manager and then the config.
config_manager = ConfigurationManager()
config = config_manager.get_data_transformation_config()
trans = DataTransformation(config)
trans.initiate_data_transformation()




[2022-09-24 16:54:33,218: INFO: common]: yaml file: configs\config.yaml loaded successfully
[2022-09-24 16:54:33,220: INFO: common]: yaml file: params.yaml loaded successfully
[2022-09-24 16:54:33,221: INFO: common]: created directory at: artifacts
[2022-09-24 16:54:33,245: INFO: 2476690654]: Obtaining preprocessing object.
[2022-09-24 16:54:33,252: INFO: common]: yaml file: configs\schema.yaml loaded successfully
[2022-09-24 16:54:33,253: INFO: 2476690654]: Categorical columns preprocess: ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
[2022-09-24 16:54:33,254: INFO: 2476690654]: Numerical columns preprocess: ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']
[2022-09-24 16:54:33,255: INFO: 2476690654]: Obtaining training and test file path.
[2022-09-24 16:54:33,256: INFO: 2476690654]: Loading training and test data as pandas dataframe.
[2022-09-24 16:54:33,280: INFO: common]: yaml

In [33]:
# Preview the ingested training CSV.
# The path is assembled with pathlib instead of a backslash string literal:
# "\d", "\i" and "\T" are invalid escape sequences in a normal string, and a
# backslash-separated path only resolves on Windows.
# Note: index_col=False keeps the file's first column as data, so the saved
# index shows up as an "Unnamed: 0" column in the output.
train_csv_path = Path("artifacts") / "data_ingestion" / "ingested_train_dir" / "Train.csv"
pd.read_csv(train_csv_path, index_col=False)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,NCM55,15.60,Low Fat,0.111686,Others,184.7924,OUT010,1998,,Tier 3,Grocery Store,370.1848
1,FDU01,20.25,Regular,0.012063,Canned,183.5924,OUT017,2007,,Tier 2,Supermarket Type1,2406.2012
2,FDJ22,18.75,Low Fat,0.053025,Snack Foods,192.5504,OUT018,2009,Medium,Tier 3,Supermarket Type2,2109.2544
3,FDX31,20.35,Regular,0.000000,Fruits and Vegetables,234.4958,OUT045,2002,,Tier 2,Supermarket Type1,1402.1748
4,FDN21,18.60,Low Fat,0.076841,Snack Foods,162.2236,OUT035,2004,Small,Tier 2,Supermarket Type1,2900.2248
...,...,...,...,...,...,...,...,...,...,...,...,...
5705,NCN29,15.20,Low Fat,0.020280,Health and Hygiene,49.1034,OUT010,1998,,Tier 3,Grocery Store,194.4136
5706,FDP21,,Regular,0.025616,Snack Foods,188.1872,OUT027,1985,Medium,Tier 3,Supermarket Type3,6239.8776
5707,FDO22,13.50,Regular,0.000000,Snack Foods,78.3960,OUT035,2004,Small,Tier 2,Supermarket Type1,1438.1280
5708,FDI52,18.70,Low Fat,0.104890,Frozen Foods,121.4072,OUT045,2002,,Tier 2,Supermarket Type1,3185.1872
