In [6]:
from dataclasses import dataclass
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from src.logger import logging
from src.exception import StudentPerformanceException
from src.components.data_transformation import DataTransformation


@dataclass
class DataIngestionConfig:
    """Define where the output files will be located"""
    artifact_path:str = os.path.join("artifacts")
    data_train_file_path: str = os.path.join(artifact_path, "train.csv")
    data_test_file_path: str = os.path.join(artifact_path, "test.csv")
    raw_data_file_path: str = os.path.join(artifact_path, "raw.csv")

class DataIngestion:
    """Tread data from data source"""
    def __init__(self)-> DataIngestionConfig:
        self.ingestion_config = DataIngestionConfig()

    def initiate_data_ingestion(self):
        logging.info("Entered the data ingestion method or component")
        try:
            df =  pd.read_csv('notebook/data/stud.csv')
            logging.info('Read the dataset as DataFrame')

            os.makedirs(os.path.dirname(self.ingestion_config.data_train_file_path), exist_ok=True)
            df.to_csv(self.ingestion_config.raw_data_file_path, index=False, header=True)
            # = os.makedirs(os.path.dirname('artifacts/train.csv'), exist_ok=True) = os.path.dirname('artifacts/train.csv')  →  'artifacts'
            df.to_csv(self.ingestion_config.raw_data_file_path, index=False, header=True)
            logging.info('Train and Test Splitting Initiated')
            train_set, test_set = train_test_split(df, test_size=0.2,random_state=42)
            train_set.to_csv(self.ingestion_config.data_train_file_path, index=False, header = True)
            test_set.to_csv(self.ingestion_config.data_test_file_path, index=False, header = True)
            logging.info('Data Ingestion Completed!')
            return (
                self.ingestion_config.data_train_file_path,
                self.ingestion_config.data_test_file_path,
            )
        except Exception as e:
            raise StudentPerformanceException(e,sys)

if __name__=="__main__":
    obj=DataIngestion()
    train_data,test_data=obj.initiate_data_ingestion()
"""
    transformation_obj = DataTransformation()
    train_arr, test_arr = transformation_obj.initiate_data_transformation(train_data,test_data)
"""

'\n    transformation_obj = DataTransformation()\n    train_arr, test_arr = transformation_obj.initiate_data_transformation(train_data,test_data)\n'

In [None]:
import os
import sys
import numpy as np
import pandas as pd
from dataclasses import dataclass
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from src.logger import logging
from src.exception import StudentPerformanceException
from src.utils import save_object

@dataclass
class DataTransformationConfig:
    """Define where the output of data transformation will be store."""
    preprocessor_obj_file_path= os.path.join('artifacts', "preprocessor.pkl")

class DataTransformation:
    def __init__(self):
        self.data_transformation_config = DataTransformationConfig()
    
    def get_data_transformation(self):
        """ 
        This function is responsible for data transformation.
        """
        try:
            # Define numerical and catagorical columns
            numerical_columns = ["reading_score", "writing_score"]
            catagorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "est_preparation_course"
            ]

            num_pipeline = Pipeline(
                steps=[
                    ("imputer",SimpleImputer(strategy="mean")),
                    ("scaler",StandardScaler())
                ]
            )
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer",SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                    ("scaler", StandardScaler())
                ]
            )

            logging.info(f"Numerical Columns: {numerical_columns}")
            logging.info(f"Catagorical Columns: {catagorical_columns}")
            # ColumnTransform(transformer, ...). transformers : list of tuples List of (name, transformer, columns)
            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline,numerical_columns),
                    ("cat_pipeline", cat_pipeline, catagorical_columns)
                ]
            )

            return preprocessor

        except Exception as e:
            raise StudentPerformanceException(e,sys)
    def initiate_data_transformation(self,train_path,test_path):
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info("Entering reading Training Set")
            logging.info("Obtaining the preprocessor")
            preprocessing_obj = self.get_data_transformation()
            target_column_name = "math_score"
            numerical_columns = ["writing_score", "reading_score"]

            input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
            target_feature_test_df = test_df[target_column_name]

            logging.info(f"Applying preprocessing to obtain training DataFram and test DataFrame.")

            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Transform method apply to training arrays to avoid information leaking

            train_arr = np.c_[
                input_feature_train_arr, np.array(target_feature_train_df)
            ]
            test_arr = np.c_[
                input_feature_test_arr, np.array(target_feature_test_df)
            ]

            logging.info("Saved preprocessing object.")
            save_object(
                file_path = self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )

        except Exception as e:
            raise StudentPerformanceException(e,sys)

In [8]:
print(train_data)

artifacts\train.csv


In [10]:
train_dataframe = pd.read_csv(train_data)
train_dataframe

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group D,master's degree,standard,none,62,70,75
1,female,group C,bachelor's degree,free/reduced,completed,66,83,83
2,female,group D,some college,free/reduced,none,79,89,86
3,male,group C,master's degree,free/reduced,none,61,67,66
4,male,group E,high school,standard,none,73,64,57
...,...,...,...,...,...,...,...,...
795,female,group D,master's degree,standard,none,87,100,100
796,male,group C,bachelor's degree,standard,none,69,63,61
797,female,group C,associate's degree,standard,none,53,62,53
798,male,group C,some college,free/reduced,completed,50,48,53


In [9]:
print(test_data)

artifacts\test.csv


In [11]:
test_dataframe = pd.read_csv(train_data)
test_dataframe

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group D,master's degree,standard,none,62,70,75
1,female,group C,bachelor's degree,free/reduced,completed,66,83,83
2,female,group D,some college,free/reduced,none,79,89,86
3,male,group C,master's degree,free/reduced,none,61,67,66
4,male,group E,high school,standard,none,73,64,57
...,...,...,...,...,...,...,...,...
795,female,group D,master's degree,standard,none,87,100,100
796,male,group C,bachelor's degree,standard,none,69,63,61
797,female,group C,associate's degree,standard,none,53,62,53
798,male,group C,some college,free/reduced,completed,50,48,53
