In [17]:
import os
import sys
from src.logger import logging
from src.exception import shippingException


from src.components.ingestion import DataIngestion
from src.components.transformation import DataTransformation
from src.components.training import ModelTrainer

# Run ingestion and keep the returned CSV locations: the transformation cell
# further down reads `train_data_path` / `test_data_path`, so they must be
# bound here instead of being discarded.
obj = DataIngestion()
train_data_path, test_data_path = obj.initiate_data_ingestion()

# Last expression of the cell -> still displayed as the cell output.
train_data_path, test_data_path

('artifacts\\train.csv', 'artifacts\\test.csv')

In [16]:
import pandas as pd
import numpy as np
from src.logger import logging
from src.exception import shippingException
import sys



import os
import sys
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataIngestionConfig:
    """File locations written by the data-ingestion step."""

    # Full dataset as saved before it is split.
    raw_data_path: str = os.path.join("artifacts", "raw.csv")
    # Training partition.
    train_data_path: str = os.path.join("artifacts", "train.csv")
    # Held-out test partition.
    test_data_path: str = os.path.join("artifacts", "test.csv")

class DataIngestion:
    """Read the source CSV, save a raw copy, and write train/test splits.

    Output locations are taken from ``DataIngestionConfig``.
    """

    # Location the notebook originally read from. Kept as the default so that
    # calling ``initiate_data_ingestion()`` with no arguments behaves as before.
    DEFAULT_SOURCE_PATH = r'E:\data_science\project\Internship\SCMS_Delivery_History_Dataset.csv'

    def __init__(self):
        self.ingestion_config=DataIngestionConfig()

    def initiate_data_ingestion(self, source_path=None, test_size=0.25, random_state=42):
        """Load the dataset, normalise its column names and persist the splits.

        Args:
            source_path: CSV file to read. ``None`` (default) falls back to
                ``DEFAULT_SOURCE_PATH``.
            test_size: Forwarded to ``train_test_split`` (default 0.25).
            random_state: Forwarded to ``train_test_split`` so that re-running
                ingestion produces the same split. Pass ``None`` for a
                different random split on every run.

        Returns:
            Tuple ``(train_data_path, test_data_path)`` from the config.

        Raises:
            shippingException: wraps any exception raised while ingesting.
        """
        logging.info("data ingestion started")
        try:
            csv_path = self.DEFAULT_SOURCE_PATH if source_path is None else source_path
            data = pd.read_csv(csv_path)

            # Column names are lower-cased and spaces become underscores.
            data.columns = data.columns.str.lower()
            data.columns = data.columns.str.replace(" ", "_")
            logging.info(" reading a df")

            config = self.ingestion_config

            # Make sure every output directory exists, not only the raw one.
            # dirname() is '' for a bare file name, which makedirs rejects.
            for out_path in (config.raw_data_path, config.train_data_path, config.test_data_path):
                out_dir = os.path.dirname(out_path)
                if out_dir:
                    os.makedirs(out_dir, exist_ok=True)

            data.to_csv(config.raw_data_path, index=False)
            logging.info(" i have saved the raw dataset in artifact folder")

            logging.info("here i have performed train test split")
            train_data, test_data = train_test_split(
                data, test_size=test_size, random_state=random_state
            )
            logging.info("train test split completed")

            train_data.to_csv(config.train_data_path, index=False)
            test_data.to_csv(config.test_data_path, index=False)

            logging.info("data ingestion part completed")

            return (
                config.train_data_path,
                config.test_data_path
            )

        except Exception as e:
            # logging.info() needs a message; calling it with no arguments
            # raises TypeError here and hides the real failure.
            logging.info("Exception occured in the initiate_data_ingestion")
            raise shippingException(e,sys) from e

In [8]:
import pandas as pd
import numpy as np
from src.logger import logging
from src.exception import shippingException
import os
import sys
from dataclasses import dataclass
from pathlib import Path


from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.preprocessing import LabelEncoder
from category_encoders.binary import BinaryEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.utils.utils import save_object

@dataclass
class DataTransformationConfig:
    """Configuration for the data-transformation step."""

    # Where the fitted preprocessing object is pickled. The ``str`` annotation
    # is required for @dataclass to treat this as a field (and so allow it to
    # be overridden when constructing the config).
    preprocessor_obj_file_path: str = os.path.join('artifacts','preprocessor.pkl')


class DataTransformation:
    """Clean the train/test CSVs and encode them into arrays for training.

    The fitted preprocessing object is saved with ``save_object`` to the path
    held in ``DataTransformationConfig``.
    """

    def __init__(self):
        self.data_transformation_config=DataTransformationConfig()

    def get_data_transformation(self):
        """Build the (unfitted) preprocessing ``ColumnTransformer``.

        Numerical columns are standard-scaled; categorical columns are imputed
        with their most frequent value and one-hot encoded.

        Returns:
            The unfitted ``ColumnTransformer``.

        Raises:
            shippingException: wraps any exception raised while building it.
        """
        try:
            logging.info('Data Transformation initiated')

            numerical_cols=['unit_of_measure_(per_pack)','line_item_quantity','pack_price',
                            'unit_price',
                            'line_item_insurance_(usd)','freight_cost_(usd)']

            categorical_cols=['fulfill_via','vendor_inco_term','shipment_mode','first_line_designation']

            ## Numerical Pipeline
            num_pipeline=Pipeline(
                steps=[
                ('scaler',StandardScaler())
                ])

            # Categorical Pipeline
            cat_pipeline=Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')),
                # handle_unknown='ignore': a category that only appears in the
                # test split (or at prediction time) is encoded as all zeros
                # instead of making transform() raise.
                ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
                ])

            # sparse_threshold=0 forces a dense result; the caller stacks the
            # output with the target through np.c_, which needs dense arrays.
            preprocessor=ColumnTransformer(
                [
                    ('num_pipeline',num_pipeline,numerical_cols),
                    ('cat_pipeline',cat_pipeline,categorical_cols),
                ],
                sparse_threshold=0,
            )

            return preprocessor

        except Exception as e:
            logging.info("Exception occured in the get_data_transformation")

            raise shippingException(e,sys) from e

    def _outlier_capping(self,df,col):
        """
        Method Name :   _outlier_capping

        Description :   Caps the values of ``col`` to the range
                        [Q1 - 1.5*IQR, Q3 + 1.5*IQR], modifying ``df`` in place.

        Output      :   The same DataFrame object that was passed in.
        """
        logging.info("Entered _outlier_capping method of Data_Transformation class")
        try:
            logging.info("Performing _outlier_capping for columns in the dataframe")
            percentile25 = df[col].quantile(0.25)  # calculating 25 percentile
            percentile75 = df[col].quantile(0.75)  # calculating 75 percentile

            # Calculating upper limit and lower limit
            iqr = percentile75 - percentile25
            upper_limit = percentile75 + 1.5 * iqr
            lower_limit = percentile25 - 1.5 * iqr

            # Capping the outliers. clip() upcasts integer columns when the
            # limits are fractional, which element-wise .loc assignment of a
            # float into an int column does not do cleanly.
            df[col] = df[col].clip(lower=lower_limit, upper=upper_limit)
            logging.info(
                "Performed _outlier_capping method of Data_Transformation class"
            )

            logging.info("Exited _outlier_capping method of Data_Transformation class")
            return df

        except Exception as e:
            raise shippingException(e, sys) from e

    ## Removes the irregular text entries from the freight cost data
    @staticmethod
    def _trans_freight_cost(x):
        """Normalise one raw freight-cost cell.

        * text containing "See"                      -> NaN
        * "Freight Included in Commodity Cost" or
          "Invoiced Separately"                      -> 0
        * anything else (including non-string values such as NaN or numbers)
          is returned unchanged.
        """
        # Missing cells arrive as float NaN, and an all-numeric column arrives
        # as numbers; neither has string methods, so pass them through.
        if not isinstance(x, str):
            return x
        if "See" in x:
            return np.nan
        if x in ("Freight Included in Commodity Cost", "Invoiced Separately"):
            return 0
        return x

    def initialize_data_transformation(self,train_path,test_path):
        """Read the split CSVs, clean them, fit the preprocessor and encode both.

        Args:
            train_path: path of the training CSV.
            test_path: path of the test CSV.

        Returns:
            Tuple ``(train_arr, test_arr)``: the transformed features with the
            target column ``line_item_value`` appended as the last column.

        Raises:
            shippingException: wraps any exception raised along the way.
        """
        try:
            self.train_df=pd.read_csv(train_path)
            self.test_df=pd.read_csv(test_path)

            logging.info("read train and test data complete")
            logging.info(f'Train Dataframe Head : \n{self.train_df.head().to_string()}')
            logging.info(f'Test Dataframe Head : \n{self.test_df.head().to_string()}')

            print("Columns in train_df:", self.train_df.columns)

            preprocessing_obj = self.get_data_transformation()

            target_column_name='line_item_value'

            drop_columns = [target_column_name,'id', 'project_code', 'pq_', 'po_/_so_', 'asn/dn_', 'country',
                            'managed_by','pq_first_sent_to_client_date', 'po_sent_to_vendor_date','scheduled_delivery_date', 
                            'delivered_to_client_date','delivery_recorded_date', 'product_group', 'sub_classification',
                            'vendor', 'item_description', 'molecule/test_type', 'brand', 'dosage','dosage_form',
                            'manufacturing_site','weight_(kilograms)'
                             ]

            # Clean the freight cost text and convert it to float *before*
            # imputing: the median of a column that still holds strings is not
            # a numeric median (and raises on recent pandas versions).
            self.train_df["freight_cost_(usd)"] = (
                self.train_df["freight_cost_(usd)"].apply(self._trans_freight_cost).astype("float")
            )
            self.test_df["freight_cost_(usd)"] = (
                self.test_df["freight_cost_(usd)"].apply(self._trans_freight_cost).astype("float")
            )

            # Includes the target so that it is imputed and capped as well.
            numerical_cols=['unit_of_measure_(per_pack)','line_item_quantity','line_item_value','pack_price',
                            'unit_price',
                            'line_item_insurance_(usd)','freight_cost_(usd)']

            # NOTE(review): the test split is imputed and capped with its own
            # statistics rather than the training split's — confirm intended.
            for col in numerical_cols:
                self.train_df[col] = self.train_df[col].fillna(self.train_df[col].median())
                self.test_df[col] = self.test_df[col].fillna(self.test_df[col].median())

            logging.info("NaN values are being filled") 

            # Outlier capping
            logging.info("Got a list of numerical_col")
            for col in numerical_cols:
                self._outlier_capping(self.train_df, col)
            logging.info("Outlier capped in train df")
            for col in numerical_cols:
                self._outlier_capping(self.test_df, col)
            logging.info("Outlier capped in test df")

            input_feature_train_df = self.train_df.drop(columns=drop_columns)
            target_feature_train_df=self.train_df[target_column_name]

            input_feature_test_df=self.test_df.drop(columns=drop_columns)
            target_feature_test_df=self.test_df[target_column_name]

            # Fit on the training features only; reuse the fitted object on test.
            input_feature_train_arr=preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr=preprocessing_obj.transform(input_feature_test_df)

            # Append the target as the last column of each array.
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )

            logging.info("preprocessing pickle file saved")

            return (
                train_arr,
                test_arr
            )

        except Exception as e:
            logging.info("Exception occured in the initiate_datatransformation")

            raise shippingException(e,sys) from e
    

In [9]:
import pandas as pd
import importlib
import numpy as np
from src.logger import logging
from src.exception import shippingException
import os
import sys
from dataclasses import dataclass
from pathlib import Path

from src.utils.utils import save_object
from src.utils.utils import load_object
from src.utils.utils import evaluate_model

from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

@dataclass
class ModelTrainerConfig:
    """Configuration for the model-training step."""

    # Where the selected model is pickled. The ``str`` annotation is required
    # for @dataclass to treat this as a field (and so allow it to be
    # overridden when constructing the config).
    trained_model_file_path: str = os.path.join('artifacts','model.pkl')
    
    
class ModelTrainer:
    """Evaluate several regressors and save the one with the highest score."""

    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initate_model_training(self,train_array,test_array):
        """Score the candidate models and pickle the best one.

        Args:
            train_array: 2-D array; every column but the last is a feature,
                the last column is the target.
            test_array: 2-D array laid out like ``train_array``.

        Raises:
            shippingException: wraps any exception raised during training.
        """
        try:
            logging.info('Splitting Dependent and Independent variables from train and test data')
            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]

            models={
            'LinearRegression':LinearRegression(),
            'Lasso':Lasso(),
            'Ridge':Ridge(),
            'Elasticnet':ElasticNet(),
            'Randomforest':RandomForestRegressor(n_estimators=100)
            ##'XGBOOST': XGBRegressor()
            }

            # evaluate_model returns a {model name: score} mapping.
            model_report:dict=evaluate_model(X_train,y_train,X_test,y_test,models)
            print(model_report)
            print('\n====================================================================================\n')
            logging.info(f'Model Report : {model_report}')

            # Name with the highest score; on ties the first entry wins, the
            # same as looking the maximum value up by position.
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]

            # NOTE(review): presumably evaluate_model fits the estimators in
            # `models` in place — confirm, otherwise an unfitted model is saved.
            best_model = models[best_model_name]

            print(f'Best Model Found, Model Name : {best_model_name} , R2 Score : {best_model_score}')
            print('\n====================================================================================\n')
            logging.info(f'Best Model Found , Model Name : {best_model_name} , R2 Score : {best_model_score}')

            save_object(
                 file_path=self.model_trainer_config.trained_model_file_path,
                 obj=best_model
            )

        except Exception as e:
            logging.info('Exception occured at Model Training')
            raise shippingException(e,sys) from e

In [10]:
# Clean and encode the saved train/test CSVs into arrays for training.
# `train_data_path` and `test_data_path` are not assigned in this cell; they
# must already exist in the kernel namespace.
data_transformation = DataTransformation()

train_arr, test_arr = data_transformation.initialize_data_transformation(
    train_data_path,
    test_data_path,
)

Columns in train_df: Index(['id', 'project_code', 'pq_', 'po_/_so_', 'asn/dn_', 'country',
       'managed_by', 'fulfill_via', 'vendor_inco_term', 'shipment_mode',
       'pq_first_sent_to_client_date', 'po_sent_to_vendor_date',
       'scheduled_delivery_date', 'delivered_to_client_date',
       'delivery_recorded_date', 'product_group', 'sub_classification',
       'vendor', 'item_description', 'molecule/test_type', 'brand', 'dosage',
       'dosage_form', 'unit_of_measure_(per_pack)', 'line_item_quantity',
       'line_item_value', 'pack_price', 'unit_price', 'manufacturing_site',
       'first_line_designation', 'weight_(kilograms)', 'freight_cost_(usd)',
       'line_item_insurance_(usd)'],
      dtype='object')


In [5]:
import os
import sys
from src.logger import logging
from src.exception import shippingException


from src.components.ingestion import DataIngestion
from src.components.transformation import DataTransformation
from src.components.training import ModelTrainer


In [6]:
# Score the candidate models on the arrays produced by the transformation
# cell and save the best one.
model_trainer_obj = ModelTrainer()
model_trainer_obj.initate_model_training(
    train_arr,
    test_arr,
)

{'LinearRegression': 0.9516215081504077, 'Lasso': 0.9516335003302373, 'Ridge': 0.9516256514333578, 'Elasticnet': 0.876760339653761, 'XGBOOST': 0.9530401700216486}


Best Model Found, Model Name : XGBOOST , R2 Score : 0.9530401700216486




In [118]:
# NOTE(review): `final` is not assigned anywhere in the visible cells, so this
# raises NameError on a fresh kernel — confirm whether this cell is a leftover.
final