In [41]:
import os
import sys

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer

from dataclasses import dataclass

In [43]:
def get_data_transformer_object():
    try:
        num_columns = ['FlightNumber', 'PayloadMass', 'Flights', 'Block', 'ReusedCount', 'Longitude', 'Latitude', 'Year', 'Month', 'DayOfWeek']
        cat_columns = ['BoosterVersion', 'Orbit', 'LaunchSite', 'LandingPad', 'Serial']
        bool_columns = ['GridFins', 'Reused', 'Legs']

        num_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy= 'median')),
                ("standard_scaler", StandardScaler())
            ]
        )

        cat_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy= 'most_frequent')),
                ("one_hot_encoder", OneHotEncoder(handle_unknown= 'ignore'))
            ]
        )

        bool_pipeline = Pipeline(
            steps= [
                #("to_int", FunctionTransformer(lambda x: x.astype(int))),
                ("to_int", FunctionTransformer(np.int32)),
                ("imputer", SimpleImputer(strategy= 'most_frequent'))
            ]
        )

        payload_pipeline = Pipeline(
            steps= [
                ("imputer", KNNImputer(n_neighbors= 5)),
                ("scaler", StandardScaler())
            ]
        )

        landingpad_pipeline = Pipeline(
            steps= [
                ("imputer", SimpleImputer(strategy= 'constant', fill_value= 'No Pad')),
                ("one_hot_encoder", OneHotEncoder(handle_unknown= 'ignore'))
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num_transformer", num_pipeline, [col for col in num_columns if col != 'PayloadMass']),
                ("cat_transformer", cat_pipeline, [col for col in cat_columns if col != 'LandingPad']),
                ("bool_transformer", bool_pipeline, bool_columns),
                #("bool_transformer", 'passthrough', bool_columns),
                ("payload_transformer", payload_pipeline, ['PayloadMass']),
                ("landingpad_transformer", landingpad_pipeline, ['LandingPad'])
            ]
        )

        return preprocessor

    except Exception as e:
        raise CustomException(e, sys)

In [44]:
def preprocess_date(df: pd.DataFrame):
    try:
        df['Date'] = pd.to_datetime(df['Date'], errors= 'coerce')
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df = df.drop(['Date'], axis = 1)

        return df

    except Exception as e:
        raise CustomException(e, sys)

In [45]:
train_path = r'C:\Users\HP\Documents\Projects\Landing_Prediction\artifacts\train.csv'
test_path = r'C:\Users\HP\Documents\Projects\Landing_Prediction\artifacts\test.csv'

In [46]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [47]:
train_df = preprocess_date(train_df)
test_df = preprocess_date(test_df)
print(f"train_df: {train_df.shape}")
print(f"test_df: {test_df.shape}")

train_df: (134, 19)
test_df: (34, 19)


In [48]:
target_column_name = "Outcome"
input_feature_train_df = train_df.drop(target_column_name, axis= 1)
target_feature_train_df = train_df[target_column_name]
print(f"input_feature_train_df: {input_feature_train_df.shape}")
print(f"target_feature_train_df: {target_feature_train_df.shape}")

input_feature_train_df: (134, 18)
target_feature_train_df: (134,)


In [49]:
input_feature_test_df = test_df.drop(target_column_name, axis= 1)
target_feature_test_df = test_df[target_column_name]
print(f"input_feature_test_df: {input_feature_test_df.shape}")
print(f"target_feature_test_df: {target_feature_test_df.shape}")

input_feature_test_df: (34, 18)
target_feature_test_df: (34,)


In [50]:
preprocessor_obj = get_data_transformer_object()

In [51]:
input_feature_train_array = preprocessor_obj.fit_transform(input_feature_train_df).toarray()
#input_feature_train_array = sparse_train_array.toarray()
print(f"input_feature_train_array: {input_feature_train_array.shape}")
print(f"input_feature_train_array: {input_feature_train_array.shape}")
input_feature_test_array = preprocessor_obj.transform(input_feature_test_df).toarray()
#input_feature_test_array = sparse_test_array.toarray()
print(f"input_feature_test_array: {input_feature_test_array.shape}")
print(f"input_feature_test_array: {input_feature_test_array.shape}")

input_feature_train_array: (134, 87)
input_feature_train_array: (134, 87)
input_feature_test_array: (34, 87)
input_feature_test_array: (34, 87)


In [52]:
print(f"np.array(target_feature_train_df): {np.array(target_feature_train_df).shape}")
print(f"np.array(target_feature_test_df): {np.array(target_feature_test_df).shape}")

np.array(target_feature_train_df): (134,)
np.array(target_feature_test_df): (34,)


In [53]:
input_feature_train_df.shape[0]

134

In [54]:
print(type(input_feature_train_array))
print(type(input_feature_test_array))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [29]:
input_feature_train_array = input_feature_train_array.toarray()
input_feature_test_array = input_feature_test_array.toarray()

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [55]:
print(type(input_feature_train_array))
print(type(input_feature_test_array))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [56]:
train_arr = np.c_[input_feature_train_array, np.array(target_feature_train_df)]
test_arr = np.c_[input_feature_test_array, np.array(target_feature_test_df)]

In [10]:
np.c_[np.array(input_feature_train_df), np.array(target_feature_train_df)]
np.c_[np.array(input_feature_test_df), np.array(target_feature_test_df)]

array([[155, 'Falcon 9', nan, 'SSO', 'CCSFS SLC 40', 7, True, True, True,
        'OCISLY', 5.0, 9, 'B1061', -80.577366, 28.5618571, 2022, 4, 4, 1],
       [40, 'Falcon 9', 6070.0, 'GTO', 'KSC LC 39A', 1, False, False,
        False, nan, 3.0, 0, 'B1034', -80.6039558, 28.6080585, 2017, 5, 0,
        0],
       [137, 'Falcon 9', 13260.0, 'LEO', 'CCSFS SLC 40', 9, True, True,
        True, 'OCISLY', 5.0, 13, 'B1058', -80.577366, 28.5618571, 2021,
        11, 5, 1],
       [39, 'Falcon 9', nan, 'LEO', 'KSC LC 39A', 1, True, False, True,
        'LZ-2', 3.0, 1, 'B1032', -80.6039558, 28.6080585, 2017, 5, 0, 1],
       [160, 'Falcon 9', 13260.0, 'VLEO', 'CCSFS SLC 40', 6, True, True,
        True, 'OCISLY', 5.0, 8, 'B1062', -80.577366, 28.5618571, 2022, 4,
        4, 1],
       [179, 'Falcon 9', 13260.0, 'VLEO', 'VAFB SLC 4E', 10, True, True,
        True, 'LZ-1', 5.0, 9, 'B1061', -120.610829, 34.632093, 2022, 8,
        4, 1],
       [182, 'Falcon 9', 13260.0, 'VLEO', 'VAFB SLC 4E', 7, True

In [57]:
preprocessor_obj

0,1,2
,transformers,"[('num_transformer', ...), ('cat_transformer', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<class 'numpy.int32'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'No Pad'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [58]:
import joblib
import os

In [63]:
preprocessor_path = r'C:\Users\HP\Documents\Projects\Landing_Prediction\artifacts\preprocessor.pkl'

with open(preprocessor_path, 'rb') as file_obj:
    prep = joblib.load(file_obj)

In [64]:
prep

0,1,2
,transformers,"[('num_transformer', ...), ('cat_transformer', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<class 'numpy.int32'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'No Pad'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [None]:
def get_data_transformer_object():
    try:
        num_columns = ['FlightNumber', 'PayloadMass', 'Flights', 'Block', 'ReusedCount', 'Longitude', 'Latitude', 'Year', 'Month', 'DayOfWeek']
        cat_columns = ['BoosterVersion', 'Orbit', 'LaunchSite', 'LandingPad', 'Serial']
        bool_columns = ['GridFins', 'Reused', 'Legs']

        num_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy= 'median')),
                ("standard_scaler", StandardScaler())
            ]
        )

        cat_pipeline = Pipeline(
            steps=[
                ("imputer", SimpleImputer(strategy= 'most_frequent')),
                ("one_hot_encoder", OneHotEncoder(handle_unknown= 'ignore'))
            ]
        )

        bool_pipeline = Pipeline(
            steps= [
                ("to_int", FunctionTransformer(lambda x: x.astype(int))),
                ("imputer", SimpleImputer(strategy= 'most_frequent'))
            ]
        )

        payload_pipeline = Pipeline(
            steps= [
                ("imputer", KNNImputer(n_neighbors= 5)),
                ("scaler", StandardScaler())
            ]
        )

        landingpad_pipeline = Pipeline(
            steps= [
                ("imputer", SimpleImputer(strategy= 'constant', fill_value= 'No Pad')),
                ("one_hot_encoder", OneHotEncoder(handle_unknown= 'ignore'))
            ]
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("num_transformer", num_pipeline, num_columns),
                ("cat_transformer", cat_pipeline, cat_columns),
                #("bool_transformer", bool_pipeline, bool_columns),
                #("bool_transformer", 'passthrough', bool_columns),
                #("payload_transformer", payload_pipeline, ['PayloadMass']),
                #("landingpad_transformer", landingpad_pipeline, ['LandingPad'])
            ]
        )

        return preprocessor

    except Exception as e:
        raise CustomException(e, sys)

In [None]:
input_feature_train_array.shape, np.array(target_feature_train_df).reshape(-1,1).shape

In [None]:
target_train_array = np.array(target_feature_train_df).reshape(-1, 1)
target_test_array = np.array(target_feature_test_df).reshape(-1, 1)

In [None]:
train_arr = np.c_[input_feature_train_array, target_train_array]
test_arr = np.c_[input_feature_test_array, target_test_array]

In [27]:
import sklearn
print(sklearn.__version__)

1.7.1
