In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix
import wandb

In [3]:
data_path_1 = "https://raw.githubusercontent.com/faiz-yah/End-to-end-ML/refs/heads/main/spaceship_titanic/dataset/spaceship-titanic/test.csv"
data_path_2 = "https://raw.githubusercontent.com/faiz-yah/End-to-end-ML/refs/heads/main/spaceship_titanic/dataset/spaceship-titanic/train.csv"

df_1 = pd.read_csv(data_path_1)
df_2 = pd.read_csv(data_path_2)

df = pd.concat([df_1, df_2])

df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Pre-procesing

In [5]:
########### Select Columns ###########
def selected_columsn(df, numerical_cols, categorical_cols,target_col):
    return df[numerical_cols + categorical_cols + [target_col]]

########### Imputation of Missing Values ###########
## Nummerical Features
def impute_numerical(df, numerical_cols):
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
    return df

## Categorical Features
def impute_categorical(df, categorical_cols):
    df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])
    return df

########### Log Transformation for Long-tailed Distribution ###########
def log_transform(df, numerical_cols):
    log_transformer = FunctionTransformer(func=np.log1p, validate=True)
    df[numerical_cols] = log_transformer.fit_transform(df[numerical_cols])
    return df

########### Standardised Numerical Values ###########
def standardize_numerical(df, numerical_cols):
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df


########### Remove rows without Target ###########
def remove_rows_without_target(df, target_col):
    df = df.dropna(subset=[target_col])
    return df

########### One hot encoding - Categorical  ###########
def encode_categorical(df, encode_categorical_cols):
    df = pd.get_dummies(df, columns=encode_categorical_cols, drop_first=True)
    return df

########### Target Mapping ###########
def map_target(df, target_col, map_dict):
    df[target_col] = df[target_col].map(map_dict)
    return df

########### Integrated Pre-processing Pipeline ###########
def pre_processing_integrated(df, numerical_cols, categorical_cols, encode_categorical_cols, target_col, map_dict):
    df = selected_columsn(df, numerical_cols, categorical_cols, target_col)
    df = remove_rows_without_target(df, target_col)
    df = impute_numerical(df, numerical_cols)
    df = impute_categorical(df, categorical_cols)
    df = log_transform(df, numerical_cols)
    df = standardize_numerical(df, numerical_cols)
    df = encode_categorical(df, encode_categorical_cols)
    df = map_target(df, target_col, map_dict)
    return df

In [6]:
df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [7]:
df_1 = df.copy()

categorical_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
encode_categorical_cols = ["HomePlanet", "Destination"]
numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
target_col = "Transported"

map_dict = {True: 1, False: 0}

df_preprocessed = pre_processing_integrated(df_1, numerical_cols, categorical_cols,encode_categorical_cols, target_col, map_dict)
df_preprocessed

  df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CryoSleep,VIP,Transported,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0.640123,-0.638181,-0.650080,-0.622995,-0.664035,-0.640034,False,False,0,True,False,False,True
1,0.006901,1.090491,0.135040,0.646081,1.614565,0.745894,False,False,1,False,False,False,True
2,1.163749,0.753511,2.139858,-0.622995,2.518191,0.784254,False,True,0,True,False,False,True
3,0.421166,-0.638181,1.790516,1.682500,2.264863,1.277886,False,False,0,True,False,False,True
4,-0.512690,1.464342,0.803380,1.333879,1.624920,-0.240051,False,False,1,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.705856,-0.638181,2.359900,-0.622995,2.009974,0.931876,False,True,0,True,False,False,False
8689,-0.362839,-0.638181,-0.650080,-0.622995,-0.664035,-0.640034,True,False,0,False,False,True,False
8690,0.110589,-0.638181,-0.650080,2.312112,-0.413730,-0.640034,False,False,1,False,False,False,True
8691,0.380946,-0.638181,1.721915,-0.622995,1.455450,2.302492,False,False,0,True,False,False,False


In [8]:
df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        8693 non-null   float64
 1   RoomService                8693 non-null   float64
 2   FoodCourt                  8693 non-null   float64
 3   ShoppingMall               8693 non-null   float64
 4   Spa                        8693 non-null   float64
 5   VRDeck                     8693 non-null   float64
 6   CryoSleep                  8693 non-null   bool   
 7   VIP                        8693 non-null   bool   
 8   Transported                8693 non-null   int64  
 9   HomePlanet_Europa          8693 non-null   bool   
 10  HomePlanet_Mars            8693 non-null   bool   
 11  Destination_PSO J318.5-22  8693 non-null   bool   
 12  Destination_TRAPPIST-1e    8693 non-null   bool   
dtypes: bool(6), float64(6), int64(1)
memory usage: 594.2 

# Training 

In [None]:
def train_test_split_data(df, target_col, test_size=0.2, random_state=42):
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, model_type='random_forest'):
    if model_type == 'random_forest':
        model = RandomForestClassifier(random_state=42)
    elif model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=1000, random_state=42)
    else:
        raise ValueError("Unsupported model type. Choose 'random_forest' or 'logistic_regression'.")
    
    model.fit(X_train, y_train)
    return model

def test_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return y_pred, accuracy, precision, recall, class_report, conf_matrix

In [10]:
X_train, X_test, y_train, y_test = train_test_split_data(df_preprocessed, target_col)


In [11]:
X_train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CryoSleep,VIP,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
2333,0.206863,-0.638181,0.722457,-0.622995,1.678759,-0.640034,False,False,False,False,False,True
2589,-0.435682,-0.638181,1.766308,0.726959,-0.664035,-0.640034,False,False,False,False,False,True
8302,0.206863,-0.638181,-0.650080,-0.622995,-0.664035,-0.640034,True,False,True,False,False,False
8177,-0.228000,-0.638181,-0.275483,1.585506,1.822050,-0.640034,False,False,False,True,False,True
500,0.535087,-0.638181,-0.650080,-0.622995,-0.664035,-0.640034,True,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
5734,-0.362839,0.357745,-0.275483,1.315515,1.652547,-0.640034,False,False,False,False,False,True
5191,0.967436,1.766318,-0.650080,0.714593,1.732772,1.566819,False,False,False,True,False,True
5390,-0.105436,1.225984,-0.650080,1.779342,-0.664035,0.559913,False,False,False,False,True,False
860,0.460220,1.546406,-0.650080,2.257267,-0.664035,-0.640034,False,False,False,True,False,True


In [12]:
model_rf = train_model(X_train, y_train, model_type='random_forest')

model_lg = train_model(X_train, y_train, model_type='logistic_regression')

In [13]:
test_model_rf = test_model(model_rf, X_test, y_test)

test_model_lg = test_model(model_lg, X_test, y_test)

In [16]:
import wandb

run_rf = wandb.init(
    entity="faizyah-personal",
    project="end-to-end-ml-classification",
    name = "Random Forest",
    config={
        "model_type": "RandomForest",
        "n_estimators": 100,
        "min_samples_split": 2,
        "min_samples_leaf": 1
    },
)

run_rf.log(
    {
        "Random Forest Accuracy": test_model_rf[1],
        "Random Forest Classification Report": test_model_rf[2],
        "Random Forest Confusion Matrix": test_model_rf[3],
    }
)

run_rf.finish()

0,1
Random Forest Accuracy,▁

0,1
Random Forest Accuracy,0.77573
Random Forest Classification Report,precis...


In [17]:
import wandb

run_rf = wandb.init(
    entity="faizyah-personal",
    project="end-to-end-ml-classification",
    name = "Random Forest",
    config={
        "model_type": "RandomForest",
        "n_estimators": 200,
        "min_samples_split": 3,
        "min_samples_leaf": 2
    },
)

run_rf.log(
    {
        "Random Forest Accuracy": test_model_rf[1],
        "Random Forest Classification Report": test_model_rf[2],
        "Random Forest Confusion Matrix": test_model_rf[3],
    }
)

run_rf.finish()

0,1
Random Forest Accuracy,▁

0,1
Random Forest Accuracy,0.77573
Random Forest Classification Report,precis...
