In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Read data

In [5]:
# Load the raw train and test tables from the datasets folder next to this notebook's directory.
df_train, df_test = (
    pd.read_csv('../datasets/train.csv'),
    pd.read_csv('../datasets/test.csv'),
)

In [6]:
def convert_passengerID_2_index(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame that uses the ``PassengerId`` column as its row index.

    The input frame is left untouched: ``set_index`` builds a new frame in
    which ``PassengerId`` is the index and no longer a regular column.

    Raises:
        KeyError: if ``df`` has no ``PassengerId`` column.
    """
    return df.set_index("PassengerId")

In [7]:
# Cleansing chain for the training table; currently its only step re-indexes by PassengerId.
df_train_cleansed = df_train.pipe(convert_passengerID_2_index)

In [None]:
# Cleansing chain for the test table; currently its only step re-indexes by PassengerId.
df_test_cleansed = df_test.pipe(convert_passengerID_2_index)

# Split data into X and y

In [17]:
def split_data_2_x_y(df: pd.DataFrame, target: str = "Transported"):
    """Split a frame into a feature frame and an optional target series.

    Args:
        df: Table that may or may not contain the target column.
        target: Name of the label column. Defaults to ``"Transported"``,
            so existing one-argument calls behave as before.

    Returns:
        A ``(X, y)`` tuple. ``X`` holds every column of ``df`` except
        ``target``, in the original order. ``y`` is the ``target`` column,
        or ``None`` when ``df`` has no such column (e.g. an unlabeled
        test set).
    """
    feature_columns = [col for col in df.columns if col != target]
    features = df[feature_columns]
    # An unlabeled frame simply has no target to return.
    labels = df[target] if target in df.columns else None
    return features, labels

In [48]:
# Separate features from the target, then hold out 20% of the rows for validation.
# The fixed random_state keeps the split reproducible across runs.
X, y = split_data_2_x_y(df_train_cleansed)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Keep only the feature frame. Indexing the returned tuple avoids binding `_`,
# which IPython otherwise uses to hold the most recent cell output.
X_test = split_data_2_x_y(df_test_cleansed)[0]

In [50]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2513_01,Earth,False,,TRAPPIST-1e,28.0,False,0.0,55.0,0.0,656.0,0.0,Loree Mathison
2774_02,Earth,False,F/575/P,TRAPPIST-1e,17.0,False,0.0,1195.0,31.0,0.0,0.0,Crisey Mcbriddley
8862_04,Europa,True,C/329/S,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0,Alramix Myling
8736_02,Mars,False,F/1800/P,TRAPPIST-1e,20.0,False,,2.0,289.0,976.0,0.0,Tros Pota
0539_02,Europa,True,C/18/P,55 Cancri e,36.0,False,0.0,0.0,0.0,0.0,0.0,Achyon Nalanet


# Data pipeline

In [51]:
# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("Fill N/A value", SimpleImputer(strategy="median")),
        ("standardize", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
cate_pipeline = Pipeline(
    steps=[
        ("Fill N/A value", SimpleImputer(strategy="most_frequent")),
        ("One Hot Encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [33]:
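# Alternative kept for reference: explicit column lists instead of the
# make_column_selector calls used in the ColumnTransformer.
# num_attribs = X_train.select_dtypes(include="float").columns
# cat_attribs = X_train.select_dtypes(include="object").columns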

In [52]:
# Route each column to a branch by dtype: numeric columns go to num_pipeline,
# object columns go to cate_pipeline.
# NOTE(review): the object selector also matches Cabin and Name (both object
# dtype in the info listing), so those near-unique strings are one-hot encoded
# as well — consider dropping or engineering them.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
        ("cat", cate_pipeline, make_column_selector(dtype_include=object)),
    ]
)

In [41]:
# Show column dtypes and non-null counts of the training features.
# NOTE(review): the saved output lists 8693 rows and this cell's execution count
# is lower than the train/validation split cell's, so the output appears to
# predate the split — re-run to refresh it.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
dtypes: float64(6), object(6)
memory usage: 882.9+ KB


# Train the model

In [53]:
# Chain the preprocessing step with a seeded random forest, then fit on the
# training split. The fit call is left as the last expression so the cell
# displays the fitted pipeline.
rnd_clf = make_pipeline(
    preprocessing,
    RandomForestClassifier(random_state=42),
)
rnd_clf.fit(X_train, y_train)

# Evaluate the performance

## Evaluate the performance on the training set

In [57]:
# Score the same rows the model was fitted on, using column 1 of predict_proba
# (presumably the Transported=True class — confirm via rnd_clf.classes_).
# This is an optimistic figure: it measures fit, not generalization.
y_proba = rnd_clf.predict_proba(X_train)[:, 1]
auc_roc = roc_auc_score(y_true=y_train, y_score=y_proba)

# Report the training AUC-ROC
print('AUC-ROC score:', auc_roc)

AUC-ROC score: 1.0


## Evaluate the performance on the validation set

In [58]:
# Score the held-out validation rows, using column 1 of predict_proba
# (presumably the Transported=True class — confirm via rnd_clf.classes_).
y_proba = rnd_clf.predict_proba(X_val)[:, 1]
auc_roc = roc_auc_score(y_true=y_val, y_score=y_proba)

# Report the validation AUC-ROC
print('AUC-ROC score:', auc_roc)

AUC-ROC score: 0.8549780543363521


# Output data

In [60]:
# Predict class labels for the test features.
# NOTE: despite its name, y_test holds model predictions, not ground-truth labels.
y_test = rnd_clf.predict(X_test)

In [68]:
# One-column frame of predictions, keyed by the test set's index.
output = pd.DataFrame({'Transported': y_test}, index=X_test.index)

In [70]:
# Write the predictions to submission.csv in the current working directory.
# The frame's index is written as the first column (to_csv's default).
output.to_csv('submission.csv')