In [322]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV

# read data

In [323]:
df_train = pd.read_csv('../datasets/train.csv')
df_test = pd.read_csv('../datasets/test.csv')

In [324]:
def convert_passengerID_2_index(df: pd.DataFrame):

    df = df.set_index("PassengerId")
    return df

def split_carbin(df: pd.DataFrame):

    for key, value in enumerate(['deck', 'num', 'side']):
        df[f'Carbin_{value}'] = df['Cabin'].apply(lambda x: str(x).split("/")[key] if len(str(x).split("/")) >= key+1 else None)

    df = df.drop('Cabin', axis=1)
    return df

def drop_columns(df: pd.DataFrame, columns:list):
    
    df = df.drop(columns=columns, axis=1)

    return df

In [325]:
df_train_cleansed = (
    df_train.pipe(convert_passengerID_2_index)
    .pipe(split_carbin)
    .pipe(drop_columns, ['Name', 'Carbin_num'])
)

In [326]:
df_test_cleansed = (
    df_test.pipe(convert_passengerID_2_index)
    .pipe(split_carbin)
    .pipe(drop_columns, ['Name', 'Carbin_num'])
)

# split data into X and y

In [327]:
def split_data_2_x_y(df:pd.DataFrame):

    columns = df.columns 
    X_columns = [i for i in columns if i != 'Transported']
    y_column = 'Transported' if 'Transported' in columns else None 

    if y_column:
        return df[X_columns], df[y_column]
    else:
        return df[X_columns], None

In [328]:
X, y = split_data_2_x_y(df_train_cleansed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [329]:
X_test, _ = split_data_2_x_y(df_test_cleansed)

In [330]:
X_train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Carbin_deck,Carbin_side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2513_01,Earth,False,TRAPPIST-1e,28.0,False,0.0,55.0,0.0,656.0,0.0,,
2774_02,Earth,False,TRAPPIST-1e,17.0,False,0.0,1195.0,31.0,0.0,0.0,F,P
8862_04,Europa,True,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0,C,S
8736_02,Mars,False,TRAPPIST-1e,20.0,False,,2.0,289.0,976.0,0.0,F,P
0539_02,Europa,True,55 Cancri e,36.0,False,0.0,0.0,0.0,0.0,0.0,C,P


# data pipeline

In [331]:
# create datapipeline functions

In [332]:
num_pipeline = Pipeline([
    ("Fill N/A value", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler())
])

cate_pipeline = Pipeline([
    ("Fill N/A value", SimpleImputer(strategy="most_frequent")),
    ("One Hot Encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [333]:
# num_attribs = X_train.select_dtypes(include="float").columns
# cat_attribs = X_train.select_dtypes(include="object").columns

In [334]:
preprocessing = ColumnTransformer([
    ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
    ("cat", cate_pipeline, make_column_selector(dtype_include=object))
])

In [335]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6954 entries, 2513_01 to 7775_01
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    6786 non-null   object 
 1   CryoSleep     6777 non-null   object 
 2   Destination   6815 non-null   object 
 3   Age           6806 non-null   float64
 4   VIP           6792 non-null   object 
 5   RoomService   6828 non-null   float64
 6   FoodCourt     6814 non-null   float64
 7   ShoppingMall  6789 non-null   float64
 8   Spa           6820 non-null   float64
 9   VRDeck        6803 non-null   float64
 10  Carbin_deck   6954 non-null   object 
 11  Carbin_side   6796 non-null   object 
dtypes: float64(6), object(6)
memory usage: 706.3+ KB


# Train the model

In [336]:
rnd_clf = make_pipeline(preprocessing, RandomForestClassifier(random_state=42))
rnd_clf.fit(X_train, y_train)

# Tunning the model

In [337]:
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestClassifier(random_state=42))
])

# param_grid = [
#     {'preprocessing__geo__n_clusters': [5, 8, 10],
#     'random_forest__max_features': [4, 6, 8]},
#     {'preprocessing__geo__n_clusters': [10, 15],
#      'random_forest__max_features': [6, 8, 10]}
# ]
param_grid = [
    {
    'random_forest__max_features': [6, 8, 10],
    'random_forest__n_estimators': [50, 100, 200, 500],
    'random_forest__max_depth': [5, 10, None],
    'random_forest__min_samples_split': [2, 5, 10],
    'random_forest__min_samples_leaf': [1, 2, 4],
    # 'random_forest__max_features': ['auto', 'sqrt']
    }
]
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,
                           scoring="f1", #"roc_auc", 
                           n_jobs=-1)
grid_search.fit(X_train, y_train)

In [338]:
print('Best hyperparameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

Best hyperparameters: {'random_forest__max_depth': 10, 'random_forest__max_features': 10, 'random_forest__min_samples_leaf': 4, 'random_forest__min_samples_split': 10, 'random_forest__n_estimators': 100}
Best score: 0.8082967932289472


In [339]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by='mean_test_score', ascending=False, inplace=True)
cv_res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_random_forest__max_depth,param_random_forest__max_features,param_random_forest__min_samples_leaf,param_random_forest__min_samples_split,param_random_forest__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
213,0.409477,0.004538,0.025432,0.003778,10,10,4,10,100,"{'random_forest__max_depth': 10, 'random_fores...",0.802907,0.80809,0.813894,0.808297,0.004488,1
215,1.997122,0.015582,0.104016,0.00693,10,10,4,10,500,"{'random_forest__max_depth': 10, 'random_fores...",0.803076,0.809626,0.811805,0.808169,0.00371,2
179,1.601594,0.030335,0.101227,0.009432,10,8,4,10,500,"{'random_forest__max_depth': 10, 'random_fores...",0.803754,0.80809,0.8125,0.808115,0.00357,3
178,0.656142,0.028469,0.047109,0.00871,10,8,4,10,200,"{'random_forest__max_depth': 10, 'random_fores...",0.803579,0.806882,0.812339,0.8076,0.003612,4
166,0.662085,0.024392,0.06179,0.005869,10,8,2,10,200,"{'random_forest__max_depth': 10, 'random_fores...",0.798473,0.809318,0.813328,0.80704,0.006275,5


# Evaluate the performance

## Evaluate the performance on the training set

In [340]:
y_proba = rnd_clf.predict_proba(X_train)[:, 1]
y_pred = rnd_clf.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9891421953842335
F1 score: 0.9510861948142958


In [341]:
y_proba = grid_search.predict_proba(X_train)[:, 1]
y_pred = grid_search.predict(X_train)
auc_roc = roc_auc_score(y_train, y_proba)
f1 = f1_score(y_train, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.9327657788071801
F1 score: 0.8453549580310143


In [342]:
X_train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Carbin_deck,Carbin_side
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2513_01,Earth,False,TRAPPIST-1e,28.0,False,0.0,55.0,0.0,656.0,0.0,,
2774_02,Earth,False,TRAPPIST-1e,17.0,False,0.0,1195.0,31.0,0.0,0.0,F,P
8862_04,Europa,True,55 Cancri e,28.0,False,0.0,0.0,0.0,0.0,0.0,C,S
8736_02,Mars,False,TRAPPIST-1e,20.0,False,,2.0,289.0,976.0,0.0,F,P
0539_02,Europa,True,55 Cancri e,36.0,False,0.0,0.0,0.0,0.0,0.0,C,P


### Analyse Features

In [343]:
len(grid_search_model["preprocessing"].get_feature_names_out())

26

In [344]:
grid_search_model = grid_search.best_estimator_
# feature_importances = grid_search_model
feature_importances = grid_search_model["random_forest"].feature_importances_
feature_importances.round(4)

sorted(zip(feature_importances, grid_search_model["preprocessing"].get_feature_names_out()), reverse=True)[-10:]

[(0.005624680068655777, 'cat__HomePlanet_Mars'),
 (0.004081404211267831, 'cat__Carbin_deck_B'),
 (0.002768466598281784, 'cat__Destination_PSO J318.5-22'),
 (0.0016715543953473861, 'cat__Carbin_deck_D'),
 (0.001150131133157336, 'cat__Carbin_deck_A'),
 (0.0007725036623994828, 'cat__Carbin_deck_nan'),
 (0.0006795646428000571, 'cat__Carbin_side_None'),
 (0.0003724542853319606, 'cat__VIP_True'),
 (0.00033113219493927386, 'cat__VIP_False'),
 (0.0, 'cat__Carbin_deck_T')]

## Evaluate the performance on the validation set

In [345]:
# randome forest tree
y_proba = rnd_clf.predict_proba(X_val)[:, 1]
y_pred = rnd_clf.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8596218308424542
F1 score: 0.7821552723059096


In [346]:
# gridsearchCV
y_proba = grid_search.predict_proba(X_val)[:, 1]
y_pred = grid_search.predict(X_val)
auc_roc = roc_auc_score(y_val, y_proba)
f1 = f1_score(y_val, y_pred)

# Print the AUC-ROC score
print('AUC-ROC score:', auc_roc)
print('F1 score:', f1)

AUC-ROC score: 0.8808842554745104
F1 score: 0.8004496908375494


# output data

In [347]:
y_test = rnd_clf.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.7912

In [348]:
y_test = grid_search.predict(X_test)
output = pd.DataFrame(y_test, index=X_test.index, columns=['Transported'])
output.to_csv('submission.csv') # 0.7877

In [349]:
X_train.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Carbin_deck',
       'Carbin_side'],
      dtype='object')