In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer, SimpleImputer
from category_encoders import HashingEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from lightgbm import LGBMClassifier


%matplotlib inline

In [166]:
train_data = pd.read_csv('train.csv', index_col='PassengerId')
train_data.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Checking null value

In [143]:
#Checking null value 
train_data.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [167]:
# Split "Cabin" column and drop "Name" column
def split_column(df):

    # Split column "Cabin" into 3 columns "CabinDeck", "CabinNum", "CabinSide"
    df[['CabinDeck', 'CabinNum', 'CabinSide']] = df.Cabin.str.split('/', expand = True)

    # Drop 2 columns "Name" and "Cabin"
    df = df.drop(['Name', 'Cabin'], axis='columns')

    return df

In [168]:
train_data = split_column(train_data)

In [169]:
train_data.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [170]:
train_data['Transported'] = list(map(int, train_data['Transported']))

In [171]:
train_data.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0,B,0,P
0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1,F,0,S
0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S
0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S
0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1,F,1,S


In [172]:
# Encoding other categorical variables using Ordinal Encoder
def encoder_cat(df):
    cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinNum', 'CabinSide']
    encode = OrdinalEncoder()
    df[cat_cols] = encode.fit_transform(df[cat_cols])
    
    return df

In [173]:
train_data = encoder_cat(train_data)
train_data.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,CabinDeck,CabinNum,CabinSide
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001_01,1.0,0.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0
0002_01,0.0,0.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1,5.0,0.0,1.0
0003_01,1.0,0.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0,0.0,0.0,1.0
0003_02,1.0,0.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0,0.0,0.0,1.0
0004_01,0.0,0.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1,5.0,1.0,1.0


In [174]:
# create new variable "Total Expense"
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_data['TotalExpense'] = train_data[col_to_sum].sum(axis=1)

In [175]:
# Determine missing values
miss_cols = train_data.isnull().sum().sort_values(ascending=False)
miss_cols = list(miss_cols[miss_cols>1].index)
miss_cols

['CryoSleep',
 'ShoppingMall',
 'VIP',
 'HomePlanet',
 'CabinDeck',
 'CabinNum',
 'CabinSide',
 'VRDeck',
 'FoodCourt',
 'Spa',
 'Destination',
 'RoomService',
 'Age']

In [176]:
# Impute missing values by SimpleImputer (mean)
tf = ColumnTransformer([("imp", SimpleImputer(strategy='mean'), miss_cols)])
train_data[miss_cols] = tf.fit_transform(train_data[miss_cols])

In [177]:
# Check missing values again
train_data.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
CabinDeck       0
CabinNum        0
CabinSide       0
TotalExpense    0
dtype: int64

# Data training

In [178]:
labels_training = train_data.pop('Transported')
X_train, X_valid, y_train, y_valid = train_test_split(train_data, labels_training, random_state=42)

In [179]:
# function to train model and get accuracy
def predict_acc(model, verbose=None):
    if verbose == None:
        model = model()
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)
        accuracy = metrics.accuracy_score(y_valid, predictions)
        print(f'Accuracy of {str(model)} is: ', accuracy)
        print(f'Cross validation score of {str(model)} is: ',cross_val_score(model, X_valid, y_valid, cv=5).mean())
    else:
        model = model(verbose=verbose)
        model.fit(X_train, y_train)
        predictions = model.predict(X_valid)
        accuracy = metrics.accuracy_score(y_valid, predictions)
        print(f'Accuracy of {str(model)} is: ', accuracy)
        print(f'Cross validation score of {str(model)} is: ',cross_val_score(model, X_valid, y_valid, cv=5).mean())

In [180]:
predict_acc(SVC, None)

Accuracy of SVC() is:  0.7819687212511499
Cross validation score of SVC() is:  0.7713989088405107


In [181]:
predict_acc(KNeighborsClassifier, None)

Accuracy of KNeighborsClassifier() is:  0.7511499540018399
Cross validation score of KNeighborsClassifier() is:  0.7649663647438953


In [182]:
predict_acc(DecisionTreeClassifier, None)

Accuracy of DecisionTreeClassifier() is:  0.7410303587856486
Cross validation score of DecisionTreeClassifier() is:  0.7382806292706182


In [183]:
predict_acc(GradientBoostingClassifier, None)

Accuracy of GradientBoostingClassifier() is:  0.7907083716651334
Cross validation score of GradientBoostingClassifier() is:  0.7829026961173791


In [184]:
predict_acc(RandomForestClassifier, None)

Accuracy of RandomForestClassifier() is:  0.7907083716651334
Cross validation score of RandomForestClassifier() is:  0.7810519624980137


In [185]:
from xgboost import XGBClassifier
predict_acc(XGBClassifier, None)

Accuracy of XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...) is:  0.7925482980680773
Cross validation score of XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,

In [186]:
predict_acc(LGBMClassifier)

Accuracy of LGBMClassifier() is:  0.797608095676173
Cross validation score of LGBMClassifier() is:  0.7838105831876687


In [187]:
predict_acc(CatBoostClassifier, verbose=False)

Accuracy of <catboost.core.CatBoostClassifier object at 0x0000028F8BAB8490> is:  0.797148114075437
Cross validation score of <catboost.core.CatBoostClassifier object at 0x0000028F8BAB8490> is:  0.7925621060437524


In [53]:
# Backward feature selection
model_fs = CatBoostClassifier(verbose=False)
sf = SequentialFeatureSelector(model_fs, scoring='accuracy', direction = 'backward', n_features_to_select='auto', tol=None)
sf.fit(train_data,labels_training)

In [54]:
best_features = list(sf.get_feature_names_out())
best_features

['CryoSleep',
 'RoomService',
 'Spa',
 'VRDeck',
 'CabinDeck',
 'CabinSide',
 'TotalExpense']

In [188]:
best_features

['CryoSleep',
 'RoomService',
 'Spa',
 'VRDeck',
 'CabinDeck',
 'CabinSide',
 'TotalExpense']

# Testing Data

In [191]:
X_test_data = pd.read_csv('test.csv', index_col='PassengerId')
# For writing to submission file
PassengerIdTest = X_test_data.index

In [192]:
X_test_data = split_column(X_test_data)

In [193]:
X_test_data = encoder_cat(X_test_data)

In [194]:
X_test_data['TotalExpense']= X_test_data[col_to_sum].sum(axis=1)

In [195]:
X_test_data[miss_cols] = tf.fit_transform(X_test_data[miss_cols])

In [196]:
X_test_data.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,CabinNum,CabinSide,TotalExpense
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0013_01,0.0,1.0,2.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,820.0,1.0,0.0
0018_01,0.0,0.0,2.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,5.0,927.0,1.0,2832.0
0019_01,1.0,1.0,0.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0
0021_01,1.0,0.0,2.0,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,2.0,1.0,1.0,7418.0
0023_01,0.0,0.0,2.0,20.0,0.0,10.0,0.0,635.0,0.0,0.0,5.0,1029.0,1.0,645.0


In [197]:
model = CatBoostClassifier(verbose=False)
model.fit(train_data[best_features], labels_training)
prediction = model.predict(X_test_data[best_features])

In [198]:
res = pd.DataFrame(
        {
            'PassengerId': list(PassengerIdTest),
            'Transported': [(p == 1) for p in list(prediction)]
        }
    )
res.to_csv('submission.csv', index=False)