In [144]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split

In [145]:
dataset_df = pd.read_csv('train.csv')

In [146]:
dataset_df = dataset_df.drop(['PassengerId', 'Name'], axis=1)

Fill the dataset with 0s for the missing values in these columns

In [147]:
dataset_df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']] = (
            dataset_df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']].fillna(value=0))

Fill the missing values in Age with the mean

In [148]:
dataset_df['Age'] = dataset_df['Age'].fillna(dataset_df['Age'].mean())

Remove the rows with missing values in categorical columns

In [149]:
dataset_df = dataset_df.dropna()

Convert the booleans to integers cause the model can't handle booleans

In [150]:
dataset_df['Transported'] = dataset_df['Transported'].astype(int)
dataset_df['VIP'] = dataset_df['VIP'].astype(int)
dataset_df['CryoSleep'] = dataset_df['CryoSleep'].astype(int)

Split the Cabin column into 3 different columns

In [151]:
dataset_df[['Deck', 'Cabin_num', 'Side']] = dataset_df['Cabin'].str.split("/", expand=True)

The deck and the side are character columns, so we need to convert them to numeric

In [169]:
dataset_df['Deck'] = pd.Categorical(dataset_df['Deck'])
dataset_df['Deck'] = dataset_df['Deck'].cat.codes
dataset_df['Side'] = pd.Categorical(dataset_df['Side'])
dataset_df['Side'] = dataset_df['Side'].cat.codes
dataset_df['Cabin_num'] = pd.Categorical(dataset_df['Cabin_num'])
dataset_df['Cabin_num'] = dataset_df['Cabin_num'].cat.codes

Convert Home Planet and Destination to numbers

In [153]:
dataset_df['HomePlanet'] = pd.Categorical(dataset_df['HomePlanet'])
dataset_df['HomePlanet'] = dataset_df['HomePlanet'].cat.codes

dataset_df['Destination'] = pd.Categorical(dataset_df['Destination'])
dataset_df['Destination'] = dataset_df['Destination'].cat.codes

Drop the original Cabin column

In [154]:
dataset_df = dataset_df.drop(['Cabin'], axis=1)

In [155]:
dataset_df.head(25)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,0,0
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1,5,0,1
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,0,0,1
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,0,0,1
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1,5,1,1
5,0,0,1,44.0,0,0.0,483.0,0.0,291.0,0.0,1,5,0,0
6,0,0,2,26.0,0,42.0,1539.0,3.0,0.0,0.0,1,5,2,1
7,0,1,2,28.0,0,0.0,0.0,0.0,0.0,0.0,1,6,0,1
8,0,0,2,35.0,0,0.0,785.0,17.0,216.0,0.0,1,5,3,1
9,1,1,0,14.0,0,0.0,0.0,0.0,0.0,0.0,1,1,1,0


In [156]:
# Split the dataset into training and testing dataset
X = dataset_df.drop('Transported', axis=1)
y = dataset_df['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Random Forest Classifier

In [157]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)
rfc.fit(X_train, y_train)

In [158]:
rfc_pred = rfc.predict(X_test)

In [159]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, rfc_pred))

[[1018  199]
 [ 285  936]]


In [160]:
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

           0       0.78      0.84      0.81      1217
           1       0.82      0.77      0.79      1221

    accuracy                           0.80      2438
   macro avg       0.80      0.80      0.80      2438
weighted avg       0.80      0.80      0.80      2438


# Logistic Regression

In [161]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=2500, random_state=42)
logmodel.fit(X_train, y_train)

Confusion matrix

In [162]:
logmodel_pred = logmodel.predict(X_test)
print(confusion_matrix(y_test, logmodel_pred))

[[916 301]
 [240 981]]


classification report

In [163]:
print(classification_report(y_test, logmodel_pred))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77      1217
           1       0.77      0.80      0.78      1221

    accuracy                           0.78      2438
   macro avg       0.78      0.78      0.78      2438
weighted avg       0.78      0.78      0.78      2438


# Voting

In [164]:
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [165]:
voting_clf = VotingClassifier(estimators=[
    ('rfc', RandomForestClassifier(random_state=42, n_estimators=2500, n_jobs=-1)),
    ('lr', LogisticRegression(random_state=42, max_iter=2500)),
    ('svc', SVC(random_state=42)),
    ('dtc', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5))
], voting='hard')

In [166]:
voting_clf.fit(X_train, y_train)

In [167]:
for name, clf in voting_clf.named_estimators_.items():
    print(name, "=", clf.score(X_test, y_test))

rfc = 0.8010664479081214
lr = 0.7780968006562756
svc = 0.7858900738310091
dtc = 0.7547169811320755
knn = 0.7592288761279737


# XGBoost

In [168]:
from xgboost import XGBClassifier

In [170]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [171]:
xgb_pred = xgb.predict(X_test)

In [172]:
print(confusion_matrix(y_test, xgb_pred))

[[985 232]
 [257 964]]


In [173]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1217
           1       0.81      0.79      0.80      1221

    accuracy                           0.80      2438
   macro avg       0.80      0.80      0.80      2438
weighted avg       0.80      0.80      0.80      2438
