# Import the DataFrame from Data Analysis

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the prepared DataFrame from a pickle file next to this notebook
# (presumably written by the earlier data-analysis step, per the section heading).
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
df = pd.read_pickle('./dataframe.pkl')

In [3]:
# Sanity-check the load: list each column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   int8   
 2   CryoSleep     8693 non-null   int8   
 3   Destination   8693 non-null   int8   
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   int8   
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
 12  CabinLevel    8693 non-null   int8   
 13  CabinSide     8693 non-null   int8   
dtypes: bool(1), float64(6), int8(6), object(1)
memory usage: 534.9+ KB


# Preprocess the Data

First, split the data into features (X) and labels (Y).

In [4]:
# Model inputs: every column of `df` except the PassengerId identifier and the
# Transported target.
FEATURE_COLUMNS = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Age',
    'VIP',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'CabinLevel',
    'CabinSide',
]
X = df[FEATURE_COLUMNS]

# Target labels: the boolean Transported column.
Y = df['Transported']

Now scale each feature to the [0, 1] range using min-max scaling.

In [5]:
from sklearn import preprocessing

# Min-max scaling maps each feature column onto the [0, 1] range.
# NOTE(review): the scaler is fitted on every row of X before the cross-validation
# folds are drawn, so the held-out rows of each fold contribute to the fitted
# min/max. If scale-sensitive models are evaluated, consider fitting the scaler
# on the training fold only.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Display the scaled feature matrix (a NumPy array; column names are dropped).
X_scaled

array([[3.33333333e-01, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 2.50000000e-01, 5.00000000e-01],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        1.82322960e-03, 7.50000000e-01, 1.00000000e+00],
       [3.33333333e-01, 0.00000000e+00, 1.00000000e+00, ...,
        2.03041478e-03, 1.25000000e-01, 1.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        0.00000000e+00, 8.75000000e-01, 1.00000000e+00],
       [3.33333333e-01, 0.00000000e+00, 0.00000000e+00, ...,
        1.34048813e-01, 6.25000000e-01, 1.00000000e+00],
       [3.33333333e-01, 0.00000000e+00, 1.00000000e+00, ...,
        4.97244437e-04, 6.25000000e-01, 1.00000000e+00]])

# Create the stratified K-fold cross-validation splits

In [6]:
from sklearn.model_selection import StratifiedKFold

# Number of cross-validation folds.
splits = 10

# Stratified splitter: rows are shuffled before being assigned to folds, with a
# fixed seed so the fold assignment is the same on every run.
skf = StratifiedKFold(
    n_splits=splits,
    shuffle=True,
    random_state=67,
)

Evaluate each classifier across the stratified K-fold splits and compare the accuracy statistics. Random Forest and Gradient Boosting are the best performers; the other classifiers are commented out for faster tuning.

In [9]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
# Imports for the candidates that are currently switched off. Uncomment an import
# together with its matching entry in `candidates` below to re-evaluate that model.
#from sklearn.linear_model import LogisticRegression
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.svm import SVC
#from sklearn.tree import DecisionTreeClassifier

# The two models being tuned. They keep their own names so they can be inspected
# after the loop (each is left fitted on the final fold's training rows).
rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=4, min_samples_leaf=3, max_features='sqrt', random_state=67)
gbst = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, subsample=1.0, min_samples_split=2, max_depth=4, random_state=67)

# Display name -> estimator, in the order the results are reported.
candidates = {
    #'Logistic Regression': LogisticRegression(random_state=67),
    #'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=int(np.sqrt(len(X_scaled)))),
    #'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': rf,
    #'Support Vector Machine': SVC(probability=True),
    'Gradient Boost Machine': gbst,
}

# Display name -> list of per-fold accuracies, filled in by the loop below.
fold_scores = {model_name: [] for model_name in candidates}

for train_index, test_index in skf.split(X_scaled, Y):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    # skf.split yields positional indices. `.iloc` selects by position, whereas
    # plain `Y[...]` would look the integers up as index labels and only works
    # while Y happens to have a default 0..n-1 index.
    y_train_fold, y_test_fold = Y.iloc[train_index], Y.iloc[test_index]
    for model_name, model in candidates.items():
        # fit() retrains the estimator from scratch on this fold's training rows.
        model.fit(X_train_fold, y_train_fold)
        # score() on a classifier returns mean accuracy on the held-out rows.
        fold_scores[model_name].append(model.score(X_test_fold, y_test_fold))

# Keep the original per-model result names available to later cells.
rf_scores_stratified = fold_scores['Random Forest']
gbst_scores_stratified = fold_scores['Gradient Boost Machine']

# Report best, worst, mean and spread of the fold accuracies, as percentages.
for model_name, scores in fold_scores.items():
    print('\n' + model_name + ' model:')
    print('   Highest accuracy:', max(scores)*100, '%')
    print('   Lowest Accuracy:', min(scores)*100, '%')
    print('   Mean Accuracy:', np.mean(scores)*100, '%')
    print('   Standard Deviation:', np.std(scores)*100, '%')


Random Forest model:
   Highest accuracy: 82.39355581127732 %
   Lowest Accuracy: 78.13578826237054 %
   Mean Accuracy: 80.0186500535693 %
   Standard Deviation: 1.4244448905524851 %

Gradient Boost Machine model:
   Highest accuracy: 83.42922899884925 %
   Lowest Accuracy: 78.59608745684696 %
   Mean Accuracy: 80.1912754784863 %
   Standard Deviation: 1.273937924409236 %
