In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from xgboost import XGBClassifier

In [5]:
PATH = 'spaceship-titanic'
train = pd.read_csv(f'{PATH}/train.csv')
test = pd.read_csv(f'{PATH}/test.csv')

In [6]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [8]:
train['Transported'].replace(False, 0, inplace=True)
train['Transported'].replace(True, 1, inplace=True)

train.Transported = train.Transported.astype(int)

train['VIP'].replace(False, 0, inplace=True)
train['VIP'].replace(True, 1, inplace=True)
test['VIP'].replace(False, 0, inplace=True)
test['VIP'].replace(True, 1, inplace=True)

test.VIP = test.VIP.astype("Int8")
train.VIP = train.VIP.astype("Int8")

train.CryoSleep.replace(False, 0, inplace=True)
test.CryoSleep.replace(False, 0, inplace=True)

train.CryoSleep.replace(True, 1, inplace=True)
test.CryoSleep.replace(True, 1, inplace=True)

test.CryoSleep = test.CryoSleep.astype("Int8")
train.CryoSleep = train.CryoSleep.astype("Int8")

In [9]:
train[['deck','num', 'side']] = train['Cabin'].str.split('/', expand=True)
test[['deck','num', 'side']] = test['Cabin'].str.split('/', expand=True)

train.drop('Cabin', axis=1, inplace=True)
test.drop('Cabin', axis=1, inplace=True)

In [10]:
col_to_sum = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train['SumSpends'] = train[col_to_sum].sum(axis=1)
test['SumSpends'] = test[col_to_sum].sum(axis=1)

train['MaxSpends'] = train[col_to_sum].max(axis=1)
test['MaxSpends'] = test[col_to_sum].max(axis=1)

In [13]:
# Since dispersion of spendings is very high, we can use a transformation to normalize it.
train['log_spend'] = np.log(train.SumSpends + 1)
test['log_spend'] = np.log(test.SumSpends + 1)

# We must add 1, to prevent -infinite values.

In [14]:
# We should impute null values

null_cols = train.isnull().sum().sort_values(ascending=False)
null_cols = list(null_cols[null_cols>1].index)

In [17]:
# We do not need one-hot encoding in decision trees.

object_cols = [col for col in train.columns if train[col].dtype == 'object' or train[col].dtype == 'category']

oc = OrdinalEncoder()

df_for_encode = pd.concat([train, test])

df_for_encode[object_cols] = df_for_encode[object_cols].astype('category')

df_for_encode[object_cols] = oc.fit_transform(df_for_encode[object_cols])

del train, test

train = df_for_encode.iloc[:8693, :]
test = df_for_encode.iloc[8693: , :]

del df_for_encode

test.drop('Transported', inplace=True, axis=1)

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer([("imp", SimpleImputer(strategy='mean'), null_cols)])

train[null_cols] = ct.fit_transform(train[null_cols])
test[null_cols] = ct.transform(test[null_cols])

In [19]:
train.drop('PassengerId', axis=1, inplace = True)
test.drop('PassengerId', axis=1, inplace = True)

y_train = train['Transported']
X_train = train.drop('Transported', axis=1)
X_test = test

if X_train.shape[1] == X_test.shape[1]:
    print('Shapes are equal. We are ready to train models.')
else:
    print('There is something wrong in preprocessing steps.')

Shapes are equal. We are ready to train models.


In [53]:
# X_train.shape, y_train.shape, X_test.shape
x_tr, x_ts, y_tr, y_ts = train_test_split(X_train, y_train, test_size=0.5, random_state=0)
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(x_tr, x_ts, y_tr, y_ts)

print(models)

100%|██████████| 29/29 [00:09<00:00,  2.99it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.80               0.80     0.80      0.80   
RandomForestClassifier             0.79               0.79     0.79      0.79   
XGBClassifier                      0.79               0.79     0.79      0.79   
ExtraTreesClassifier               0.79               0.79     0.79      0.79   
LogisticRegression                 0.78               0.78     0.78      0.78   
SGDClassifier                      0.78               0.78     0.78      0.78   
AdaBoostClassifier                 0.78               0.78     0.78      0.78   
LinearSVC                          0.78               0.78     0.78      0.78   
CalibratedClassifierCV             0.78               0.78     0.78      0.78   
SVC                                0.78               0.78     0.78      0.78   
BaggingClassifier           




#### random forest

In [58]:
# Random Forest
top_accuracy = 0
top_value_x = 0
top_value_y = 0
top_model = 0
top_x_values = [] # [7, 4, 6, 11, 12, 19, 20, 22, 29, 30, 47]
top_y_values = [] # [31, 11, 12, 22, 14, 20, 23, 29, 41, 24, 22]

for value_x in np.arange(1, 100+1, 1):
    for value_y in np.arange(1, 100+1, 1):
        # Random Forest Classifier
        random_forest = RandomForestClassifier(n_estimators=value_x, 
                                            criterion='gini',
                                            max_depth=value_y)

        random_forest.fit(x_tr, y_tr)
        y_pred = random_forest.predict(x_ts)
        random_forest.score(x_tr, y_tr)   
        acc_random_forest = round(metrics.accuracy_score(y_pred, y_ts) * 100, 2)
        if acc_random_forest > top_accuracy:
            top_accuracy = acc_random_forest
            top_value_x = value_x
            top_value_y = value_y
            top_x_values.append(top_value_x)
            top_y_values.append(top_value_y)
            top_model = random_forest
        
        print(f'{value_x = :3d}, {value_y = :3d}, {acc_random_forest = :2.2f} %, {top_value_x = :2d}, {top_value_y = :2d}, {top_accuracy = :2.2f} %', end='\r')


value_x = 100, value_y = 100, acc_random_forest = 79.27 %, top_value_x = 31, top_value_y = 68, top_accuracy = 80.17 %

In [65]:
len(top_x_values[10:]), len(top_y_values[10:])

(10, 10)

In [66]:
# top_x_values = [7, 4, 6, 11, 12, 19, 20, 22, 29, 30, 47]
# top_y_values = [31, 11, 12, 22, 14, 20, 23, 29, 41, 24, 22]
top_accuracy = 0
for x_value, y_value in zip(top_x_values[5:], top_y_values[5:]):
    for i in range(1, 1000+1):
        # Random Forest Classifier
        random_forest = RandomForestClassifier(n_estimators=x_value, 
                                            criterion='gini',
                                            max_depth=y_value)
        random_forest.fit(x_tr, y_tr)
        y_pred = random_forest.predict(x_ts)
        random_forest.score(x_tr, y_tr)   
        acc_random_forest = round(metrics.accuracy_score(y_pred, y_ts) * 100, 2)

        if acc_random_forest > top_accuracy:
            top_accuracy = acc_random_forest  
            top_value_x = x_value
            top_value_y = y_value  
            top_model = random_forest                 
        
        print(f'{x_value = :2d}, {y_value = :2d}, {i:4d}/1000, {acc_random_forest = :2.2f} %, {top_value_x = :2d}, {top_value_y = :2d}, {top_accuracy = :2.2f} %', end='\r')

x_value = 31, y_value = 68, 1000/1000, acc_random_forest = 79.09 %, top_value_x = 28, top_value_y = 19, top_accuracy = 80.12 %

In [67]:
y_pred = pd.Series(top_model.predict(X_test))
# top_model.score(x_tr, y_tr)   
# acc_random_forest = round(metrics.accuracy_score(y_pred, y_ts) * 100, 2)
sample_df = pd.read_csv('spaceship-titanic/sample_submission.csv')

y_pred.loc[(y_pred == 1)]= 'True'
y_pred.loc[(y_pred == 0)]= 'False'

submission = pd.DataFrame({
        "PassengerId": sample_df["PassengerId"],
        "Transported": y_pred
    })
submission.to_csv('submission.csv', index=False)
print('Saved submission!')

Saved submission!


In [49]:
y_pred

0       False
1       False
2        True
3        True
4       False
        ...  
4272     True
4273    False
4274     True
4275     True
4276     True
Length: 4277, dtype: object