In [32]:
import pandas as pd
import numpy as np
train_titanic = pd.read_csv("/home/tuandinh/Desktop/ML and AI/Spaceship Titanic competiton/datasets/train.csv")
test_titanic = pd.read_csv("/home/tuandinh/Desktop/ML and AI/Spaceship Titanic competiton/datasets/test.csv")
train_titanic.head(), test_titanic.head()

(  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
 0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
 1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
 2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
 3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
 4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   
 
    RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
 0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
 1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
 2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
 3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
 4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   
 
    Transported  
 0        False  
 1         True  
 2        False  
 3        False  


In [33]:
train_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [34]:
train_titanic.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

### **Features scalling data**

In [35]:
col_names = "RoomService", "FoodCourt","ShoppingMall","Spa","VRDeck"
roomservice_ix, foodcourt_ix, shopping_ix, spa_ix, vrCheck_ix = [
    train_titanic.columns.get_loc(c) for c in col_names] # get the column indices
print(roomservice_ix, foodcourt_ix, shopping_ix, spa_ix, vrCheck_ix)

7 8 9 10 11


In [36]:
import math
from sklearn.base import BaseEstimator, TransformerMixin
attribs_spending_money = [7,8,9,10,11]
# roomservice_ix, foodcourt_ix, shopping_ix, spa_ix, vrCheck_ix = 7,8,9,10,11
class CombineSpendingMoneyOfPerson(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        array = np.zeros(len(X)) # mang mot cot toan so 0
        for ele in range(len(X)):
            for i in attribs_spending_money:
                if ~math.isnan(X[ele][i]):
                    array[ele] += (X[ele][i])
        return np.c_[X,array]

# function for add feature ["spen_money_of_person"]
def feature_scalling_dataset(train, test, drop=None):
    attr_adder = CombineSpendingMoneyOfPerson()
    # Dataset with added columns ["spending_money_of_person"]
    train_titanic_extra_attribs = attr_adder.transform(train.values)
    test_titanic_extra_attribs = attr_adder.transform(test.values)
    train_titanic_extra_attribs = pd.DataFrame(train_titanic_extra_attribs,
                            columns=list(train) + ["spending_money_of_person"],
                            index= train.index)
    test_titanic_extra_attribs = pd.DataFrame(test_titanic_extra_attribs,
                            columns=list(test) + ["spending_money_of_person"],
                            index= test.index)
    return train_titanic_extra_attribs, test_titanic_extra_attribs
train_titanic_1, test_titanic_1 = feature_scalling_dataset(train_titanic, test_titanic)
train_titanic_1.isnull().sum()

PassengerId                   0
HomePlanet                  201
CryoSleep                   217
Cabin                       199
Destination                 182
Age                         179
VIP                         203
RoomService                 181
FoodCourt                   183
ShoppingMall                208
Spa                         183
VRDeck                      188
Name                        200
Transported                   0
spending_money_of_person    908
dtype: int64

In [37]:
# Split columns ["Cabin"] to ["Deck","Side"]
def deck_side_encode(data):
    split_cabin = pd.DataFrame(list(map(lambda x: str(x).split("/"), data.Cabin)))
    new_data = data.copy()
    new_data["Side"] = split_cabin[2]
    new_data["Deck"] = split_cabin[0]
    new_data = new_data.drop("Cabin", axis=1)
    return new_data

In [38]:
attribs_num = ["Age","RoomService","FoodCourt","ShoppingMall","Spa","VRDeck","spending_money_of_person"]
attribs_cat = ["HomePlanet", "Destination", "Cabin"]
attribs_cat_bool = ["CryoSleep", "VIP"]

### **Handle Data**

In [39]:
from sklearn.impute import SimpleImputer
def prepare_data(train, test, drop=None):
    train, test = feature_scalling_dataset(train, test)
    # delete columns Name and PassengerId
    train = train.drop(["Name", "PassengerId"], axis=1)
    test = test.drop(["Name", "PassengerId"], axis=1)
    # missing value
    imputer = SimpleImputer(strategy="mean")
    imputer_cat_bool = SimpleImputer(strategy="most_frequent")
    train[attribs_num] = imputer.fit_transform(train[attribs_num])
    test[attribs_num] = imputer.fit_transform(test[attribs_num])
    train[attribs_cat_bool] = imputer_cat_bool.fit_transform(train[attribs_cat_bool])
    test[attribs_cat_bool] = imputer_cat_bool.fit_transform(test[attribs_cat_bool])
    train[attribs_cat] = imputer_cat_bool.fit_transform(train[attribs_cat])
    test[attribs_cat] = imputer_cat_bool.fit_transform(test[attribs_cat])
    train_titan_encoder = pd.get_dummies(deck_side_encode(train), dummy_na=True, columns=["HomePlanet","Destination","Deck","Side"])
    test_titan_encoder = pd.get_dummies(deck_side_encode(test), dummy_na=True, columns=["HomePlanet","Destination","Deck","Side"])
    # Handle features
    #train data
    train_titan_encoder["CryoSleep"] = train_titan_encoder["CryoSleep"].astype(int)
    train_titan_encoder["VIP"] = train_titan_encoder["VIP"].astype(int)
    train_titan_encoder["Transported"] = train_titan_encoder["Transported"].astype(int)
    # test data
    test_titan_encoder["CryoSleep"] = test_titan_encoder["CryoSleep"].astype(int)
    test_titan_encoder["VIP"] = test_titan_encoder["VIP"].astype(int)
    # test_titan_encoder["Transported"] = test_titan_encoder["Transported"].astype(int)
    return train_titan_encoder, test_titan_encoder

In [40]:
train_prep, test_prep = prepare_data(train_titanic, test_titanic)

In [41]:
train_prep.isnull().sum()

CryoSleep                    0
Age                          0
VIP                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
Transported                  0
spending_money_of_person     0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
HomePlanet_nan               0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
Destination_nan              0
Deck_A                       0
Deck_B                       0
Deck_C                       0
Deck_D                       0
Deck_E                       0
Deck_F                       0
Deck_G                       0
Deck_T                       0
Deck_nan                     0
Side_P                       0
Side_S                       0
Side_nan                     0
dtype: int64

In [42]:
train_prep.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,spending_money_of_person,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_nan,Side_P,Side_S,Side_nan
0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,736.0,...,0,0,0,1,0,0,0,0,1,0
2,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,10383.0,...,0,0,0,0,0,0,0,0,1,0
3,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,5176.0,...,0,0,0,0,0,0,0,0,1,0
4,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1091.0,...,0,0,0,1,0,0,0,0,1,0
