In [174]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

In [187]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


## Handle Missing Values
### Extract PassengerId

In [188]:
# Split PassengerId
# Train-set
train['P_grp'] = train.PassengerId.str.split('_', expand=True)[0].astype(int)
train['P_id'] = train.PassengerId.str.split('_', expand=True)[1].astype(int)

# Test-set
test['P_grp'] = test.PassengerId.str.split('_', expand=True)[0].astype(int)
test['P_id'] = test.PassengerId.str.split('_', expand=True)[1].astype(int)

### Cabin

In [189]:
# Train-set
ca_id = train[train.Cabin.isna()].index
for i in range(199):
    if train.P_id.iloc[ca_id[i]] < 2:
        train.Cabin.replace(np.nan, train.Cabin.iloc[ca_id[i]+1], inplace=True)
    else:
        train.Cabin.replace(np.nan, train.Cabin.iloc[ca_id[i]-1], inplace=True)
        
# Test-set
ca_id = test[test.Cabin.isna()].index
for i in range(100):
    if test.P_id.iloc[ca_id[i]] < 2:
        test.Cabin.replace(np.nan, test.Cabin.iloc[ca_id[i]+1], inplace=True)
    else:
        test.Cabin.replace(np.nan, test.Cabin.iloc[ca_id[i]-1], inplace=True)

### Extract Cabin

In [190]:
# Split Cabin
# Train-set
train['C_1'] = train.Cabin.str.split('/', expand=True)[0]
train['C_2'] = train.Cabin.str.split('/', expand=True)[1]
train['C_3'] = train.Cabin.str.split('/', expand=True)[2]
train.drop(['PassengerId', 'Cabin', 'Name'], axis=1, inplace=True)

# Test-set
test['C_1'] = test.Cabin.str.split('/', expand=True)[0]
test['C_2'] = test.Cabin.str.split('/', expand=True)[1]
test['C_3'] = test.Cabin.str.split('/', expand=True)[2]
test.drop(['PassengerId', 'Cabin','Name'], axis=1, inplace=True)

### RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

In [191]:
# Train-set
train.RoomService.replace(np.nan, 0.0, inplace=True)
train.FoodCourt.replace(np.nan, 0.0, inplace=True)
train.ShoppingMall.replace(np.nan, 0.0, inplace=True)
train.Spa.replace(np.nan, 0.0, inplace=True)
train.VRDeck.replace(np.nan, 0.0, inplace=True)

# Test-set
test.RoomService.replace(np.nan, 0.0, inplace=True)
test.FoodCourt.replace(np.nan, 0.0, inplace=True)
test.ShoppingMall.replace(np.nan, 0.0, inplace=True)
test.Spa.replace(np.nan, 0.0, inplace=True)
test.VRDeck.replace(np.nan, 0.0, inplace=True)

### HomePlanet & Destination & CryoSleep & VIP

In [192]:
# Train-set
h_id = train[train.HomePlanet.isna()].index
d_id = train[train.Destination.isna()].index
c_id = train[train.CryoSleep.isna()].index
v_id = train[train.VIP.isna()].index
for i in range(201):
    if train.P_id.iloc[h_id[i]] < 2:
        train.HomePlanet.replace(np.nan, train.HomePlanet.iloc[h_id[i]+1], inplace=True)
    else:
        train.HomePlanet.replace(np.nan, train.HomePlanet.iloc[h_id[i]-1], inplace=True)
        
for i in range(182):
    if train.P_id.iloc[d_id[i]] < 2:
        train.Destination.replace(np.nan, train.Destination.iloc[d_id[i]+1], inplace=True)
    else:
        train.Destination.replace(np.nan, train.Destination.iloc[d_id[i]-1], inplace=True)
        
for i in range(217):
    if train.P_id.iloc[c_id[i]] < 2:
        train.CryoSleep.replace(np.nan, train.CryoSleep.iloc[c_id[i]+1], inplace=True)
    else:
        train.CryoSleep.replace(np.nan, train.CryoSleep.iloc[c_id[i]-1], inplace=True)
        
for i in range(203):
    if train.P_id.iloc[v_id[i]] < 2:
        train.VIP.replace(np.nan, train.VIP.iloc[v_id[i]+1], inplace=True)
    else:
        train.VIP.replace(np.nan, train.VIP.iloc[v_id[i]-1], inplace=True)
        
# Test-set
h_id = test[test.HomePlanet.isna()].index
d_id = test[test.Destination.isna()].index
c_id = test[test.CryoSleep.isna()].index
v_id = test[test.VIP.isna()].index
for i in range(87):
    if test.P_id.iloc[h_id[i]] < 2:
        test.HomePlanet.replace(np.nan, test.HomePlanet.iloc[h_id[i]+1], inplace=True)
    else:
        test.HomePlanet.replace(np.nan, test.HomePlanet.iloc[h_id[i]-1], inplace=True)
        
for i in range(92):
    if test.P_id.iloc[d_id[i]] < 2:
        test.Destination.replace(np.nan, test.Destination.iloc[d_id[i]+1], inplace=True)
    else:
        test.Destination.replace(np.nan, test.Destination.iloc[d_id[i]-1], inplace=True)   
        
for i in range(93):
    if test.P_id.iloc[c_id[i]] < 2:
        test.CryoSleep.replace(np.nan, test.CryoSleep.iloc[c_id[i]+1], inplace=True)
    else:
        test.CryoSleep.replace(np.nan, test.CryoSleep.iloc[c_id[i]-1], inplace=True)
        
for i in range(93):
    if test.P_id.iloc[v_id[i]] < 2:
        test.VIP.replace(np.nan, test.VIP.iloc[v_id[i]+1], inplace=True)
    else:
        test.VIP.replace(np.nan, test.VIP.iloc[v_id[i]-1], inplace=True)

### Age

In [193]:
train.Age.replace(np.nan, 27.0, inplace=True)
test.Age.replace(np.nan, 27.0, inplace=True)

## Quantify Data

In [194]:
# Train-set
train.CryoSleep = train.CryoSleep.apply(lambda x: 1 if x == True else 0)
train.VIP = train.VIP.apply(lambda x: 1 if x == True else 0)
train.Transported = train.Transported.apply(lambda x: 1 if x == True else 0)

# Test-set
test.CryoSleep = test.CryoSleep.apply(lambda x: 1 if x == True else 0)
test.VIP = test.VIP.apply(lambda x: 1 if x == True else 0)

## Create Model
### Split Data

In [198]:
X = train.drop('Transported', axis=1)
y = train.Transported

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2)

In [199]:
ctb = CatBoostClassifier(iterations=5, loss_function='Logloss', cat_features=[0,2,12,13,14], silent=True).fit(X_tr, y_tr)
val = ctb.score(X_val, y_val)
tr = ctb.score(X_tr, y_tr)
print(tr)
print(val)

0.7907679033649698
0.7809085681426107
