In [31]:
import pandas as pd
import seaborn as sns

In [32]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [33]:
def split_columns(raw_dataset):
    dataset = raw_dataset.copy()

    dataset['Group'] = dataset['PassengerId'].str[:4].astype(float)
    dataset['PeopleId'] = dataset['PassengerId'].str[-2:].astype(float)
    
    dataset['Deck'] = dataset['Cabin'].str[0]
    dataset['Num'] = dataset['Cabin'].str[2:-2].astype(float)
    dataset['Side'] = dataset['Cabin'].str[-1]

    return dataset


train_data = split_columns(train_data)
test_data = split_columns(test_data)

In [34]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,PeopleId,Deck,Num,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1.0,1.0,B,0.0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2.0,1.0,F,0.0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3.0,1.0,A,0.0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3.0,2.0,A,0.0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4.0,1.0,F,1.0,S


In [35]:
train_data = train_data.drop(['PassengerId','Cabin','Name','PeopleId'], axis=1)
test_data = test_data.drop(['PassengerId','Cabin','Name','PeopleId'], axis=1)

In [36]:
def boolean_to_int(dataset,columns):
    for column in columns:
        dataset.loc[dataset[column]==True, column] = 1
        dataset.loc[dataset[column]==False, column] = 0

boolean_to_int(train_data, ['Transported', 'CryoSleep', 'VIP'])
boolean_to_int(test_data, ['CryoSleep', 'VIP'])

In [37]:
df = pd.get_dummies(train_data[['HomePlanet','Destination','Deck','Side']], drop_first=True)
train_data = train_data.drop(['HomePlanet','Destination','Deck','Side'], axis=1)
train_data = pd.concat([train_data,df], axis=1)

df2 = pd.get_dummies(test_data[['HomePlanet','Destination','Deck','Side']], drop_first=True)
test_data = test_data.drop(['HomePlanet','Destination','Deck','Side'], axis=1)
test_data = pd.concat([test_data,df2], axis=1)

In [38]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train_data = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)

scaler2 = MinMaxScaler()
test_data = pd.DataFrame(scaler2.fit_transform(test_data), columns=test_data.columns)

In [39]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=2)
train_data = pd.DataFrame(imputer.fit_transform(train_data), columns=train_data.columns)

imputer2 = KNNImputer(n_neighbors=2)
test_data = pd.DataFrame(imputer2.fit_transform(test_data), columns=test_data.columns)

In [41]:
from sklearn.model_selection import train_test_split

y = train_data['Transported'].astype(int)
x = train_data.drop('Transported', axis=1)
x_train, x_val, y_train, y_val = train_test_split(x,y,test_size=0.1,random_state=42)

In [42]:
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=500, early_stopping_rounds=5)
xgb.fit(x_train,y_train,eval_set=[(x_val,y_val)])

[0]	validation_0-logloss:0.58848
[1]	validation_0-logloss:0.52996
[2]	validation_0-logloss:0.49306
[3]	validation_0-logloss:0.46833
[4]	validation_0-logloss:0.45406
[5]	validation_0-logloss:0.43851
[6]	validation_0-logloss:0.43186
[7]	validation_0-logloss:0.42456
[8]	validation_0-logloss:0.41918
[9]	validation_0-logloss:0.41636
[10]	validation_0-logloss:0.41361
[11]	validation_0-logloss:0.41236
[12]	validation_0-logloss:0.41402
[13]	validation_0-logloss:0.41341
[14]	validation_0-logloss:0.41444
[15]	validation_0-logloss:0.41377


In [43]:
result = xgb.predict(test_data)
final_data = pd.read_csv('test.csv')
result = [bool(x) for x in result]

df = pd.DataFrame({'PassengerId': final_data['PassengerId'], 'Transported': result})
df.to_csv('submission.csv',index=False)