In [214]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

In [215]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

y_train = train_df['Transported']
test_passengerId = test_df["PassengerId"]

modes = train_df.mode().iloc[0] # Fill out the empty data

def proc_data(df):
    df.fillna(modes, inplace=True)

    
proc_data(train_df)
proc_data(test_df)

X_train = train_df
X_test = test_df

X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [216]:
train_df['CryoSleep'] = (train_df['CryoSleep']).astype(int)
test_df['CryoSleep'] = (test_df['CryoSleep']).astype(int)

train_df['HomePlanet'] = train_df['HomePlanet'].map( {'Europa': 0, 'Mars': 1, 'Earth': 2} ).astype(int)
test_df['HomePlanet'] = test_df['HomePlanet'].map( {'Europa': 0, 'Mars': 1, 'Earth': 2} ).astype(int)

train_df['Destination'] = train_df['Destination'].map( { '55 Cancri e': 0, 'PSO J318.5-22': 1, 'TRAPPIST-1e': 2 } ).astype(int)
test_df['Destination'] = test_df['Destination'].map( { '55 Cancri e': 0, 'PSO J318.5-22': 1, 'TRAPPIST-1e': 2 }).astype(int)

train_df['Is_Baby'] = (train_df['Age'] == 0).astype(int)
test_df['Is_Baby'] = (test_df['Age'] == 0).astype(int)

train_df['LivingLocation'] = train_df['Cabin'].str[-1].map( { 'S': 0, 'P': 1}).astype(int)
test_df['LivingLocation'] = test_df['Cabin'].str[-1].map( { 'S': 0, 'P': 1}).astype(int)

train_df['TotalSpending'] = np.log((train_df['FoodCourt'] + train_df['ShoppingMall'] + train_df['RoomService'] + train_df['VRDeck'] + train_df['Spa']) + 1)
test_df['TotalSpending'] = np.log((test_df['FoodCourt'] + train_df['ShoppingMall'] + train_df['RoomService'] + train_df['VRDeck'] + train_df['Spa']) + 1)

train_df['ZeroSpender'] = (train_df['TotalSpending'] == 0).astype(int)
test_df['ZeroSpender'] = (test_df['TotalSpending'] == 0).astype(int)

train_df['YoungAdultSleeper'] = ((train_df['Age'] >= 20) & (train_df['Age'] <= 25) & (train_df['CryoSleep'] == True)).astype(int)
test_df['YoungAdultSleeper'] = ((test_df['Age'] >= 20) & (test_df['Age'] <= 25) & (test_df['CryoSleep'] == True)).astype(int)

In [217]:
train_df.drop(columns=['PassengerId', 'Name', 'Transported', 'Cabin'], inplace=True)
test_df.drop(columns=['PassengerId', 'Name', 'Cabin'], inplace=True)

train_df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Is_Baby,LivingLocation,TotalSpending,ZeroSpender,YoungAdultSleeper
0,0,0,2,39.0,False,0.0,0.0,0.0,0.0,0.0,0,1,0.000000,1,0
1,2,0,2,24.0,False,109.0,9.0,25.0,549.0,44.0,0,0,6.602588,0,0
2,0,0,2,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0,0,9.248021,0,0
3,0,0,2,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0,0,8.551981,0,0
4,2,0,2,16.0,False,303.0,70.0,151.0,565.0,2.0,0,0,6.995766,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,0,0,41.0,True,0.0,6819.0,0.0,1643.0,74.0,0,1,9.052165,0,0
8689,2,1,1,18.0,False,0.0,0.0,0.0,0.0,0.0,0,0,0.000000,1,0
8690,2,0,2,26.0,False,0.0,0.0,1872.0,1.0,0.0,0,0,7.535830,0,0
8691,0,0,0,32.0,False,0.0,1049.0,0.0,353.0,3235.0,0,0,8.442039,0,0


In [218]:
test_df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Is_Baby,LivingLocation,TotalSpending,ZeroSpender,YoungAdultSleeper
0,2,1,2,27.0,False,0.0,0.0,0.0,0.0,0.0,0,0,0.000000,1,0
1,2,0,2,19.0,False,0.0,9.0,0.0,2823.0,0.0,0,0,6.602588,0,0
2,0,1,0,31.0,False,0.0,0.0,0.0,0.0,0.0,0,0,8.825854,0,0
3,0,0,2,38.0,False,0.0,6652.0,0.0,181.0,585.0,0,0,9.263502,0,0
4,2,0,2,20.0,False,10.0,0.0,635.0,0.0,0.0,0,0,6.929517,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2,1,2,34.0,False,0.0,0.0,0.0,0.0,0.0,0,0,7.389564,0,0
4273,2,0,2,42.0,False,0.0,847.0,17.0,10.0,144.0,0,0,7.218177,0,0
4274,1,1,0,24.0,False,0.0,0.0,0.0,0.0,0.0,0,1,7.784057,0,1
4275,0,0,2,24.0,False,0.0,2680.0,0.0,0.0,523.0,0,1,9.025456,0,0


In [220]:
param_dist = {'n_estimators': randint(50,500),
              'max_depth': randint(1,20)}

rf = RandomForestClassifier()
rand_search = RandomizedSearchCV(rf, param_distributions = param_dist, n_iter=5, cv=5)
rand_search.fit(X_train, y_train)

In [221]:
best_rf = rand_search.best_estimator_
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'max_depth': 8, 'n_estimators': 470}


In [222]:
y_pred = best_rf.predict(X_test)

In [223]:
output = pd.DataFrame({'PassengerId': test_passengerId, 'Transported': y_pred})
output_cut = output.iloc[:4277]
output_cut.to_csv('submission.csv', index=False)