In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
118,0126_01,Mars,True,F/27/S,TRAPPIST-1e,67.0,False,0.0,0.0,0.0,0.0,0.0,Milly Mish,True
7150,7628_01,Mars,False,F/1584/P,TRAPPIST-1e,19.0,False,37.0,0.0,1288.0,0.0,0.0,Hoces Wala,True
3176,3424_01,Earth,False,E/233/S,TRAPPIST-1e,37.0,False,190.0,0.0,2.0,799.0,0.0,Jillie Carrishley,False
117,0123_01,Earth,False,F/27/P,TRAPPIST-1e,18.0,False,55.0,597.0,49.0,0.0,1.0,Iree Fielson,False
1464,1548_01,Europa,True,B/50/P,55 Cancri e,51.0,,0.0,0.0,0.0,0.0,0.0,Okulk Unicting,True


In [4]:
train_df.shape

(8693, 14)

## Data cleaning

In [5]:
train_df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
services_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_df[services_cols]  = train_df[services_cols].fillna(0.0)
test_df[services_cols]  = test_df[services_cols].fillna(0.0)

In [7]:
train_age_mean = train_df['Age'].mean()
test_age_mean = test_df['Age'].mean()

train_df['Age'] = train_df['Age'].fillna(train_age_mean)
test_df['Age'] = test_df['Age'].fillna(test_age_mean)

In [8]:
train_df = train_df.fillna(method='bfill')
test_df = test_df.fillna(method='bfill')

In [9]:
train_df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [10]:
test_df.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64

In [11]:
col_to_drop = ['PassengerId', 'Cabin', 'Name']

n_train_df = train_df.drop(col_to_drop, axis='columns')
n_test_df = test_df.drop(col_to_drop, axis='columns')

## Data preprocessing

In [12]:
n_train_df.sample(5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
5552,Earth,True,TRAPPIST-1e,38.0,False,0.0,0.0,0.0,0.0,0.0,True
1271,Earth,False,TRAPPIST-1e,14.0,False,1489.0,2.0,126.0,460.0,72.0,False
3414,Europa,False,55 Cancri e,35.0,False,0.0,388.0,0.0,1792.0,2891.0,False
4558,Earth,False,TRAPPIST-1e,23.0,False,14.0,0.0,437.0,103.0,333.0,True
2914,Mars,True,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,True


In [13]:
n_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   object 
 1   CryoSleep     8693 non-null   bool   
 2   Destination   8693 non-null   object 
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   bool   
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(2)
memory usage: 568.9+ KB


### Dummy Variable

In [14]:
n_train_df["CryoSleep"] = n_train_df["CryoSleep"].apply([lambda x: 1 if(x == True) else 0])
n_train_df["VIP"] = n_train_df["VIP"].apply([lambda x: 1 if(x == True) else 0])
n_train_df["Transported"] = n_train_df["Transported"].apply([lambda x: 1 if(x == True) else 0])


n_test_df["CryoSleep"] = n_test_df["CryoSleep"].apply([lambda x: 1 if(x == True) else 0])
n_test_df["VIP"] = n_test_df["VIP"].apply([lambda x: 1 if(x == True) else 0])

In [15]:
n_train_df["HomePlanet"].value_counts()

Earth     4697
Europa    2184
Mars      1812
Name: HomePlanet, dtype: int64

In [16]:
n_train_df["Destination"].value_counts()

TRAPPIST-1e      6041
55 Cancri e      1836
PSO J318.5-22     816
Name: Destination, dtype: int64

In [17]:
n_train_df = pd.get_dummies(n_train_df, columns=['HomePlanet', 'Destination'], drop_first=True)
n_test_df = pd.get_dummies(n_test_df, columns=['HomePlanet', 'Destination'], drop_first=True)

In [18]:
X = n_train_df.drop(columns=["Transported"])
y = n_train_df["Transported"]

## Train test split

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Build Model 

In [20]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [21]:
model.fit(X_train, y_train)

RandomForestClassifier()

In [22]:
model.predict([X_train.iloc[100]])



array([0], dtype=int64)

In [23]:
y_train.iloc[100]

0

In [24]:
model.score(X_test, y_test)

0.7768832662449684

## Model test data predict

In [72]:
test_df_pred = model.predict(n_test_df)

In [73]:
pId, trans_arr = [], []
for i in range(len(test_df_pred)):
    pId.append(test_df.iloc[i]['PassengerId'])
    if(test_df_pred[i] == 1):
        trans_arr.append('True')
    if(test_df_pred[i] == 0):
        trans_arr.append('False')

In [74]:
len(pId), len(trans_arr)

(4277, 4277)

In [75]:
test_df.shape

(4277, 13)

In [76]:
sample_sub_df = pd.DataFrame(data={ 'PassengerId' : pId, 'Transported': trans_arr})

In [77]:
sample_sub_df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [78]:
sample_sub_df.to_csv("sample_submission.csv", index=False)