# Space titanic

> Aim: minimally engineer features for an MVP random forest model

## Import packages

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 

## Load data

In [2]:
# Read the raw train and test CSVs into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)

In [3]:
# Display the training frame loaded above (it includes the Transported target column)
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
# Display the test frame loaded above (same columns as train, minus Transported)
test


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


## Feature engineering

### Combine train and test into one frame

In [5]:
# Tag each frame with its origin so the rows can be separated again later
train["dataset"] = "train"
test["dataset"] = "test"

# Stack both frames and renumber the rows 0..n-1 in a single step
train_test = pd.concat([train, test], ignore_index=True)

### Remove unused columns

In [6]:
# Name is not used as a model feature, so drop it from the combined frame
train_test = train_test.drop(columns=["Name"])

### Remove NAs

In [7]:
# Count missing values per column
train_test.isna().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Transported     4277
dataset            0
dtype: int64

In [8]:
# Columns that are identifiers, the target, or bookkeeping rather than model inputs
non_feature_cols = ("PassengerId", "Transported", "dataset")
feature_cols = [name for name in train_test.columns if name not in non_feature_cols]

# Keep only rows where every feature is present, then renumber the rows
train_test = train_test.dropna(subset=feature_cols).reset_index(drop=True)

In [9]:
# Inspect column dtypes to see which feature columns are object-typed
train_test.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported      object
dataset          object
dtype: object

### Convert categorical to numeric

In [10]:
le = preprocessing.LabelEncoder()

# Object-typed feature columns are the ones that need integer codes
object_features = [name for name in feature_cols if train_test[name].dtype == object]

# Re-fit the shared encoder on each column and overwrite the column with its codes
for name in object_features:
    train_test[name] = le.fit_transform(train_test[name])

### Split train and test back

In [11]:
# Row masks identifying where each row originally came from
from_train = train_test["dataset"] == "train"
from_test = train_test["dataset"] == "test"

# Separate the frames again, discard the helper column and renumber the rows
train = train_test[from_train].drop(columns="dataset").reset_index(drop=True)
test = train_test[from_test].drop(columns="dataset").reset_index(drop=True)

## Train model

### Split the training data into train and hold-out sets

In [12]:
# Feature matrix: the engineered feature columns of the labelled rows
X = train.loc[:, feature_cols].copy()

# Target vector: Transported, encoded to integers with the encoder created earlier
y = le.fit_transform(train["Transported"].copy())


In [13]:
# Hold out 33% of the labelled rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=32,
)

### Fit a random forest

In [14]:
# Seed the forest (same seed as the train/hold-out split) so that re-running the
# notebook from a fresh kernel fits the same model and reproduces the same metrics.
clf = RandomForestClassifier(n_estimators=100, random_state=32)
clf.fit(X_train, y_train)

RandomForestClassifier()

## Evaluate model

In [15]:
# Predict labels for the hold-out rows produced by train_test_split
y_pred = clf.predict(X_test)

In [16]:
# Report hold-out performance; each line is labelled with the metric it actually prints
# (previously all four lines were labelled "Accuracy").
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(f"F1: {metrics.f1_score(y_test, y_pred)}")
print(f"Precision: {metrics.precision_score(y_test, y_pred)}")
print(f"Recall: {metrics.recall_score(y_test, y_pred)}")

Accuracy: 0.793999104343932
Accuracy: 0.7846441947565544
Accuracy: 0.8026819923371648
Accuracy: 0.7673992673992674
