In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [2]:
spaceship_data = pd.read_csv("../data/spaceship-titanic/train.csv")

spaceship_holdout = pd.read_csv("../data/spaceship-titanic/test.csv")

Xtrain, Xtest, Ytrain, Ytest = train_test_split(spaceship_data.loc[:,spaceship_data.columns != "Transported"], spaceship_data.Transported, train_size=0.8, )

## Feature engineering

- Remove name
- Update role of id?
- Character to categoricals
- Do something with the NAs

### Column datatypes

| Column      |     datatype |
|------------|-------| 
|PassengerId  |    object | 
|HomePlanet   |    object |
|CryoSleep    |    object |
|Cabin        |    object |
|Destination  |    object |
|Age          |   float64 |
|VIP          |    object |
|RoomService  |   float64 |
|FoodCourt    |   float64 |
|ShoppingMall |   float64 |
|Spa          |   float64 |
|VRDeck       |   float64 |
|Name         |    object |
|Transported  |      bool |

### Methods/Functions for preprocessing

- Pipeline, make_pipeline
- UDF
- Standard scaler?
- Dealing with missing levels?


In [3]:


cats = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

conts = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

def impute_median(col):
    col_med = np.nanmedian(col)
    return col.fillna(col_med)

def impute_mode(col):
    col_mode = col.mode()
    return col.fillna(col_mode)
    

def proc_data(df):
    #df = df.copy()
    df = df.drop(columns = ["Name", "PassengerId"], axis = 1)
    # Impute median for continuous vars
    df[conts] = df[conts].apply(impute_median)
    # Impute mode for categoricals
    df[cats] = df[cats].apply(impute_mode)
    # Convert categorial columns explicitly
    df[cats] = df[cats].apply(pd.Categorical)
    df[cats] = df[cats].apply(lambda x: x.cat.codes)
    return df


### Hyperparameter tuning

Functions
- Randomised search cv:
- Grid search cv:

In [4]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 600, stop = 1000, num = 4)]
# Number of features to consider at every split
max_features = 'sqrt'
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [600, 733, 866, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 35, 60, 85, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True]}


In [None]:
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

from sklearn.ensemble import RandomForestClassifier
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=1, random_state=42, n_jobs = 1)
# Fit the random search model
Xtrain_data = proc_data(Xtrain)
Ytrain_data = Ytrain.replace({True: "Transported", False: "NotTransported"})
rf_random.fit(Xtrain_data, Ytrain_data)

In [10]:
print(rf_random.best_params_)

Xtest_data = proc_data(Xtest)
Ytest_data = Ytest.replace({True: "Transported", False: "NotTransported"})

print (rf_random.score(Xtrain_data, Ytrain_data))
print(rf_random.score(Xtest_data, Ytest_data))

{'n_estimators': 600, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': True}
0.8871153293068738
0.7860839562967222


### Model fit

 - Custom scorer: sklearn.metrics/make_scorer, takes udf with train/pred inputs + direction of optimisation

In [34]:
rf_tuned = RandomForestClassifier()
rf_tuned.set_params(**rf_random.best_params_)

rf_tuned.fit(Xtrain_data, Ytrain_data)

transported_pred = rf_tuned.predict(proc_data(spaceship_holdout))


In [37]:
conds = {"Transported": True, "NotTransported": False}
transported_pred = [conds[x] for x in transported_pred]

In [41]:
out_df = pd.DataFrame({'PassengerId': spaceship_holdout.PassengerId, 'isTransported': transported_pred})
#out_df.to_csv()