In [1]:
import numpy as np
import pandas as pd
import skopt
from sklearn import metrics
from skopt.plots import plot_convergence
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

results_array = []

# helpers
def get_mae(model, train_X, val_X, train_y, val_y):
   model.fit(train_X, train_y)
   preds_val = model.predict(val_X)
   mae = metrics.mean_absolute_error(val_y, preds_val)
   return(mae)

def get_survival_rate(gender):
    survival = training_data.loc[training_data.Sex == gender]['Survived']
    return sum(survival) / len(survival)




In [2]:
training_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data = [['female', get_survival_rate('female')], ['male', get_survival_rate('male')]]
df_survival_rates = pd.DataFrame(data, columns=['Gender', 'SurvivalRate'])

df_survival_rates

Unnamed: 0,Gender,SurvivalRate
0,female,0.742038
1,male,0.188908


In [4]:
# TODO: do proper dimensionality reduction + optimization
# TODO: re-add 'Cabin' here

features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Age', 'Embarked']
dummy_columns = ['Sex', 'Embarked']

# clean up the data before setting up features and dependent
base_features = training_data[features]
# QUESTION: why use median here? What about 0? Or what about base_features.mean ?
base_features = base_features.fillna(base_features.median())

X = pd.get_dummies(base_features, columns=dummy_columns)
y = training_data['Survived']

X.head()

Unnamed: 0,Pclass,SibSp,Parch,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,1,0,22.0,0,1,0,0,1
1,1,1,0,38.0,1,0,1,0,0
2,3,0,0,26.0,1,0,0,0,1
3,1,1,0,35.0,1,0,0,0,1
4,3,0,0,35.0,0,1,0,0,1


In [5]:
# create training and verification set
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [11]:
# baseline model
# -----------------------------------------------------------------------------
baseline_model = RandomForestClassifier(
        random_state=1,
        n_estimators=100,
        max_leaf_nodes=10,
)

# baseline_model.fit(train_X, train_y)
# baseline_predicted = baseline_model.predict(val_X)
# baseline_mae = metrics.mean_absolute_error(val_y, baseline_predicted)
baseline_mae = get_mae(baseline_model, train_X, val_X, train_y, val_y)
baseline_mae

NameError: name 'debugger' is not defined

In [7]:
# Hyperparameter-optimized model
# ------------------------------------------------------------------------------
SPACE = [
          skopt.space.Integer(2, 1000, name="max_leaf_nodes"),
          skopt.space.Integer(2, 200, name="n_estimators"),
          skopt.space.Integer(2, 3000, name="max_depth"),
        ]

hopt_model = RandomForestClassifier(
        max_depth=5,
        random_state=0,
)

@skopt.utils.use_named_args(SPACE)
def objective(**params):
    hopt_model.set_params(**params)
    cvs = cross_val_score(hopt_model, X, y, cv=5, n_jobs=-1, scoring="accuracy")
    # turning final to a positive number increases MAE (therefore leave it positive)
    final = -np.mean(cvs)
    return final

optimize_results = skopt.gp_minimize(objective, SPACE, n_calls=50, random_state=0)



In [8]:
optimize_results

          fun: -0.8249495795249784
    func_vals: array([-0.80254001, -0.80141642, -0.79023075, -0.80365733, -0.80365733,
       -0.7991566 , -0.80253374, -0.79915653, -0.80030538, -0.80144809,
       -0.79122217, -0.78671502, -0.79798272, -0.77780815, -0.78456213,
       -0.80365733, -0.82381336, -0.81822684, -0.78671502, -0.80253374,
       -0.78678429, -0.80253374, -0.80253374, -0.78671502, -0.80367638,
       -0.78456213, -0.80253374, -0.78671502, -0.78456213, -0.78671502,
       -0.79122217, -0.78671502, -0.81712828, -0.81043065, -0.80253374,
       -0.82269604, -0.82494958, -0.79346936, -0.82378176, -0.81935043,
       -0.78671502, -0.80920006, -0.80253374, -0.80471195, -0.78671502,
       -0.80137255, -0.80365106, -0.8047372 , -0.82160397, -0.82158507])
       models: [GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                         kernel=1**2 * Matern(length_scale=[1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),
                         n_restarts_optimizer=2,

In [9]:
hopt_model = RandomForestClassifier(
        random_state=0,
        max_leaf_nodes=optimize_results.x[0],
        n_estimators=optimize_results.x[1],
        max_depth=optimize_results.x[2],
)

hopt_mae = get_mae(hopt_model, train_X, val_X, train_y, val_y)

hopt_mae

0.18385650224215247

In [10]:
model_results = pd.DataFrame()