# Import libraries and dataset

In [72]:
import joblib
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [73]:
# Read the training split from disk and preview its first rows.
train_csv_path = "../datasets/train.csv"
train = pd.read_csv(train_csv_path)
train.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Class1,Class2,Class3,...,Alone,FamSize,SameTickets,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_U,Master
0,1,0,0,0.26975,1,0,0.338125,0,0,1,...,0,1,1,0,0,0,0,0,1,0
1,2,1,1,0.46975,1,0,0.685892,1,0,0,...,0,1,1,0,0,1,0,0,0,0
2,3,1,1,0.31975,0,0,0.350727,0,0,1,...,1,0,1,0,0,0,0,0,1,0
3,4,1,1,0.43225,1,0,0.639463,1,0,0,...,0,1,2,0,0,1,0,0,0,0
4,5,0,0,0.43225,0,0,0.352955,0,0,1,...,1,0,1,0,0,0,0,0,1,0


# Train Test Split

Since the data is already split for us, there is no need to use a data-splitting function from a library (or to split it ourselves manually, hehe). Instead, we separate the `train` data frame into the feature matrix `x_train` and the target `y_train`, so the training data is ready for our Machine Learning Model; the separate test dataset is not needed for the hyperparameter search below.

This data frame has already been passed through our data pipeline, so the training and test datasets are homogeneous.

In [74]:
# Target vector: the survival label.
y_train = train["Survived"]

# Feature matrix: every column except the passenger identifier and the label.
non_feature_columns = ["PassengerId", "Survived"]
x_train = train.drop(columns=non_feature_columns)

# Machine Learning Model

In [75]:
# Base estimator; the fixed seed keeps every individual fit deterministic.
model = GradientBoostingClassifier(random_state=0)

# Candidate values sampled by the randomized search.
# "loss" is not searched, so the estimator's default loss is used.
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1, 0.5],
    "n_estimators": [25, 50, 100, 500, 1000, 2000],
    "subsample": [0.1, 0.25, 0.5, 0.75, 1],
    "criterion": ["friedman_mse", "squared_error"],
    "min_samples_split": [2, 3, 5, 10],
    "min_samples_leaf": [1, 2, 3],
    "max_depth": [1, 2, 3, 4, 5],
    "min_impurity_decrease": [0.001, 0.01, 0.025, 0.05],
    "max_features": [None, "sqrt", "log2"]
}

n_folds = 10
# random_state pins which 100 candidates are drawn, so a fresh
# "Restart & Run All" reproduces the same ranking and the same best model.
# n_jobs=-1 spreads the n_iter * n_folds fits over all available cores;
# it does not affect the results.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=100,
    cv=n_folds,
    random_state=0,
    n_jobs=-1,
)
search_results = random_search.fit(x_train, y_train)

# With the default refit=True, best_estimator_ is the winning candidate
# refitted on the whole of x_train.
best_model = search_results.best_estimator_

In [76]:
# Show the three best-ranked candidates from the search log,
# leaving out the redundant "params" dict column.
(
    pd.DataFrame(search_results.cv_results_)
    .drop(columns="params")
    .sort_values("rank_test_score")
    .head(3)
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_min_impurity_decrease,param_max_features,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
58,0.344991,0.01199,0.002705,0.000644,1.0,500,5,1,0.025,sqrt,...,0.876404,0.865169,0.831461,0.831461,0.797753,0.88764,0.842697,0.837278,0.032564,1
15,0.755456,0.00827,0.003902,0.000295,0.75,1000,2,3,0.001,log2,...,0.865169,0.876404,0.842697,0.842697,0.820225,0.876404,0.842697,0.837266,0.034147,2
29,0.993931,0.076309,0.003394,0.000795,1.0,2000,5,2,0.05,sqrt,...,0.876404,0.865169,0.853933,0.820225,0.820225,0.88764,0.831461,0.836155,0.038911,3


In [77]:
# Pair each training column with the importance assigned by the selected model.
feature_importance = pd.DataFrame(
    {
        "Feature": list(x_train.columns),
        "Importance": best_model.feature_importances_,
    }
)

# Print the eight most important features, highest first.
ranked_importance = feature_importance.sort_values(by="Importance", ascending=False)
print(ranked_importance.head(8))

        Feature  Importance
0           Sex    0.467545
4          Fare    0.081315
7        Class3    0.075819
1           Age    0.053385
13  SameTickets    0.051068
19       Deck_U    0.049969
12      FamSize    0.044974
20       Master    0.040994


In [78]:
# Approximate number of rows that land in each cross-validation fold.
n_rows = x_train.shape[0]
dpf = round(n_rows / n_folds)
print("Datapoint per fold:", dpf)

# Accuracy of the selected model on the data it was fitted on.
train_accuracy = best_model.score(x_train, y_train)
print("Train score: ", round(train_accuracy, 4))

Datapoint per fold: 89
Train score:  0.8575


In [79]:
# Display the full hyperparameter set of the estimator selected by the search
# (searched values plus the defaults that were left untouched).
best_model.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.01,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.025,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_iter_no_change': None,
 'random_state': 0,
 'subsample': 1,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [81]:
# Persisting the tuned estimator is opt-in, so that re-running the notebook
# does not silently overwrite a previously saved model.
SAVE_MODEL = False  # set to True to write the model to disk
filename = 'gradient_boosting_cv10.sav'
model_path = "../models/" + filename
if SAVE_MODEL:
    # joblib.dump returns the list of file names it wrote.
    joblib.dump(best_model, model_path)

['./models/gradient_boosting_cv10.sav']