# Frame the Problem

Predict which passengers survived the sinking of the *Titanic*.

# Install and Load the Libraries

# Load the Data

# Conduct Exploratory Data Analysis

# Prepare the Data

*What about missing values?* 

XGBoost can handle missing values by [default](https://xgboost.readthedocs.io/en/stable/faq.html#how-to-deal-with-missing-values), but rely on this behavior with care.

# Build and Evaluate the Model  

# Inspect the Feature Importance

In [None]:
# %pip install seaborn
# %pip install xgboost
# %pip install -U scikit-learn
# %pip install graphviz

# Restart your kernel after installing.


# or, using the shell escape instead of the %pip magic:

# !pip install seaborn
# !pip install xgboost
# !pip install -U scikit-learn
# !pip install graphviz



# Restart your kernel 


In [None]:
# Encode sex as a binary indicator column: 1 for 'female', 0 for anything else.
titanic['is_female'] = titanic['sex'].eq('female').astype(int)

In [None]:
# Instantiate an XGBClassifier with a fixed random_state.
# NOTE(review): `use_label_encoder` may be deprecated or ignored by newer
# XGBoost releases -- confirm against the installed version.
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    random_state=RANDOM_STATE,
)

# Show the full parameter set of the estimator as the cell output.
xgb_clf.get_params()

In [None]:
# Load the 'titanic' example data set through seaborn.
titanic = sns.load_dataset(name='titanic')

In [None]:
# Columns used as model inputs; everything else in the frame is left out.
keep = ['pclass', 'is_female', 'sibsp', 'parch', 'fare']

# Feature matrix and target vector.
X = titanic[keep]
y = titanic['survived']

In [None]:
# Plot the learning curve: log loss per boosting round for each evaluation set.
evals_result = xgb_clf.evals_result()

# The 'validation_N' keys are read in order: index 0 is labelled as the training
# curve and index 1 as the validation curve below.
train_errors = evals_result['validation_0']['logloss']
validation_errors = evals_result['validation_1']['logloss']

# One row per boosting round, one column per evaluation set.
df = pd.DataFrame(
    {'train': train_errors, 'validation': validation_errors}
).rename_axis('round')

df.plot(title='XGBoost Learning Curve', ylim=(0, 0.7), figsize=(12, 5));

In [None]:
# Preview the first five rows of the raw frame.
titanic.head(5)

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) to spot missing values.
titanic.info()

In [None]:
# Bar chart of how many rows fall into each value of the target column.
titanic['survived'].value_counts().plot.bar();

In [None]:
# Split into train and test sets with scikit-learn: 33% of the rows are held
# out, the split is stratified on the target, and it is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns 
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Single seed reused by the train/test split and the classifier.
RANDOM_STATE = 42

# Default figure size in inches for subsequent plots
# (11.7 x 8.27 -- presumably A4 landscape).
sns.set(rc={"figure.figsize": (11.7, 8.27)})

In [None]:
# Fit the classifier while evaluating on both splits. The order of eval_set
# matters: the training split comes first, the test split second.
xgb_clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=10,
)

# Predicted class labels for the held-out rows.
preds = xgb_clf.predict(X_test)


In [None]:
# 'weight' importance: how many times a feature appears in the trees.
xgb.plot_importance(
    xgb_clf,
    max_num_features=5,
    importance_type='weight',
    title='Feature Importance Based on Weight',
);

In [None]:
# Draw a single tree of the fitted model, laid out left-to-right, on a large canvas.
# NOTE(review): index 99 assumes the model holds at least 100 trees
# (presumably the XGBoost default) -- confirm.
fig, ax = plt.subplots(figsize=(30, 30))
xgb.plot_tree(xgb_clf, ax=ax, rankdir='LR', num_trees=99)
plt.show()

In [None]:
# Report accuracy on the held-out split. The value is interpolated into the
# f-string (the original f-string had no placeholder).
accuracy = xgb_clf.score(X_test, y_test)
print(f'The accuracy score for XGBClassifier is: {accuracy}')

# Confusion matrix with a fixed label order so rows/columns line up with label_names.
cm_array = confusion_matrix(y_test, preds, labels=[0, 1])

label_names = ['did not survive', 'survived']

df_cm = pd.DataFrame(cm_array,
                     index=label_names,
                     columns=label_names)

sns.set(font_scale=1.4)  # for label size
cm_ax = sns.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16})  # font size

# scikit-learn puts the true labels (first argument) on the rows and the
# predictions on the columns; label the axes so the figure stands on its own.
cm_ax.set(xlabel='Predicted', ylabel='Actual', title='XGBClassifier Confusion Matrix');

In [None]:
# 'gain' importance: the average gain of the splits made on each feature.
xgb.plot_importance(
    xgb_clf,
    max_num_features=5,
    importance_type='gain',
    title='Feature Importance Based on Gain',
);