In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier, XGBRegressor

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# Load the preprocessed Titanic table, using the first CSV column as the row index.
titanic_df = pd.read_csv(
    "./data/titanic_transformed.csv",
    index_col=0,
)

In [3]:
# Preview the first rows to check that the columns loaded as expected.
titanic_df.head()

Unnamed: 0,Survived,Fare,Is_male,FamilySize,Pclass_2,Pclass_3,Sex_male,Age_Adult,Age_Elder,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0,7.25,1,1,0,1,1,1,0,0,1,0,1,0,0
1,1,71.2833,0,1,0,0,0,1,0,0,0,0,0,1,0
2,1,7.925,0,0,0,1,0,1,0,0,1,1,0,0,0
3,1,53.1,0,1,0,0,0,1,0,0,1,0,0,1,0
4,0,8.05,1,0,0,1,1,1,0,0,1,0,1,0,0


In [4]:
# Summarise each column's dtype and non-null count.
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 15 columns):
Survived       891 non-null int64
Fare           891 non-null float64
Is_male        891 non-null int64
FamilySize     891 non-null int64
Pclass_2       891 non-null int64
Pclass_3       891 non-null int64
Sex_male       891 non-null int64
Age_Adult      891 non-null int64
Age_Elder      891 non-null int64
Embarked_Q     891 non-null int64
Embarked_S     891 non-null int64
Title_Miss     891 non-null int64
Title_Mr       891 non-null int64
Title_Mrs      891 non-null int64
Title_Other    891 non-null int64
dtypes: float64(1), int64(14)
memory usage: 111.4 KB


In [5]:
# Split the table into predictors and label:
#   X - every column except "Survived"
#   y - the "Survived" column
# NOTE(review): Is_male and Sex_male look identical in the preview rows -
# confirm whether one of them is redundant.
X = titanic_df.drop(columns="Survived")
y = titanic_df["Survived"]

In [6]:
# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(891, 14) (891,)


**Creating the DMatrix**

In [7]:
# Wrap the full feature table and labels in a DMatrix; xgb.cv consumes it later on.
titanic_dmatrix = xgb.DMatrix(
    data=X,
    label=y,
)

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [9]:
# Configure the classifier (logistic objective, 10 boosting rounds, fixed seed): xg_cl
xg_cl = XGBClassifier(
    objective="binary:logistic",
    n_estimators=10,
    seed=123,
)

Fitting the classifier to the training set and predicting the labels of the test set

In [14]:
# Train on the training split, then predict labels for the held-out test rows.
# fit() returns the fitted estimator, so the two calls can be chained.
preds = xg_cl.fit(X_train, y_train).predict(X_test)

In [24]:
# Accuracy = number of test rows predicted correctly / number of test rows.
accuracy = float((preds == y_test).sum()) / len(y_test)
print("accuracy: %f" % accuracy)

accuracy: 0.854749


## Measuring the accuracy with cross-validation

Creating the parameter dictionary and running 3-fold cross-validation

In [15]:
# Booster parameters shared by the cross-validation runs in this notebook.
# The task is binary classification, so use the "binary:logistic" objective
# (the same one given to XGBClassifier) rather than the regression objective
# "reg:logistic".
# NOTE(review): `silent` was superseded by `verbosity` in later XGBoost
# releases - confirm against the installed version before changing it.
params = {"objective": "binary:logistic", "max_depth": 3, "silent": 1}

# 3-fold cross-validation on the full DMatrix for 5 boosting rounds,
# reporting the classification error of every round as a DataFrame.
cv_results = xgb.cv(dtrain=titanic_dmatrix, params=params, nfold=3, num_boost_round=5,
                    metrics="error", as_pandas=True, seed=123)

Let's see the results

In [17]:
# Display the per-round train/test error (mean and std across folds).
cv_results

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.171156,0.002861,0.176206,0.032798
1,0.155444,0.011111,0.18743,0.021354
2,0.15376,0.011691,0.179573,0.022221
3,0.156005,0.010677,0.177329,0.023702
4,0.151515,0.01091,0.177329,0.019504


In [20]:
# Cross-validated accuracy = 1 - mean test error of the final boosting round.
print(1 - cv_results["test-error-mean"].iloc[-1])

0.8226713333333333


Let's measure the AUC

In [21]:
# Run the same 3-fold, 5-round cross-validation again, scored by AUC this time.
# This overwrites the error-based table previously stored in cv_results.
cv_results = xgb.cv(
    dtrain=titanic_dmatrix,
    params=params,
    nfold=3,
    num_boost_round=5,
    metrics="auc",
    as_pandas=True,
    seed=123,
)

In [22]:
# Display the per-round train/test AUC (mean and std across folds).
cv_results

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.870051,0.005638,0.851717,0.052519
1,0.879054,0.011074,0.862746,0.034209
2,0.884327,0.007992,0.863182,0.035906
3,0.887519,0.007608,0.864,0.033497
4,0.889614,0.007627,0.864276,0.031572


In [23]:
# Report the mean test AUC reached after the final boosting round.
print(cv_results["test-auc-mean"].iloc[-1])

0.8642759999999999
