In [1]:
# libraries
import csv

import numpy as np
import pandas as pd
from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

## 1. The Dataset: Titanic data

#### The Titanic dataset is a classification dataset: it is used for a prediction task where the goal is to determine whether a person survived the 1912 shipwreck of the RMS Titanic. The list of attributes is as follows:

- output variable: **Survived**, 0 = did not survive, 1 = survived
- input features:
    - **Pclass**: ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
    - **Sex**: male or female
    - **Age**: quantitative continuous variable
    - **SibSp**: # of siblings / spouses aboard the Titanic
    - **Parch**: # of parents / children aboard the Titanic
    - **TotalFamily**: # of total family members (including self) on the trip
    - **Ticket**: ticket number
    - **Fare**: passenger fare
    - **Cabin**: cabin number
    - **Embarked**: port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [2]:
# Load the training split; every missing value is first turned into an empty string
df = pd.read_csv("Data/titanic_train.csv").fillna('')

# Passengers with no recorded cabin / port get an explicit "Unknown" category
df["Cabin"] = df["Cabin"].mask(df["Cabin"] == "", "Unknown")
df["Embarked"] = df["Embarked"].mask(df["Embarked"] == "", "Unknown")

# Ticket class is stored as text rather than as a number
df["Pclass"] = df["Pclass"].map(str)

# Columns fed to the model, and the column it has to predict
train_cols = [
    'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
    'Parch', 'TotalFamily', 'Ticket', 'Fare', 'Cabin',
]
label = "Survived"

X_df = df[train_cols]
y_df = df[label]

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,TotalFamily,Ticket,Fare,Cabin,Embarked
0,673,0,2,"Mitchell, Mr. Henry Michael",male,70.0,0,0,1,C.A. 24580,10.5,Unknown,S
1,733,0,2,"Knight, Mr. Robert J",male,30.0,0,0,1,239855,0.0,Unknown,S
2,700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42.0,0,0,1,348121,7.65,F G63,S
3,728,1,3,"Mannion, Miss. Margareth",female,30.0,0,0,1,36866,7.7375,Unknown,Q
4,113,0,3,"Barton, Mr. David John",male,22.0,0,0,1,324669,8.05,Unknown,S


## 2. The Model: GAMs

In [3]:
# Train a GAM (Explainable Boosting Machine) on the training features.
# ExplainableBoostingClassifier is imported in the notebook's top import cell.
ebm = ExplainableBoostingClassifier()

# Left as the last expression so the fitted estimator's repr is shown as the cell output
ebm.fit(X_df, y_df)

ExplainableBoostingClassifier(feature_names=['PassengerId', 'Pclass', 'Name',
                                             'Sex', 'Age', 'SibSp', 'Parch',
                                             'TotalFamily', 'Ticket', 'Fare',
                                             'Cabin', 'Pclass x Sex',
                                             'Sex x Age', 'Sex x Ticket',
                                             'Sex x Fare', 'Name x Fare',
                                             'Age x Ticket',
                                             'PassengerId x Name',
                                             'PassengerId x Sex', 'Age x SibSp',
                                             'Age x TotalFamily'],
                              feature_types=['continuous', 'categorical',
                                             'categorical', 'categorical',
                                             'continuous', 'continuous',
                                             'conti

In [4]:
# Load the test split and apply the same clean-up steps as for the training data
test_df = pd.read_csv("Data/titanic_test.csv").fillna('')

# Missing cabin / port of embarkation -> explicit "Unknown" category
test_df["Cabin"] = test_df["Cabin"].mask(test_df["Cabin"] == "", "Unknown")
test_df["Embarked"] = test_df["Embarked"].mask(test_df["Embarked"] == "", "Unknown")

# Ticket class is stored as text rather than as a number
test_df["Pclass"] = test_df["Pclass"].map(str)

# Same feature columns and label column as the training split
test_X_df = test_df[train_cols]
test_y_df = test_df[label]

test_X_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,TotalFamily,Ticket,Fare,Cabin
0,218,2,"Jacobsohn, Mr. Sidney Samuel",male,42.0,1,0,2,243847,27.0,Unknown
1,763,3,"Barah, Mr. Hanna Assi",male,30.0,0,0,1,2663,7.2292,Unknown
2,282,3,"Olsson, Mr. Nils Johan Goransson",male,28.0,0,0,1,347464,7.8542,Unknown
3,129,3,"Peter, Miss. Anna",female,30.0,1,1,3,2668,22.3583,F E69
4,441,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,30.0,1,1,3,F.C.C. 13529,26.25,Unknown


In [5]:
# Turn the model output into hard 0/1 labels: values above 0.5 become 1, the rest 0
train_pred = np.where(np.asarray(ebm.predict(X_df)) <= 0.5, 0, 1).tolist()
predictions = np.where(np.asarray(ebm.predict(test_X_df)) <= 0.5, 0, 1).tolist()

# Element-wise agreement between the labels and the ground truth
train_hits = np.asarray(train_pred) == y_df.to_numpy()
test_hits = np.asarray(predictions) == test_y_df.to_numpy()

# Accuracy = matching rows / all rows, rounded to 5 decimals
accuracy_train = round(train_hits.sum() / len(train_pred), 5)
accuracy_test = round(test_hits.sum() / len(predictions), 5)

print("The accuracy of the model on the training set is: ", accuracy_train)
print("The accuracy of the model on the test set is: ", accuracy_test)

# Positions (0-based, for use with .iloc) of the test rows where label and truth disagree
indices = np.flatnonzero(~test_hits).tolist()

The accuracy of the model on the training set is:  0.88955
The accuracy of the model on the test set is:  0.82353


### Global feature importance plot and individual feature partial dependence plots (PDP)

In [6]:
# Build the global explanation of the fitted EBM and render it.
# `show` is imported in the notebook's top import cell.
ebm_global = ebm.explain_global()
show(ebm_global)

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt


### Local explanations

In [23]:
# Local (per-row) explanations for every row of the test set
ebm_local = ebm.explain_local(test_X_df, test_y_df, name='EBM')
show(ebm_local)

#### Local misclassifications

In [24]:
# Keep only the test rows at the positions listed in `indices`
wrong_X = test_X_df.iloc[indices]
wrong_y = test_y_df.iloc[indices]

# Local explanations restricted to those rows
ebm_local = ebm.explain_local(wrong_X, wrong_y, name='EBM')
show(ebm_local)

## Predicted and Actual values are both 1 (for what-if question)

In [115]:
# Positions of the test rows where the predicted label and the true label are both 1
pred_1 = [
    pos
    for pos, (predicted, actual) in enumerate(zip(predictions, test_y_df.tolist()))
    if predicted == 1 and actual == 1
]

In [116]:
# To find a good what-if question: browse the local explanations of the rows
# where prediction and truth are both 1.
# The original referenced `pred_0`, which is not defined anywhere in the notebook
# and raises NameError on a fresh kernel; `pred_1` is the list built for this section.
ebm_local = ebm.explain_local(test_X_df.iloc[pred_1], test_y_df.iloc[pred_1], name = 'EBM')
show(ebm_local)

In [117]:
# Original datapoint: one test row taken from pred_1 (predicted 1, actual 1).
# 52 is a hand-picked position inside pred_1; it must be smaller than len(pred_1).
index = pred_1[52]
print(test_X_df.iloc[index])
# Print the model's own label so the "Prediction" caption is accurate
# (test_y_df holds the ground-truth label, not the prediction).
print("Prediction: " + str(predictions[index]))
# orig = ebm.explain_local(test_X_df[index:index+1], test_y_df[index:index+1], name='EBM')
# show(orig)

PassengerId                              249
Pclass                                     1
Name           Beckwith, Mr. Richard Leonard
Sex                                     male
Age                                     37.0
SibSp                                      1
Parch                                      1
TotalFamily                                3
Ticket                                 11751
Fare                                 52.5542
Cabin                                    D35
Name: 191, dtype: object
Prediction: 1


In [118]:
# Updated datapoint: take an independent copy of the selected row so that the
# what-if edits below neither modify test_X_df nor trigger SettingWithCopyWarning.
test = test_X_df.iloc[index].copy()
test_y = test_y_df.iloc[index]

# Uncomment / adjust any of these assignments to pose a what-if question.
# Pclass was converted to a string when the data was loaded, so keep it a string here.
# test["Sex"] = "female"
# test["Pclass"] = "2"
# test["Age"] = 70
# test["Fare"] = 10
# test["TotalFamily"] = 1

# The model expects a 2-D input: turn the single row (a Series) into a one-row DataFrame
test = test.to_frame().T

# Predict the (possibly modified) passenger and show the matching local explanation
new_y = ebm.predict(test)
print(new_y)
new = ebm.explain_local(test, pd.Series(new_y), name='EBM')
show(new)

[1]




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

