In [1]:
# libraries
import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

## Titanic data set

In [2]:
df = pd.read_csv("titanic-train.csv", na_values=['none'])
df.fillna('', inplace=True)
df["Age"] = df["Age"].apply(lambda x: 0 if x == "" else x)
df["Cabin"] = df["Cabin"].apply(lambda x: "Unknown" if x == "" else x)
df["Embarked"] = df["Embarked"].apply(lambda x: "Unknown" if x == "" else x)
df["Pclass"] = df["Pclass"].apply(lambda x: str(x))

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,Unknown,S


In [8]:
train_cols = [df.columns[0]] + list(df.columns[2:])
label = df.columns[1]
X_df = df[train_cols]
y_df = df[label]

dataset = {
    'X': X_df,
    'y': y_df
}

print(X_df)

     PassengerId Pclass                                               Name  \
0              1      3                            Braund, Mr. Owen Harris   
1              2      1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2              3      3                             Heikkinen, Miss. Laina   
3              4      1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4              5      3                           Allen, Mr. William Henry   
..           ...    ...                                                ...   
886          887      2                              Montvila, Rev. Juozas   
887          888      1                       Graham, Miss. Margaret Edith   
888          889      3           Johnston, Miss. Catherine Helen "Carrie"   
889          890      1                              Behr, Mr. Karl Howell   
890          891      3                                Dooley, Mr. Patrick   

        Sex   Age  SibSp  Parch            Ticket     Fare    C

### GAMs model

In [4]:
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(dataset['X'], dataset['y'])

ExplainableBoostingClassifier(feature_names=['PassengerId', 'Pclass', 'Name',
                                             'Sex', 'Age', 'SibSp', 'Parch',
                                             'Ticket', 'Fare', 'Cabin',
                                             'Embarked', 'Pclass x Sex',
                                             'Name x Ticket', 'Ticket x Fare',
                                             'Name x Fare', 'Sex x Ticket',
                                             'Sex x Age', 'Name x Age',
                                             'Sex x Parch', 'Age x SibSp',
                                             'SibSp x Ticket'],
                              feature_types=['continuous', 'categorical',
                                             'categorical', 'categorical',
                                             'continuous', 'continuous',
                                             'continuous', 'categorical',
                                      

In [13]:
#Training accuracy
train_pred = ebm.predict(dataset['X']).tolist()
train_pred = [0 if x<=0.5 else 1 for x in train_pred]
accuracy_train = round(sum(train_pred == dataset['y']) / len(train_pred), 5)

predictions = ebm.predict(X_df).tolist()

print("The accuracy of the model on the training set is: ", accuracy_train)

The accuracy of the model on the training set is:  0.90572


In [14]:
from interpret import show 

ebm_global = ebm.explain_global()
show(ebm_global)

## Local Feature Importance (EBM Model)

In [17]:
incorrect_indices = [index for index, value in enumerate(zip(predictions, y_df.tolist())) if value[0] != value[1]]
test_indices = incorrect_indices[:10]

ebm_local = ebm.explain_local(X_df, y_df, name='EBM')
show(ebm_local)