# Interpreting Existing Models - Examples

In [1]:
# Importing necessary libraries
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix

import shap
import lime.lime_tabular

import warnings
# Silence warnings for the rest of the session: the 'ignore' action is
# installed with no category/module/message restriction.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries imported above -- consider narrowing the filter.
warnings.filterwarnings('ignore')

#### We'll start by importing our dataset below. Unlike last time, this time we'll load it from an external data file.

In [43]:
# Load the asthma workbook from its remote GitHub location (needs network access).
ASTHMA_DATA_URL = 'https://github.com/datawhys/demo-xavier-ai-summit/blob/main/asthma_data.xlsx?raw=true'
df = pd.read_excel(ASTHMA_DATA_URL)

In [44]:
# Boolean mask with one entry per column of `df`: True where the column
# holds at least one missing value.
column_has_nulls = df.isna().any(axis=0).to_numpy()

In [45]:
# Show the names of the columns that contain missing values.
df.columns[column_has_nulls]

Index(['Age of Mother', 'Age of Father', 'Birth Weight',
       'Duration Breast Fed exclusive in weeks',
       'Exposure to House Dust Mites (antibody)',
       'Exposure to Grass Pollen (antibody)', 'Exposure to Egg (antibody)',
       'Exposure to Dairy (antibody)', 'Exposure to Mold and Mildew'],
      dtype='object')

In [46]:
# Boolean mask aligned with `df.columns`: True where the column's dtype is numeric.
column_is_numeric = np.array(list(map(is_numeric_dtype, df.dtypes)))

In [56]:
# Every column whose dtype is not numeric is treated as categorical.
categorical_cols = df.columns[np.invert(column_is_numeric)]
categorical_cols

Index(['Dermatitis to Bronchiale', 'Gender', 'Race', 'Delivery Method',
       'Vaccination Mumps'],
      dtype='object')

In [47]:
# Columns that are numeric AND contain missing values; these are the ones
# that get mean-imputed further down.
numeric_and_null_mask = np.logical_and(column_has_nulls, column_is_numeric)
numeric_cols_with_nulls = df.columns[numeric_and_null_mask]

In [48]:
# Show the numeric columns that contain missing values.
numeric_cols_with_nulls

Index(['Age of Mother', 'Age of Father', 'Birth Weight',
       'Duration Breast Fed exclusive in weeks',
       'Exposure to House Dust Mites (antibody)',
       'Exposure to Grass Pollen (antibody)', 'Exposure to Egg (antibody)',
       'Exposure to Dairy (antibody)', 'Exposure to Mold and Mildew'],
      dtype='object')

In [49]:
# Columns that contain missing values but are NOT numeric; mean imputation
# cannot be applied to these.
non_numeric_and_null_mask = column_has_nulls & ~column_is_numeric
non_numeric_cols_with_nulls = df.columns[non_numeric_and_null_mask]

In [50]:
# Show the non-numeric columns that contain missing values (the recorded
# output below is an empty Index).
non_numeric_cols_with_nulls

Index([], dtype='object')

In [51]:
# Work on an independent copy so the raw frame `df` stays untouched by imputation.
df_imputed_nums = df.copy(deep=True)

In [52]:
# Fill missing numeric values with the per-column mean taken from the raw frame.
# NOTE(review): the means are computed over every row of `df`; if a train/test
# split follows, this leaks test-set statistics into training -- confirm that
# is acceptable for this demo.
column_means = df[numeric_cols_with_nulls].mean()
df_imputed_nums[numeric_cols_with_nulls] = (
    df_imputed_nums[numeric_cols_with_nulls].fillna(value=column_means)
)

In [53]:
# Inspect the frame after mean imputation of the numeric columns.
df_imputed_nums

Unnamed: 0,Dermatitis to Bronchiale,Gender,Race,Age of Mother,Age of Father,Pregnancy Duration,Rank in Family,Delivery Method,Birth Weight,Duration Breast Fed exclusive in weeks,Vaccination Mumps,Exposure to House Dust Mites (antibody),Exposure to Grass Pollen (antibody),Exposure to Egg (antibody),Exposure to Dairy (antibody),Exposure to Mold and Mildew
0,2. Asthma,Male,African/american,40.0,47.0,38,4,Caesarean,2.500,65.000000,No,80.000000,0.790,5.070000,1.400000,80.000000
1,2. Asthma,Female,African/american,17.0,22.0,38,1,Natural,2.800,2.000000,No,80.000000,0.175,29.800000,4.030000,80.000000
2,1. No Asthma,Female,African/american,28.0,30.0,38,2,Natural,2.920,13.000000,No,35.200000,0.175,0.175000,0.175000,35.900000
3,2. Asthma,Male,African/american,17.0,19.0,38,2,Natural,3.325,9.000000,Yes,0.175000,0.520,10.300000,18.500000,21.110000
4,1. No Asthma,Male,African/american,31.0,32.0,40,1,Natural,3.500,4.000000,No,2.610000,0.175,80.000000,44.600000,7.015000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1. No Asthma,Female,Other/mixed race,34.0,38.0,38,2,Natural,3.750,19.000000,Yes,1.620000,0.175,1.030000,3.070000,2.320000
396,1. No Asthma,Male,Other/mixed race,21.0,24.0,40,2,Natural,3.240,17.041916,No,38.000000,0.175,6.580000,0.460000,20.550667
397,2. Asthma,Female,Other/mixed race,24.0,29.0,41,1,Natural,3.700,13.000000,Yes,15.670151,0.175,0.175000,0.175000,20.550667
398,1. No Asthma,Male,Other/mixed race,27.0,29.0,36,1,Caesarean,2.220,17.041916,No,1.490000,0.175,0.175000,5.830684,20.550667


In [54]:
# One-hot encoder for the categorical columns; categories='auto' asks the
# encoder to determine the category values itself rather than taking a list.
ohe = OneHotEncoder(categories='auto')

In [57]:
# Fit the encoder on the categorical columns and encode them in one step.
# NOTE(review): `cat_feats_encoded` is not referenced by any other cell visible
# here -- `df_prepared` is built with pd.get_dummies instead. Confirm whether
# this result (and the fitted `ohe`) is used further down.
cat_feats_encoded = ohe.fit_transform(df_imputed_nums[categorical_cols])

In [62]:
# Expand each categorical column into indicator (dummy) columns; numeric
# columns are carried over unchanged.
# NOTE(review): `categorical_cols` includes 'Dermatitis to Bronchiale', which
# looks like the label ('1. No Asthma' / '2. Asthma'), so it is expanded into
# indicator columns as well -- confirm that is intended.
df_prepared = pd.get_dummies(
    data=df_imputed_nums,
    columns=categorical_cols,
)

In [65]:
# Inspect the model-ready frame (imputed numerics plus dummy-encoded categoricals).
df_prepared

Unnamed: 0,Age of Mother,Age of Father,Pregnancy Duration,Rank in Family,Birth Weight,Duration Breast Fed exclusive in weeks,Exposure to House Dust Mites (antibody),Exposure to Grass Pollen (antibody),Exposure to Egg (antibody),Exposure to Dairy (antibody),...,Gender_Female,Gender_Male,Race_African/american,Race_Asian/pacific islander,Race_Caucasian,Race_Other/mixed race,Delivery Method_Caesarean,Delivery Method_Natural,Vaccination Mumps_No,Vaccination Mumps_Yes
0,40.0,47.0,38,4,2.500,65.000000,80.000000,0.790,5.070000,1.400000,...,0,1,1,0,0,0,1,0,1,0
1,17.0,22.0,38,1,2.800,2.000000,80.000000,0.175,29.800000,4.030000,...,1,0,1,0,0,0,0,1,1,0
2,28.0,30.0,38,2,2.920,13.000000,35.200000,0.175,0.175000,0.175000,...,1,0,1,0,0,0,0,1,1,0
3,17.0,19.0,38,2,3.325,9.000000,0.175000,0.520,10.300000,18.500000,...,0,1,1,0,0,0,0,1,0,1
4,31.0,32.0,40,1,3.500,4.000000,2.610000,0.175,80.000000,44.600000,...,0,1,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,34.0,38.0,38,2,3.750,19.000000,1.620000,0.175,1.030000,3.070000,...,1,0,0,0,0,1,0,1,0,1
396,21.0,24.0,40,2,3.240,17.041916,38.000000,0.175,6.580000,0.460000,...,0,1,0,0,0,1,0,1,1,0
397,24.0,29.0,41,1,3.700,13.000000,15.670151,0.175,0.175000,0.175000,...,1,0,0,0,0,1,0,1,0,1
398,27.0,29.0,36,1,2.220,17.041916,1.490000,0.175,0.175000,5.830684,...,0,1,0,0,0,1,1,0,1,0


In [64]:
# NOTE(review): this repeats the display from the preceding cell, and its
# execution count (64) is lower than that cell's (65) -- likely a leftover.
df_prepared

Unnamed: 0,Age of Mother,Age of Father,Pregnancy Duration,Rank in Family,Birth Weight,Duration Breast Fed exclusive in weeks,Exposure to House Dust Mites (antibody),Exposure to Grass Pollen (antibody),Exposure to Egg (antibody),Exposure to Dairy (antibody),...,Gender_Female,Gender_Male,Race_African/american,Race_Asian/pacific islander,Race_Caucasian,Race_Other/mixed race,Delivery Method_Caesarean,Delivery Method_Natural,Vaccination Mumps_No,Vaccination Mumps_Yes
0,40.0,47.0,38,4,2.500,65.000000,80.000000,0.790,5.070000,1.400000,...,0,1,1,0,0,0,1,0,1,0
1,17.0,22.0,38,1,2.800,2.000000,80.000000,0.175,29.800000,4.030000,...,1,0,1,0,0,0,0,1,1,0
2,28.0,30.0,38,2,2.920,13.000000,35.200000,0.175,0.175000,0.175000,...,1,0,1,0,0,0,0,1,1,0
3,17.0,19.0,38,2,3.325,9.000000,0.175000,0.520,10.300000,18.500000,...,0,1,1,0,0,0,0,1,0,1
4,31.0,32.0,40,1,3.500,4.000000,2.610000,0.175,80.000000,44.600000,...,0,1,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,34.0,38.0,38,2,3.750,19.000000,1.620000,0.175,1.030000,3.070000,...,1,0,0,0,0,1,0,1,0,1
396,21.0,24.0,40,2,3.240,17.041916,38.000000,0.175,6.580000,0.460000,...,0,1,0,0,0,1,0,1,1,0
397,24.0,29.0,41,1,3.700,13.000000,15.670151,0.175,0.175000,0.175000,...,1,0,0,0,0,1,0,1,0,1
398,27.0,29.0,36,1,2.220,17.041916,1.490000,0.175,0.175000,5.830684,...,0,1,0,0,0,1,1,0,1,0
