In [None]:
#import WhiteBox Sensitivty function 
from whitebox.eval import WhiteBoxSensitivity
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

In [32]:
#read in the data
df = pd.read_csv('datasets/winequality.csv')

In [34]:
# select the y variable of interest
ydepend = 'Type'

# required - convert all string data to categorical as that is required for all object/string variables being used in WhiteBox
df[df.select_dtypes(['object']).columns] = df.select_dtypes(['object']).apply(lambda x: x.astype('category'))

# the above function is equivalent to running these two lines of code but is provided as it is useful in larger datasets
# df['AlcoholContent'] = pd.Categorical(df['AlcoholContent'])
# df['quality'] = pd.Categorical(df['quality'])

# select only those columns which are categorical in nature and make a copy for modeling
df.select_dtypes(include = ['category'])
model_df = df.copy(deep = True)

# create dummies example using all categorical columns
# this naming convention must be precise for WhiteBox to recognize what dummies are associated with what variables
dummies = pd.concat([pd.get_dummies(model_df.loc[:, col], prefix = col) for col in model_df.select_dtypes(include = ['category']).columns], axis = 1)

# add the dummies to the numeric dataframe for modeling
finaldf = pd.concat([model_df.select_dtypes(include = [np.number]), dummies], axis = 1)

# fit the model using the dummy dataframe
clf = RandomForestClassifier()
clf.fit(finaldf.loc[:, finaldf.columns != ydepend], df.loc[:, ydepend])


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
#run the WhiteBox Sensitivity function and save it locally
WB = WhiteBoxSensitivity(clf,
                    model_df=finaldf,
                    ydepend=ydepend,
                    cat_df=df,
                    featuredict=None,
                    groupbyvars=['AlcoholContent'],
                    aggregate_func=np.mean,
                    verbose=None,
                     std_num=2
                    )

WB.run()

WB.save('./WINEQUALITY_SENSITIVITY_CLASSIFICATION.html')