## WhiteBox Error and Sensitivity Analysis on Wine Quality Data

Goals of tutorial include:
* [Importing wine quality dataset](#wine_quality)
* [Handling categorical features inherent within data by using dummy variables and pandas categorical datatype](#categorical)
* [Build model](#model)
* [Deploying WhiteBoxError graphics](#wbox_error)
* [Deploying WhiteBoxSensitivity graphics](#wbox_sensitivity)

In [1]:
# remove when actual package
%load_ext autoreload

%autoreload 2

import os
import sys
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

module_path = os.path.abspath('/projects/us_eminence/WhiteBox_Production')
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from whitebox import utils
from whitebox.WhiteBox import WhiteBoxError, WhiteBoxSensitivity

### Import wine quality dataset <a id=wine_quality><a>
Perform basic exploratory data analysis to better understand what types of columns are available

In [3]:
df = pd.read_csv('./datasets/winequality.csv')

In [4]:
df.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,quality,Type,AlcoholContent
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,5,Red,Low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,5,Red,Low
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,5,Red,Low
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,6,Red,Low
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,5,Red,Low


In [5]:
df.dtypes # it looks like most of our columns are numeric, with the exception of Type and AlcoholContent

fixed.acidity           float64
volatile.acidity        float64
citric.acid             float64
residual.sugar          float64
chlorides               float64
free.sulfur.dioxide     float64
total.sulfur.dioxide    float64
density                 float64
pH                      float64
sulphates               float64
quality                   int64
Type                     object
AlcoholContent           object
dtype: object

In [6]:
df.groupby('AlcoholContent')['fixed.acidity'].count() # most of our data resides in low/medium alcohol content

AlcoholContent
High       852
Low       2832
Medium    2813
Name: fixed.acidity, dtype: int64

In [7]:
df.groupby('Type')['fixed.acidity'].count() # and most of our data is white wine

Type
Red      1599
White    4898
Name: fixed.acidity, dtype: int64

### Handling categorical data <a id=categorical><a>

In [8]:
# we need to convert the categorical columns to the special pandas categorical datatype

# first need to identify these string columns from before
string_cols = df.select_dtypes(include = ['O'])

# convert these into categorical datatype
for cat in string_cols:
    df[cat] = pd.Categorical(df[cat])

In [9]:
df.dtypes # great - now they are categorical datatypes

fixed.acidity            float64
volatile.acidity         float64
citric.acid              float64
residual.sugar           float64
chlorides                float64
free.sulfur.dioxide      float64
total.sulfur.dioxide     float64
density                  float64
pH                       float64
sulphates                float64
quality                    int64
Type                    category
AlcoholContent          category
dtype: object

### Build model <a id=model><a>

In [10]:
# we need to define what our dependent variable is
ydepend = 'quality' # we are going to treat quality as a continuous variable for the purposes of the tutorial

# subset the training data
x_train = df.loc[:, df.columns != ydepend].copy(deep = True)

# convert these into their categorical codes using utility function convert_categorical_indpendent
x_train = utils.convert_categorical_independent(x_train)

# pull out the dependent variable column
y_train = df.loc[:, ydepend]

In [11]:
# build the model
modelobj = RandomForestRegressor()

modelobj.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [12]:
df.head(1)

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,quality,Type,AlcoholContent
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,5,Red,Low


### Create WhiteBoxError <a id=wbox_error><a>

In [29]:
# specify featuredict as a subset of columns we want to focus on
featuredict = {'fixed.acidity': 'FIXED ACIDITY',
               'quality': 'SUPERQUALITY',
               'ALL DATA' : "ALL DATA",
               'AlcoholContent': 'AC',
               'sulphates': 'SULPHATES',
              'volatile.acidity': 'VOLATILE ACIDITY',
              'residual.sugar': 'RESIDUAL SUGAR',
              'free.sulfur.dioxide': 'FREE SULFUR DIOXIDE'}

# hack it for one groupby variable
df['ALL DATA'] = 'ALL DATA'

# specify the groupby variables
groupbyvars = ['ALL DATA']

# instantiate wbox error
WB = WhiteBoxError(modelobj = modelobj,
                   model_df = x_train,
                   ydepend= ydepend,
                   cat_df = df,
                   groupbyvars = groupbyvars,
                   featuredict = featuredict,
                   verbose=None)

In [30]:
# run wbox error
WB.run()

[{'Type': 'Continuous', 'Data': [{'groupByValue': 'ALL DATA', 'errNeg': -0.31111111111111106, 'errPos': 0.28064516129032246, 'FIXED ACIDITY': 5.1, 'predictedYSmooth': 6.1255555555555565, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2826086956521739, 'errPos': 0.24642857142857136, 'FIXED ACIDITY': 5.3, 'predictedYSmooth': 6.206153846153847, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2374999999999999, 'errPos': 0.2999999999999999, 'FIXED ACIDITY': 5.5, 'predictedYSmooth': 6.2046153846153835, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2085714285714285, 'errPos': 0.32592592592592584, 'FIXED ACIDITY': 5.6, 'predictedYSmooth': 6.06470588235294, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.22857142857142848, 'errPos': 0.22962962962962963, 'FIXED ACIDITY': 5.7, 'predictedYSmooth': 5.908888888888888, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.25714285

[{'Type': 'Continuous', 'Data': [{'groupByValue': 'ALL DATA', 'errNeg': -0.29999999999999993, 'errPos': 0.271875, 'predictedYSmooth': 6.00722891566265, 'SULPHATES': 0.3, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.23333333333333325, 'errPos': 0.31142857142857144, 'predictedYSmooth': 6.050561797752809, 'SULPHATES': 0.32, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2863636363636363, 'errPos': 0.2937499999999999, 'predictedYSmooth': 6.173333333333334, 'SULPHATES': 0.33, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.25862068965517243, 'errPos': 0.35294117647058826, 'predictedYSmooth': 5.875000000000001, 'SULPHATES': 0.34, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.20999999999999988, 'errPos': 0.33999999999999997, 'predictedYSmooth': 5.7341176470588255, 'SULPHATES': 0.35, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.291111111111111, 'errPos': 0.29615

[{'Type': 'Continuous', 'Data': [{'groupByValue': 'ALL DATA', 'errNeg': -0.21999999999999986, 'errPos': 0.2562499999999998, 'predictedYSmooth': 6.374647887323944, 'VOLATILE ACIDITY': 0.12, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.26499999999999985, 'errPos': 0.26578947368421035, 'predictedYSmooth': 6.091346153846155, 'VOLATILE ACIDITY': 0.14, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.276470588235294, 'errPos': 0.27419354838709675, 'predictedYSmooth': 6.26195652173913, 'VOLATILE ACIDITY': 0.15, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2288888888888887, 'errPos': 0.2794871794871794, 'predictedYSmooth': 6.145945945945945, 'VOLATILE ACIDITY': 0.16, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.24791666666666648, 'errPos': 0.27441860465116275, 'predictedYSmooth': 6.097887323943662, 'VOLATILE ACIDITY': 0.17, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'err

[{'Type': 'Continuous', 'Data': [{'groupByValue': 'ALL DATA', 'errNeg': -0.27599999999999997, 'errPos': 0.2828571428571427, 'predictedYSmooth': 5.453333333333333, 'RESIDUAL SUGAR': 0.9, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2806451612903226, 'errPos': 0.24791666666666648, 'predictedYSmooth': 5.71340206185567, 'RESIDUAL SUGAR': 1.0, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2785714285714284, 'errPos': 0.26111111111111107, 'predictedYSmooth': 5.901360544217685, 'RESIDUAL SUGAR': 1.1, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2904761904761904, 'errPos': 0.30506329113924063, 'predictedYSmooth': 5.807070707070704, 'RESIDUAL SUGAR': 1.2, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.09999999999999964, 'errPos': 0.20000000000000018, 'predictedYSmooth': 6.0, 'RESIDUAL SUGAR': 1.25, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2439999999999999, 

[{'Type': 'Continuous', 'Data': [{'groupByValue': 'ALL DATA', 'errNeg': -0.2717948717948718, 'errPos': 0.3270833333333332, 'FREE SULFUR DIOXIDE': 4.0, 'predictedYSmooth': 5.587068965517243, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.221951219512195, 'errPos': 0.3390624999999999, 'FREE SULFUR DIOXIDE': 5.0, 'predictedYSmooth': 5.446511627906979, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2422535211267604, 'errPos': 0.2970149253731342, 'FREE SULFUR DIOXIDE': 6.0, 'predictedYSmooth': 5.705847953216372, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.2972972972972972, 'errPos': 0.24736842105263146, 'FREE SULFUR DIOXIDE': 7.0, 'predictedYSmooth': 5.556250000000001, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA', 'errNeg': -0.21666666666666648, 'errPos': 0.31351351351351353, 'FREE SULFUR DIOXIDE': 8.0, 'predictedYSmooth': 5.50989010989011, 'groupByVarName': 'ALL DATA'}, {'groupByValue': 'ALL DATA

In [31]:
len(WB.outputs)

7

In [32]:
WB.save('../output/example_winequality_error_nogroup.html')

### WhiteBox Sensitivity Analysis <a id=wbox_sensitivity><a>

In [None]:
# whitebox sensitivity behaves very similarly to WhiteBox Error

# specify your dependent variable
ydepend = 'quality'


# specify groupby variables
groupbyVars = ['Type']

# we need to create dummy variables to enhance our model further
dummydf = df.copy(deep=True)

# create dummies example using all categorical columns
dummies = pd.concat([pd.get_dummies(dummydf.loc[:, col], prefix = col) for col in dummydf.select_dtypes(include = ['category']).columns], axis = 1)
finaldf = pd.concat([dummydf.select_dtypes(include = [np.number]), dummies], axis = 1)



# create train dataset for fitting model
xtrain = finaldf.loc[:, finaldf.columns != ydepend].copy(deep = True)
# create dependent variable dataset
ytrain = finaldf.loc[:, ydepend]

# fit the model
modelobj.fit(xtrain, ytrain)

In [None]:

# specify featuredict as a subset of columns we want to focus on
# specify featuredict as a subset of columns we want to focus on
featuredict = {'fixed.acidity': 'FIXED ACIDITY',
               'Type': 'TYPE',
               'quality': 'SUPERQUALITY',
               'AlcoholContent': 'AC',
               'sulphates': 'SULPHATES',
              'volatile.acidity': 'VOLATILE ACIDITY',
              'residual.sugar': 'RESIDUAL SUGAR',
              'free.sulfur.dioxide': 'FREE SULFUR DIOXIDE'}

# instantiate whitebox sensitivity
WB = WhiteBoxSensitivity(modelobj = modelobj,
                   model_df = finaldf,
                   ydepend= ydepend,
                   cat_df = df,
                   groupbyvars = groupbyvars,
                   featuredict = featuredict,
                    verbose=None)
# run
WB.run()

In [None]:
# save the final outputs to disk
WB.save(fpath = '../output/example_winequality_sensitivity.html')