In [1]:
import os
import sys
sys.path.append('../../')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from matplotlib.lines import Line2D
from statsmodels.stats.multitest import multipletests
import itertools
from sklearn.utils import resample

In [3]:
from facct22.utils import get_db_con

from facct22.analysis_functions import (
    get_all_decisions, groups, schemas, users, dps, dt, pdr, group_order, colors, 
    ttests_operational_metrics, assign_conf_mat_cell, _modify_value_and_time
)

In [4]:
cred_file = '../../conf/credentials.yaml'
engine = get_db_con(cred_file)

#### Fetch Decisions

In [5]:
all_decisions = get_all_decisions(engine, schemas, users, groups)

#### Decision Time

Let's consider the decision time as the output variable and regres it on the experiment particulars

In [6]:
regression_features = ['user_name', 'group', 'trx_amnt', 'decision_time']

all_decisions[regression_features].head()

Unnamed: 0,user_name,group,trx_amnt,decision_time
0,j,Control-B,206.86,24
1,n,Irrelevant,102.69,126
2,n,Control-A,88.78,61
3,j,Control-B,137.94,8
4,t,Irrelevant,190.0,39


In [8]:
dummified = pd.get_dummies(all_decisions[regression_features])

In [9]:
y = dummified['decision_time']
X = dummified.drop(columns='decision_time')

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [14]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.1)

In [20]:
scaler = MinMaxScaler().fit(train_X)
train_X_scaled = scaler.transform(train_X)
valid_X_scaled = scaler.transform(valid_X)

  return self.partial_fit(X, y)


#### Unregularized Linear Regression

In [142]:
mean_absolute_error(valid_y.values, pred_y) = LinearRegression().fit(train_X_scaled, train_y)

In [143]:
feature_importance = dict()
for i in range(mod.coef_.shape[0]):
    feature_importance[X.columns[i]] = mod.coef_[i]
feature_importance

{'trx_amnt': 28.40032921729359,
 'user_name_j': 486553560381134.5,
 'user_name_n': 486553560381184.3,
 'user_name_t': 486553560381158.25,
 'group_Control-A': 35728577007598.69,
 'group_Control-B': 35728577007592.664,
 'group_Irrelevant': 35728577007607.0,
 'group_LIME': 35728577007596.516,
 'group_Random': 35728577007603.28,
 'group_TreeInt': 35728577007602.06,
 'group_TreeSHAP': 35728577007601.91}

In [144]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

8337.54992691644

In [145]:
mean_absolute_error(valid_y.values, pred_y)

6027.333452722063

#### L2

In [152]:
mod = Ridge(alpha=10000).fit(train_X_scaled, train_y)

In [153]:
feature_importance = dict()
for i in range(mod.coef_.shape[0]):
    feature_importance[X.columns[i]] = mod.coef_[i]
pd.Series(feature_importance).sort_values(ascending=False)

user_name_n         2.413716
group_Irrelevant    0.296492
group_Random        0.141180
group_TreeInt       0.070201
group_TreeSHAP      0.068872
trx_amnt            0.021260
group_Control-A    -0.069834
user_name_t        -0.094541
group_LIME         -0.153972
group_Control-B    -0.352940
user_name_j        -2.319175
dtype: float64

In [154]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

43.164987981081985

In [155]:
mean_absolute_error(valid_y.values, pred_y)

30.717480999793867

#### L1

In [162]:
mod = Lasso(alpha=1).fit(train_X_scaled, train_y)

In [163]:
feature_importance = dict()
for i in range(mod.coef_.shape[0]):
    feature_importance[X.columns[i]] = mod.coef_[i]
pd.Series(feature_importance).sort_values(ascending=False)

user_name_n         23.382283
group_TreeSHAP       0.000000
group_TreeInt        0.000000
group_Random         0.000000
group_LIME          -0.000000
group_Irrelevant     0.000000
group_Control-A     -0.000000
user_name_t         -0.000000
trx_amnt             0.000000
group_Control-B     -1.039182
user_name_j        -20.620689
dtype: float64

In [148]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

38.88432335089966

In [149]:
mean_absolute_error(valid_y.values, pred_y)

25.56658061591904

#### Decision Tree

In [179]:
mod = DecisionTreeRegressor(criterion='mse', max_depth=None, min_samples_split=20)
mod.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=20, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [180]:
feature_importance = dict()
for i in range(mod.feature_importances_.shape[0]):
    feature_importance[X.columns[i]] = mod.feature_importances_[i]
pd.Series(feature_importance).sort_values(ascending=False)

trx_amnt            0.493854
user_name_n         0.337656
user_name_j         0.096402
group_Irrelevant    0.026416
group_Control-B     0.013651
group_TreeInt       0.008829
group_Control-A     0.006838
group_Random        0.006817
group_TreeSHAP      0.006762
group_LIME          0.002776
user_name_t         0.000000
dtype: float64

In [181]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

38.373128874782566

In [182]:
mean_absolute_error(valid_y.values, pred_y)

24.924844987414957