In [1]:
import os
import sys
sys.path.append('../../')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
from matplotlib.lines import Line2D
from statsmodels.stats.multitest import multipletests
import itertools
from sklearn.utils import resample

In [3]:
from facct22.utils import get_db_con

from facct22.analysis_functions import (
    get_all_decisions, groups, schemas, users, dps, dt, pdr, group_order, colors, 
    ttests_operational_metrics, assign_conf_mat_cell, _modify_value_and_time
)

In [4]:
cred_file = '../../conf/credentials.yaml'
engine = get_db_con(cred_file)

#### Fetch Decisions

In [5]:
all_decisions = get_all_decisions(engine, schemas, users, groups)

#### Decision Time

Let's consider the decision time as the output variable and regres it on the experiment particulars

In [6]:
regression_features = ['user_name', 'group', 'trx_amnt', 'decision_time']

all_decisions[regression_features].head()

Unnamed: 0,user_name,group,trx_amnt,decision_time
0,j,ML Model,206.86,24
1,n,Irrelevant,102.69,126
2,n,Data,88.78,61
3,j,ML Model,137.94,8
4,t,Irrelevant,190.0,39


In [7]:
dummified = pd.get_dummies(all_decisions[regression_features])

In [8]:
y = dummified['decision_time']
X = dummified.drop(columns='decision_time')

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [10]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.1)

In [11]:
scaler = MinMaxScaler().fit(train_X)
train_X_scaled = scaler.transform(train_X)
valid_X_scaled = scaler.transform(valid_X)

  return self.partial_fit(X, y)


#### Unregularized Linear Regression

In [18]:
mod = LinearRegression().fit(train_X_scaled, train_y)
pred_y = mod.predict(valid_X_scaled)
# mean_absolute_error(valid_y.values, pred_y) 

In [19]:
feature_importance = dict()
for i in range(mod.coef_.shape[0]):
    feature_importance[X.columns[i]] = mod.coef_[i]
feature_importance

{'trx_amnt': 32.11251005059774,
 'user_name_j': -24.970090355944816,
 'user_name_n': 26.17505253111577,
 'user_name_t': -1.2049621751709425,
 'group_Data': -1.49785150490683,
 'group_Irrelevant': 6.633029394875724,
 'group_LIME': -4.101376747057463,
 'group_ML Model': -8.628236305629672,
 'group_Random': 2.9419339962785926,
 'group_TreeInt': 2.836338409223318,
 'group_TreeSHAP': 1.816162757216361}

In [20]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

10587.969381622997

In [21]:
mean_absolute_error(valid_y.values, pred_y)

7101.663528799251

#### L2

In [26]:
mod = Ridge(alpha=0.1).fit(train_X_scaled, train_y)

In [27]:
feature_importance = dict()
for i in range(mod.coef_.shape[0]):
    feature_importance[X.columns[i]] = mod.coef_[i]
pd.Series(feature_importance).sort_values(ascending=False)

trx_amnt            31.612474
user_name_n         26.172554
group_Irrelevant     6.631955
group_Random         2.942368
group_TreeInt        2.834667
group_TreeSHAP       1.814974
user_name_t         -1.204897
group_Data          -1.495957
group_LIME          -4.101157
group_ML Model      -8.626851
user_name_j        -24.967657
dtype: float64

In [28]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

10423.055917692449

In [29]:
mean_absolute_error(valid_y.values, pred_y)

6991.09319334604

#### L1

In [36]:
mod = Lasso(alpha=0.1).fit(train_X_scaled, train_y)

In [37]:
feature_importance = dict()
for i in range(mod.coef_.shape[0]):
    feature_importance[X.columns[i]] = mod.coef_[i]
pd.Series(feature_importance).sort_values(ascending=False)

user_name_n         27.084296
group_Irrelevant     4.201086
group_Random         0.547209
group_TreeInt        0.291565
group_TreeSHAP       0.000000
user_name_t         -0.000000
trx_amnt             0.000000
group_Data          -2.465481
group_LIME          -5.209787
group_ML Model      -9.730710
user_name_j        -23.448748
dtype: float64

In [38]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

40.09995064286572

In [39]:
mean_absolute_error(valid_y.values, pred_y)

26.123385267515545

#### Decision Tree

In [40]:
mod = DecisionTreeRegressor(criterion='mse', max_depth=None, min_samples_split=20)
mod.fit(train_X, train_y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=20, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [41]:
feature_importance = dict()
for i in range(mod.feature_importances_.shape[0]):
    feature_importance[X.columns[i]] = mod.feature_importances_[i]
pd.Series(feature_importance).sort_values(ascending=False)

trx_amnt            0.465320
user_name_n         0.350276
user_name_t         0.095681
group_Irrelevant    0.024283
group_ML Model      0.015035
group_TreeInt       0.012931
group_Random        0.012191
group_TreeSHAP      0.010157
group_Data          0.008122
group_LIME          0.006004
user_name_j         0.000000
dtype: float64

In [42]:
pred_y = mod.predict(valid_X)
mse = mean_squared_error(valid_y.values, pred_y)
np.sqrt(mse)

44.550365972383226

In [43]:
mean_absolute_error(valid_y.values, pred_y)

29.0569530117704