<a href="https://colab.research.google.com/github/gregory-ch/shap_flex_porting/blob/main/shap_joint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This module uses user-defined trained models and prediction functions to compute approximate Shapley values for
single models.
"""

import numpy as np
import pandas as pd
import random
import csv
import itertools

In [None]:
type(pd.Series())

  """Entry point for launching an IPython kernel.


pandas.core.series.Series

In [None]:
class shapFlex_plus:
    def __init__(self, explain,  model, predict_function, reference = None, target_features = None, \
                     causal = None, causal_weights = None, sample_size = None, use_future = None):
        self.explain = explain
        self.reference = reference if reference else explain
        self.model = model
        predict_function = predict_function
        self.target_features = target_features if isinstance(target_features, pd.core.series.Series) else explain.columns.tolist()
        self.causal = causal #if causal else None
        self.causal_weights = causal_weights #if causal_weights else None
        self.sample_size = sample_size if sample_size else 60
        self.use_future = use_future if isinstance(target_features, pd.core.series.Series) else False
        
        self.n_features = self.explain.shape[1]
        self.n_instances = self.reference.shape[0]

        self.causal_graph = igraph.Graph.DataFrame(self.causal, directed=True) if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v for v in self.causal_graph.vs] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.each_node_causes = {v: v.successors() for v in self.nodes if v.successors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# надо уточнить, мб здесь не только "прямые" successors и predecessors ищутся 
        self.each_node_is_an_effect_from = {v: v.predecessors() for v in nodes if v.predecessors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# но и вообще все
        # имена, кажется, уже прописаны автоматически
        self.causal_nodes = [v['name'] for v in self.each_node_causes.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.effect_nodes = [v['name'] for v in self.each_node_is_an_effect_from.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]



    @staticmethod
    def ulist_df(data):
      unlisted_df = pd.Series(
                  data,
                  index=[
                  index_col + index_row for index_col, index_row in itertools.product(
                      [str(x) for x in range(data.shape[0])], 
                      [str(x) for x in data.columns])]
              )
      return unlisted_df
      
    def loop_over_monte_carlo_samples(self):
      i_size = self.sample_size
      j_size = len(self.target_features)
      data_sample = []
      for i in range(i_size):
        reference_index = np.random.choice(np.arange(0, self.n_features ), size=1, replace=False)
        feature_indices_random = np.random.choice(np.arange(0, self.n_features), size=self.n_features, replace=False)
        # r индексация стартует с 1 а питон с 0 поэтому нам нужно вычиать 1 или ставить по верхней границе индексы в зависимости от функции вызова
        #reference это pd dataframe
        feature_names_random = self.explain.columns[feature_indices_random].values
        reference_instance = reference.iloc[reference_index, feature_indices_random]
        #feature_indices_random это вектор индексов
        explain_instances = explain.iloc[:, feature_indices_random]
        data_sample_feature = []
        for j in range(j_size):
          target_feature_index =  self.explain.columns.get_loc(self.target_features[j])
          target_feature_index_shuffled = list(self.explain.columns.values[feature_indices_random]).index(self.target_features[j])
          #if True:
          #  print(target_feature_index)
          # target_feature_index = (self.explain.columns == self.target_features[j])
          # target_feature_index_shuffled = (self.explain.columns[feature_indices_random] == self.target_features[j])
          
          if self.target_features[j] in self.nodes:
            #unlist как я понял, вытягивает все данные в один длинный вектор, присваивает индексы как название колонки + название строки
            #предположу, что each_node_causes это pd.DataFrame()
            target_feature_causes_these_features = self.unlist_df(
                #loc потому, что кажется target_features это не индекс
                each_node_causes.loc[:, self.target_features[j]]
                )
            target_feature_is_caused_by = self.unlist_df(
                each_node_is_an_effect_from.loc[:, self.target_features[j]]
                )
            
            target_index = target_feature_index_shuffled
            #отмечаем те значения feature_names_random которые равны последнему значению 
            #target_feature_is_caused_by. target_feature_is_caused_by вроде как вектор
            #вернуться должно число. Если вдруг окажется, что датафрейм, -1 элемент будет строка, 
            #надо заменить на индексацию на iloc, == на .isin
            causes_indices = (feature_names_random == target_feature_is_caused_by[-1])
            effects_indices = (feature_names_random == target_feature_causes_these_features[-1])
            sample_indices = feature_indices_random[~feature_indices_random.isin(
                np.concatenate([target_index, causes_indices, effects_indices]))]
            #c() вроде как склеивает вектор(ы) и переменные
            sample_real_indices = sample_indices[sample_indices < target_index]  # Not in causal diagram, feature data from 'explain'.
            sample_fake_indices = sample_indices[sample_indices > target_index]  # Not in causal diagram, feature data from 'reference'.

            feature_indices_real_causes_real_effects = np.concatenate([sample_real_indices, causes_indices, effects_indices, target_index, sample_fake_indices], ignore_index=True)
            feature_indices_real_causes_fake_effects = np.concatenate([sample_real_indices, causes_indices, target_index, effects_indices, sample_fake_indices], ignore_index=True)
            feature_indices_fake_causes_real_effects = np.concatenate([sample_real_indices, effects_indices, target_index, causes_indices, sample_fake_indices], ignore_index=True)
            feature_indices_fake_causes_fake_effects = np.concatenate([sample_real_indices, target_index, causes_indices, effects_indices, sample_fake_indices], ignore_index=True)
          
          if not self.target_features[j] in self.nodes:
            explain_instance_real_target = explain_instances

            # Only create a Frankenstein instance if the target is not the last feature and there is actually
            # one or more features to the right of the target to replace with the reference.
            if (target_feature_index_shuffled < self.n_features):
              explain_instance_real_target.iloc[:, target_feature_index_shuffled + 1: self.n_features + 1] =\
              reference_instance.iloc[:, target_feature_index_shuffled + 1: self.n_features + 1]
            
            # These instances are otherwise the same as the Frankenstein instance created above with the
            # exception that the target feature is now replaced with the target feature in the random reference
            # instance. The difference in model predictions between these two Frankenstein instances is
            # what gives us the stochastic Shapley value approximation.
            explain_instance_fake_target = explain_instance_real_target
            # если не ставить target_feature_index_shuffled в квадратные скобки, не выполняется бродкастинг
            explain_instance_fake_target.iloc[:, [target_feature_index_shuffled]] = reference_instance.iloc[:, [target_feature_index_shuffled]]
          
          else:

            if self.target_features[j] in self.causal_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              target_index_temp = (explain_instance_real_causes_fake_effects_real_target.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_real_causes_fake_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]

              explain_instance_real_causes_fake_effects_fake_target = explain_instance_real_causes_fake_effects_real_target
              explain_instance_real_causes_fake_effects_fake_target.iloc[:, target_index_temp] =\
              reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target_cause = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = (explain_instance_fake_causes_real_effects_real_target_cause.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target_cause.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_fake_causes_real_effects[:, target_index_temp + 1: self.n_features]
              
              explain_instance_fake_causes_real_effects_fake_target_cause = explain_instance_fake_causes_real_effects_real_target_cause
              explain_instance_fake_causes_real_effects_fake_target_cause.iloc[:, target_index_temp] =\
              reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]

            if self.target_features[j] in self.effect_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target_effect = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              target_index_temp = (explain_instance_real_causes_fake_effects_real_target_effect.columns.values == self.target_features[j])

              if (target_index_temp < self.n_features):
                explain_instance_real_causes_fake_effects_real_target_effect.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]
              
              explain_instance_real_causes_fake_effects_fake_target_effect = explain_instance_real_causes_fake_effects_real_target_effect
              explain_instance_real_causes_fake_effects_fake_target_effect.iloc[:, target_index_temp] =\
              reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = (explain_instance_fake_causes_real_effects_real_target.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features + 1]

              explain_instance_fake_causes_real_effects_fake_target = explain_instance_fake_causes_real_effects_real_target
              explain_instance_fake_causes_real_effects_fake_target.iloc[:, target_index_temp] =\
              reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]

          if not self.target_features[j] in self.nodes:
            explain_instance_real_target = explain_instance_real_target.loc[:, explain.columns]
            explain_instance_fake_target = explain_instance_fake_target.loc[:, explain.columns]
            data_explain_instance = pd.concat([explain_instance_real_target, explain_instance_fake_target], axis=0).reset_index(drop=True)#, ignore_index=True)
            #вот тут не совсем понятно, индекс это число или строка, индексы в data_explain_instance это числа или строки? в любом случае, при запуске можно починить
            data_explain_instance['index'] = np.tile(np.arange(1, explain.shape[0] + 1), 2) 
            data_explain_instance['feature_group'] = np.tile(pd.Series(['real_target', 'fake_target']), explain.shape[0])
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 0
            data_explain_instance['causal_type'] = None

          else:
            if self.target_features[j] in self.causal_nodes:
              explain_instance_real_causes_fake_effects_real_target =\
              explain_instance_real_causes_fake_effects_real_target.loc[:, explain.columns]
              explain_instance_real_causes_fake_effects_fake_target =\
              explain_instance_real_causes_fake_effects_fake_target.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_real_target_cause =\
              explain_instance_fake_causes_real_effects_real_target_cause.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_fake_target_cause =\
              explain_instance_fake_causes_real_effects_fake_target_cause.loc[:, explain.columns]

            if self.target_features[j] in self.effect_nodes:
              explain_instance_real_causes_fake_effects_real_target_effect =\
              explain_instance_real_causes_fake_effects_real_target_effect.loc[:, explain.columns]
              explain_instance_real_causes_fake_effects_fake_target_effect =\
              explain_instance_real_causes_fake_effects_fake_target_effect.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_real_target =\
              explain_instance_fake_causes_real_effects_real_target.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_fake_target =\
              explain_instance_fake_causes_real_effects_fake_target.loc[:, explain.columns]

            if self.target_features[j] in self.causal_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause], axis=0
              ).reset_index(drop=True)
              data_explain_instance[index] = np.tile(np.arange(1, explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance[feature_group] = np.tile(pd.Series(["real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",
                                                          "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause"]),
                                                        explain.shape[0])
              data_explain_instance[causal_type] = "target_is_a_cause"

            if self.target_features[j] in self.effect_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance[index] = np.tile(np.arange(1, explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance[feature_group] = np.tile(pd.Series(["real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",
                                                          "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"]),
                                                        explain.shape[0])
              data_explain_instance[causal_type] = "target_is_an_effect"

            if (self.target_features[j] in self.causal_nodes) and (self.target_features[j] in self.effect_nodes):
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause,
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, explain.shape[0] + 1), 8)  # Eight Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series([
                "real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",  # Target is a causal node.
                "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause",  # Target is a causal node.
                "real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",  # Target is an effect node.
                "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"  # Target is an effect node.
                ]),
              explain.shape[0])
              data_explain_instance['causal_type'] = np.tile(pd.Series([
                "target_is_a_cause", "target_is_a_cause", "target_is_a_cause", "target_is_a_cause",
                "target_is_an_effect", "target_is_an_effect", "target_is_an_effect", "target_is_an_effect"]
              ),
              explain.shape[0])
            
            data_explain_instance['feature_name'] = target_features[j]
            data_explain_instance[causal] = 1

            data_explain_instance['sample'] = i
            data_explain_instance

        
          data_sample_feature.append(data_explain_instance)
          data_sample_feature
        data_sample.append(data_sample_feature)
        #STOP
      data_sample = pd.concat([pd.concat(data_sample_i, axis=0) for data_sample_i in data_sample], axis=0)
      return data_sample

                        



1) Текущий раздел работы — строки 125–397: в них инициализируется функция сэмплирования, которая проходит по двум вложенным циклам. [закончено]
2) Начало цикла [закончено]

3) "Франкенштейна" cтр 172 -270 [закончено]
 
4) Цикл i loop, j loop  стр 397 [закончено]

5) написан код для инициализации объектов на вход в класс: модель, обработку данных, предикт функцию. Датасет в csv на гугл-диск кинул: https://drive.google.com/file/d/1ADJ2yNZum-quPW3bRWJ4iyEa2OoqlS18/view?usp=sharing, пока для простоты складывается в файлы колаба через drag-and-drop.

6) Инициализация графа [закончено]

7) В основном разделе R/shapFlex дошли до вызова функции predict_shapFlex на 401 стр., проверили инициализацию класса ShapFlex_plus начали отладку запуска loop_over_monte_carlo_samples()



In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

#data = pd.read_csv('/content/data_adult.csv', index_col=0)
data = pd.read_csv('https://kolodezev.ru/download/data_adult.csv', index_col=0)

In [None]:
# Fit a random forest on the one-hot encoded features of `data`.
outcome_name = 'income'
encoder = OneHotEncoder()
# Position of the outcome column among the columns of `data`.
outcome_col = pd.Series(data.columns)[data.columns==outcome_name].index[0]
model = RandomForestClassifier(n_estimators=300, random_state=42)
# Features: everything except the outcome, dummy-encoded with the first level dropped.
X = pd.get_dummies(data.drop(outcome_name, axis=1), drop_first=True)
# Label: 1 for '<=50K', 0 for every other value.
y = np.array([1 if label == '<=50K' else 0 for label in data[outcome_name].values]).ravel()
model.fit(X, y)

RandomForestClassifier(n_estimators=300, random_state=42)

In [None]:
def predict_function(model, data):
  """Encode `data` and return the model's probabilities for the first class.

  `data` must contain the column named by the module-level `outcome_name`;
  that column is split off as the label and the remaining columns are
  dummy-encoded with the first level dropped.

  Returns:
    A tuple `(probabilities, X, y)`: column 0 of `model.predict_proba(X)`
    as a Series, the encoded feature frame, and the 0/1 label array
    (1 for '<=50K').

  Author's note: `pd.DataFrame(model.predict_proba(X)).loc[:, 0][9]` gave
  0.98 when run, which was used to identify which output column is returned.
  """
  features = pd.get_dummies(data.drop(outcome_name, axis=1), drop_first=True)
  labels = np.array([1 if label == '<=50K' else 0 for label in data[outcome_name].values]).ravel()
  probabilities = pd.DataFrame(model.predict_proba(features)).loc[:, 0]
  return probabilities, features, labels


In [None]:
# Instances to explain: the first 350 rows; reference set: all rows.
# Both leave out the last column of `data` (presumably the outcome - confirm).
explain, reference = data.iloc[:350, :data.shape[1]-1], data.iloc[:, :data.shape[1]-1]
sample_size = 60
# Features for which results are wanted.
target_features = pd.Series(["marital_status", "education", "relationship",  "native_country",
                     "age", "sex", "race", "hours_per_week"])
# Edge list of the causal graph: each of age/sex/race/native_country is listed
# as a cause of marital_status, education and relationship.
causal = pd.DataFrame(
  dict(cause=pd.Series(["age", "sex", "race", "native_country",
              "age", "sex", "race", "native_country", "age",
              "sex", "race", "native_country"]),
  effect = pd.Series(np.concatenate([np.tile("marital_status", 4), np.tile("education", 4), np.tile("relationship", 4)])))
)

In [None]:
!pip install igraph
import igraph
# Build a directed graph from the cause/effect edge list.
causal_graph = igraph.Graph.DataFrame(causal, directed=True)
nodes = [v for v in causal_graph.vs]
each_node_causes = {v: v.successors() for v in nodes if v.successors()}# TODO: verify - maybe not only the "direct" successors and predecessors are returned here
each_node_is_an_effect_from = {v: v.predecessors() for v in nodes if v.predecessors()} # but all of them
# the names seem to be filled in automatically already
causal_nodes = [v['name'] for v in each_node_causes.keys()]
effect_nodes = [v['name'] for v in each_node_is_an_effect_from.keys()]

Collecting igraph
  Downloading igraph-0.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
[?25hCollecting texttable>=1.6.2
  Downloading texttable-1.6.4-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.9.9 texttable-1.6.4


# Новый раздел

In [None]:
explain.head()

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [None]:
exmpl_of_test = shapFlex_plus(explain,  model, predict_function, target_features=pd.Series(["marital_status", "education", "relationship", "native_country",
"age", "sex", "race", "hours_per_week"]))


In [None]:

result = exmpl_of_test.loop_over_monte_carlo_samples()

In [None]:
result

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,index,feature_group,feature_name,causal,causal_type
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,real_target,marital_status,0,
1,50.0,Self-emp-not-inc,Bachelors,13.0,,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,2,fake_target,marital_status,0,
2,38.0,Private,HS-grad,9.0,,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,3,real_target,marital_status,0,
3,53.0,Private,11th,7.0,,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,4,fake_target,marital_status,0,
4,28.0,Private,Bachelors,13.0,,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,5,real_target,marital_status,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,,,,,,,,,,,,,,346,fake_target,hours_per_week,0,
696,,,,,,,,,,,,,,347,real_target,hours_per_week,0,
697,,,,,,,,,,,,,,348,fake_target,hours_per_week,0,
698,,,,,,,,,,,,,,349,real_target,hours_per_week,0,


In [None]:
%debug

> [0;32m<ipython-input-51-d9ac69c401e4>[0m(231)[0;36mloop_over_monte_carlo_samples[0;34m()[0m
[0;32m    229 [0;31m          [0mdata_sample_feature[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    230 [0;31m        [0mdata_sample[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mdata_sample_feature[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 231 [0;31m        [0mSTOP[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    232 [0;31m      [0mdata_sample[0m [0;34m=[0m [0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0;34m[[0m[0mpd[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0mdata_sample_i[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m [0;32mfor[0m [0mdata_sample_i[0m [0;32min[0m [0mdata_sample[0m[0;34m][0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    233 [0;31m      [0;32mreturn[0m [0mdata_sample[0m[0;34m[0m[0;34m[0m[0m
[0m
[      age  workclass  education  ...    feature_name caus

In [None]:
result.iloc[344:]

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,index,feature_group,feature_name,causal,causal_type
344,22.0,State-gov,Some-college,10.0,,Protective-serv,Own-child,Black,Female,0.0,0.0,40.0,United-States,345,real_target,marital_status,0,
345,43.0,Self-emp-not-inc,Bachelors,13.0,,Sales,Not-in-family,White,Male,0.0,0.0,42.0,United-States,346,fake_target,marital_status,0,
346,67.0,?,11th,7.0,,?,Husband,White,Male,0.0,0.0,8.0,United-States,347,real_target,marital_status,0,
347,30.0,?,Assoc-voc,11.0,,?,Unmarried,White,Female,0.0,0.0,40.0,United-States,348,fake_target,marital_status,0,
348,56.0,Private,Assoc-acdm,12.0,,Other-service,Not-in-family,White,Male,0.0,0.0,25.0,Iran,349,real_target,marital_status,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,,,,,,,,,,,,,,346,fake_target,hours_per_week,0,
346,,,,,,,,,,,,,,347,real_target,hours_per_week,0,
347,,,,,,,,,,,,,,348,fake_target,hours_per_week,0,
348,,,,,,,,,,,,,,349,real_target,hours_per_week,0,


In [None]:
exmpl_of_test.n_features

13

In [None]:
result[0][0]

KeyError: ignored

In [None]:
# Diagnostic: for every frame result[i][j], count the rows whose `age` is not
# NaN and the total number of rows, then average both counts.
# NOTE(review): this indexes `result` as a nested sequence of frames; the
# `result[0][0]` cell above fails with KeyError, so `result` may no longer
# have that shape - confirm before re-running.
avg_rows_age_not_na = []
avg_shape = []
for i in range(len(result)):
  for j in range(len(result[i])):
    avg_rows_age_not_na += [result[i][j].loc[~result[i][j].age.isna()].shape[0]]
    avg_shape += [result[i][j].shape[0]]

np.mean(avg_rows_age_not_na), np.mean(avg_shape)

(2.0, 700.0)

In [None]:
result = result.reset_index()

In [None]:
result.loc[~result.age.isna()]

Unnamed: 0,level_0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,index,feature_group,feature_name,causal,causal_type
0,0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,real_target,age,0,
350,0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,real_target,age,0,
700,0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,real_target,workclass,0,
1050,0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,real_target,workclass,0,
1400,0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,real_target,education,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544255,5,37.0,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,6,fake_target,capital_loss,0,
544605,5,37.0,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,6,fake_target,hours_per_week,0,
544955,5,37.0,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,6,fake_target,hours_per_week,0,
545305,5,37.0,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,6,fake_target,native_country,0,


In [None]:
pd.Series(result.loc[~result.age.isna()].index[::-1].values).rolling(2).apply(lambda x: x.iloc[0]-x.iloc[1]).value_counts()

350.0    1506
351.0       8
348.0       5
352.0       5
355.0       4
346.0       4
343.0       3
349.0       3
353.0       3
358.0       3
344.0       3
345.0       3
356.0       2
359.0       1
339.0       1
361.0       1
338.0       1
347.0       1
357.0       1
354.0       1
dtype: int64

In [None]:
result.iloc[:, :3].shape, result.iloc[:, 3:].shape, result.shape

((468000, 3), (468000, 15), (468000, 18))

In [None]:
def predict_shapFlex(reference, data_predict, model, predict_function, n_features, causal, causal_weights):
   """Predict on the Frankenstein instances and on the reference data.

   The first ``n_features`` columns of ``data_predict`` are passed to
   ``predict_function``; the remaining columns are treated as metadata and
   bound to the predictions row by row (by position, not by index label).

   ``predict_function(model, frame)`` may return a Series/array of
   predictions, a DataFrame whose first column holds them, or a tuple whose
   first element is one of those.

   TODO: the port of this routine is unfinished - the values computed here
   are not used yet and the function returns ``None``. ``causal`` and
   ``causal_weights`` are accepted but not used.
   """
   def first_prediction_column(result):
      # Normalise the supported return shapes to one positional Series.
      if isinstance(result, tuple):
         result = result[0]
      if isinstance(result, pd.DataFrame):
         result = result.iloc[:, 0]
      return pd.Series(result).reset_index(drop=True)

   data_model = data_predict.iloc[:, :n_features]
   data_meta = data_predict.iloc[:, n_features:]
   data_predicted = first_prediction_column(predict_function(model, data_model))
   # Positional bind: data_predict may carry duplicated index labels.
   data_predicted = pd.concat([data_meta.reset_index(drop=True), data_predicted.to_frame()], axis=1)
   # Mean prediction over the reference data.
   intercept = first_prediction_column(predict_function(model, reference)).mean()
   # Name of the prediction column, i.e. the last column after the bind.
   user_fun_y_pred_name = data_predicted.columns[-1]

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,index,feature_group,feature_name,causal,causal_type
0,,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1,real_target,age,0,
1,,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,2,fake_target,age,0,
2,,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,3,real_target,age,0,
3,,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,4,fake_target,age,0,
4,,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,5,real_target,age,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,,,,,,,,,,,,,,296,fake_target,native_country,0,
296,,,,,,,,,,,,,,297,real_target,native_country,0,
297,,,,,,,,,,,,,,298,fake_target,native_country,0,
298,,,,,,,,,,,,,,299,real_target,native_country,0,


In [None]:
288*2

576

In [None]:
import pandas as pd
import numpy as np
x = pd.DataFrame(
  dict(cause=pd.Series(["age", "sex", "race", "native_country",
              "age", "sex", "race", "native_country", "age",
              "sex", "race", "native_country"]),
  effect = pd.Series(np.concatenate([np.tile("marital_status", 4), np.tile("education", 4), np.tile("relationship", 4)])))
)


12

In [None]:
# 24/03/2022
import numpy as np
import pandas as pd
import igraph
import itertools
from catboost import CatBoostClassifier

class shapFlex_plus:
    def __init__(self, explain,  model, predict_function, reference = None, target_features = None, \
                     causal = None, causal_weights = None, sample_size = None, use_future = None):
        self.explain = explain
        self.reference = reference if reference else explain
        self.model = model
        self.predict_function = predict_function
        self.target_features = target_features if isinstance(target_features, pd.core.series.Series) else explain.columns.tolist()
        self.causal = causal #if causal else None
        self.causal_weights = causal_weights #if causal_weights else None
        self.sample_size = sample_size if sample_size else 60
        self.use_future = use_future if isinstance(target_features, pd.core.series.Series) else False
        
        self.n_features = self.explain.shape[1]
        self.n_instances = self.reference.shape[0]

        self.causal_graph = igraph.Graph.DataFrame(self.causal, directed=True) if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v for v in self.causal_graph.vs] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.each_node_causes = {v['name']: [succ['name'] for succ in v.successors()] for v in self.nodes if v.successors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# надо уточнить, мб здесь не только "прямые" successors и predecessors ищутся 
        self.each_node_is_an_effect_from = {v['name']: [pred['name'] for pred in v.predecessors()] for v in self.nodes if v.predecessors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# но и вообще все
        # имена, кажется, уже прописаны автоматически
        self.causal_nodes = [v for v in self.each_node_causes.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.effect_nodes = [v for v in self.each_node_is_an_effect_from.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v['name'] for v in self.nodes] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]

    @staticmethod
    def unlist_df(data):
      unlisted_df = pd.Series(
                  data,
                  index=[
                  index_col + index_row for index_col, index_row in itertools.product(
                      [str(x) for x in range(data.shape[0])], 
                      [str(x) for x in data.columns])]
              )
      return unlisted_df
      
    def loop_over_monte_carlo_samples(self):
      i_size = self.sample_size
      j_size = len(self.target_features)
      data_sample = []

      for i in range(i_size):
        reference_index = np.random.choice(np.arange(0, self.n_features ), size=1, replace=False)
        feature_indices_random = np.random.choice(np.arange(0, self.n_features), size=self.n_features, replace=False)
        # r индексация стартует с 1 а питон с 0 поэтому нам нужно вычиать 1 или ставить по верхней границе индексы в зависимости от функции вызова
        feature_names_random = self.explain.columns[feature_indices_random].values
        reference_instance = self.reference.iloc[reference_index, feature_indices_random]
        #feature_indices_random это вектор индексов
        explain_instances = self.explain.iloc[:, feature_indices_random]
        data_sample_feature = []
        for j in range(j_size):
          target_feature_index =  self.explain.columns.get_loc(self.target_features[j])
          target_feature_index_shuffled = list(self.explain.columns.values[feature_indices_random]).index(self.target_features[j])
          #if True:
          #  print(target_feature_index)
          # target_feature_index = (self.explain.columns == self.target_features[j])
          # target_feature_index_shuffled = (self.explain.columns[feature_indices_random] == self.target_features[j])
          
          if self.target_features[j] in self.nodes:
            #unlist как я понял, вытягивает все данные в один длинный вектор, присваивает индексы как название колонки + название строки
            #предположу, что each_node_causes это pd.DataFrame()
            target_feature_causes_these_features =  [self.target_features[j]] + self.each_node_causes.get(self.target_features[j], []) 
                
            target_feature_is_caused_by =  [self.target_features[j]] + self.each_node_is_an_effect_from.get(self.target_features[j], []) 
            target_index = target_feature_index_shuffled
            #отмечаем те значения feature_names_random которые равны последнему значению 
            #target_feature_is_caused_by. target_feature_is_caused_by вроде как вектор
            #вернуться должно число. Если вдруг окажется, что датафрейм, -1 элемент будет строка, 
            #надо заменить на индексацию на iloc, == на .isin
            causes_indices = np.where(feature_names_random == target_feature_is_caused_by[-1])[0].item()
            effects_indices = np.where(feature_names_random == target_feature_causes_these_features[-1])[0].item()
            sample_indices = feature_indices_random[~np.isin(feature_indices_random, 
                np.concatenate([[target_index], [causes_indices], [effects_indices]]))]
            #c() вроде как склеивает вектор(ы) и переменные
            sample_real_indices = sample_indices[sample_indices < target_index]  # Not in causal diagram, feature data from 'explain'.
            sample_fake_indices = sample_indices[sample_indices > target_index]  # Not in causal diagram, feature data from 'reference'.

            feature_indices_real_causes_real_effects = np.concatenate([sample_real_indices, [causes_indices], [effects_indices], [target_index], sample_fake_indices])
            feature_indices_real_causes_fake_effects = np.concatenate([sample_real_indices, [causes_indices], [target_index], [effects_indices], sample_fake_indices])
            feature_indices_fake_causes_real_effects = np.concatenate([sample_real_indices, [effects_indices], [target_index], [causes_indices], sample_fake_indices])
            feature_indices_fake_causes_fake_effects = np.concatenate([sample_real_indices, [target_index], [causes_indices], [effects_indices], sample_fake_indices])
          
          if not self.target_features[j] in self.nodes:
            explain_instance_real_target = explain_instances.copy()

            # Only create a Frankenstein instance if the target is not the last feature and there is actually
            # one or more features to the right of the target to replace with the reference.
            if (target_feature_index_shuffled < self.n_features):
              #x = reference_instance.iloc[:, target_feature_index_shuffled: ]
              explain_instance_real_target.iloc[:, target_feature_index_shuffled+1: ] =\
                 pd.concat([reference_instance.iloc[:, target_feature_index_shuffled+1: ]] * self.explain.shape[0], axis=0).reset_index(drop=True)
              
            # These instances are otherwise the same as the Frankenstein instance created above with the
            # exception that the target feature is now replaced with the target feature in the random reference
            # instance. The difference in model predictions between these two Frankenstein instances is
            # what gives us the stochastic Shapley value approximation.
            explain_instance_fake_target = explain_instance_real_target.copy()
            
            # ОНИ ПОЧЕМУ ТО ВЫШЛИ ОДИНАКОВЫЕ, ЭТО ОК?
            explain_instance_fake_target.iloc[:, [target_feature_index_shuffled]] =\
               pd.concat([reference_instance.iloc[:, [target_feature_index_shuffled]]]  * self.explain.shape[0], axis=0).reset_index(drop=True)
          
          else:

            if self.target_features[j] in self.causal_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              target_index_temp = (explain_instance_real_causes_fake_effects_real_target.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_real_causes_fake_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]

              explain_instance_real_causes_fake_effects_fake_target = explain_instance_real_causes_fake_effects_real_target
              explain_instance_real_causes_fake_effects_fake_target.iloc[:, target_index_temp] =\
              reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target_cause = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = (explain_instance_fake_causes_real_effects_real_target_cause.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target_cause.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_fake_causes_real_effects[:, target_index_temp + 1: self.n_features]
              
              explain_instance_fake_causes_real_effects_fake_target_cause = explain_instance_fake_causes_real_effects_real_target_cause
              explain_instance_fake_causes_real_effects_fake_target_cause.iloc[:, target_index_temp] =\
              reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]

            if self.target_features[j] in self.effect_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target_effect = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]

              target_index_temp = explain_instance_real_causes_fake_effects_real_target_effect.columns.get_loc(self.target_features[j])

              if (target_index_temp < self.n_features):
                explain_instance_real_causes_fake_effects_real_target_effect.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]
              
              explain_instance_real_causes_fake_effects_fake_target_effect = explain_instance_real_causes_fake_effects_real_target_effect
              explain_instance_real_causes_fake_effects_fake_target_effect.iloc[:, target_index_temp] =\
              reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = (explain_instance_fake_causes_real_effects_real_target.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features + 1]

              explain_instance_fake_causes_real_effects_fake_target = explain_instance_fake_causes_real_effects_real_target
              explain_instance_fake_causes_real_effects_fake_target.iloc[:, target_index_temp] =\
              reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]

          if not self.target_features[j] in self.nodes:
            #магическим образом две нижеследующие строчки возвращают датафрейм к старому виду
            explain_instance_real_target = explain_instance_real_target.loc[:, self.explain.columns]
            explain_instance_fake_target = explain_instance_fake_target.loc[:, self.explain.columns]
            data_explain_instance = pd.concat([explain_instance_real_target, explain_instance_fake_target], axis=0).reset_index(drop=True)#, ignore_index=True)
            #вот тут не совсем понятно, индекс это число или строка, индексы в data_explain_instance это числа или строки? в любом случае, при запуске можно починить
            data_explain_instance['index'] = np.tile(np.arange(0, self.explain.shape[0]), 2) 
            data_explain_instance['feature_group'] = np.repeat(['real_target', 'fake_target'], repeats=self.explain.shape[0])
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 0
            data_explain_instance['causal_type'] = None

          else:
            if self.target_features[j] in self.causal_nodes:
              explain_instance_real_causes_fake_effects_real_target =\
              explain_instance_real_causes_fake_effects_real_target.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target =\
              explain_instance_real_causes_fake_effects_fake_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target_cause =\
              explain_instance_fake_causes_real_effects_real_target_cause.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target_cause =\
              explain_instance_fake_causes_real_effects_fake_target_cause.loc[:, self.explain.columns]

            if self.target_features[j] in self.effect_nodes:
              explain_instance_real_causes_fake_effects_real_target_effect =\
              explain_instance_real_causes_fake_effects_real_target_effect.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target_effect =\
              explain_instance_real_causes_fake_effects_fake_target_effect.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target =\
              explain_instance_fake_causes_real_effects_real_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target =\
              explain_instance_fake_causes_real_effects_fake_target.loc[:, self.explain.columns]

            if self.target_features[j] in self.causal_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause], axis=0
              ).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",
                                                          "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_a_cause"

            if self.target_features[j] in self.effect_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",
                                                          "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_an_effect"

            if (self.target_features[j] in self.causal_nodes) and (self.target_features[j] in self.effect_nodes):
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause,
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 8)  # Eight Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series([
                "real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",  # Target is a causal node.
                "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause",  # Target is a causal node.
                "real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",  # Target is an effect node.
                "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"  # Target is an effect node.
                ]),
              self.explain.shape[0])
              data_explain_instance['causal_type'] = np.tile(pd.Series([
                "target_is_a_cause", "target_is_a_cause", "target_is_a_cause", "target_is_a_cause",
                "target_is_an_effect", "target_is_an_effect", "target_is_an_effect", "target_is_an_effect"]
              ),
              self.explain.shape[0])
            
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 1

          data_explain_instance['sample'] = i
          data_sample_feature.append(data_explain_instance)

        data_sample.append(data_sample_feature)

      data_sample = pd.concat([pd.concat(data_sample_i, axis=0) for data_sample_i in data_sample], axis=0)
      return data_sample

    def predict_shapFlex(self, data_predict):
      '''есть self.reference, self.model, self.predict_function, self.n_features, self.causal, self.causal_weights'''
      data_model = data_predict.iloc[:, :self.n_features]
      data_meta = data_predict.iloc[:, self.n_features:]
      data_predicted = pd.DataFrame(predict_function(self.model, data_model), index=data_model.index)
      data_predicted = pd.concat([data_meta, data_predicted], axis=1)
      #мб придется править, в зависимости от формата входных данных (вектор-строка/-столбец), пока результат по всем измерениям, скаляр
      intercept = predict_function(self.model, self.reference).mean(skipna=True)
      #вмест data.shape[1] взял -1
      #костыль, не понимаю, что тут должно быть пока
      user_fun_y_pred_name = data_predicted.columns[-1]
      #тут нюанс: у них перед вэлью !! стоит, что значит значение которое за ними следует, это не значение, а expression, что бы это 
      # ни значило, соответсвенно, может беда быть
      #data_predicted = pd.concat([
      #  data_predicted.drop('feature_group', axis=1), 
      #  data_predicted.reset_index().pivot_table(index='index', columns=[ 'feature_group'], values=user_fun_y_pred_name)
      #  ], axis=1)
      data_predicted = data_predicted.reset_index().pivot_table(
        index=set(data_predicted.columns) - set(['index', 'feature_group', user_fun_y_pred_name]),
        columns=['feature_group'],
        values=user_fun_y_pred_name
      )
      data_non_causal = data_predicted.loc[data_predicted['causal']==0]
      data_non_causal['shap_effect'] = data_non_causal['real_target'] - data_non_causal['fake_target']
      data_causal = data_predicted.loc[data_predicted['causal']==1]

      if isinstance(self.causal, pd.core.frame.DataFrame):
        pass

      data_predicted = pd.concat([data_causal, data_non_causal], ignore_index=True, axis=0)
      data_predicted = data_predicted.loc[:, ['index', 'sample', 'feature_name', 'shap_effect']]

      data_predicted = data_predicted.reset_index().dropna(axis=0).groupby(['index', 'feature_name']).agg({'shap_effect': [np.std, np.mean]})
      data_predicted[('shap_effect', 'intercept')] = intercept[0]

      return data_predicted

    def forward(self):
      data_predict = self.loop_over_monte_carlo_samples()
      data_predicted = self.predict_shapFlex(data_predict)
      return data_predicted




import pandas as pd
import numpy as np
# Load the Adult data set; the first CSV column is used as the row index.
data = pd.read_csv('https://kolodezev.ru/download/data_adult.csv', index_col=0)
outcome_name = 'income'
outcome_col = pd.Series(data.columns)[data.columns==outcome_name].index[0]

# Split into the feature frame and the raw target values.
X = data.drop(outcome_name, axis=1)
y = data[outcome_name].values

# Every object-typed column is handed to CatBoost as a categorical feature.
cat_features = [name for name, dtype in zip(X.dtypes.index, X.dtypes) if dtype == 'object']

model = CatBoostClassifier()
model.fit(X, y, cat_features=cat_features, verbose=False)
def predict_function(model, data):
    """Return the model's first ``predict_proba`` column as a one-column DataFrame."""
    # Sanity check noted by the author: pd.DataFrame(model.predict_proba(X)).loc[:, 0][9]
    # gives 0.98, matching the output reported for row 9; whatever the algorithm,
    # such a high confidence identifies the returned column unambiguously.
    probabilities = model.predict_proba(data)
    first_column = probabilities[:, [0]]
    return pd.DataFrame(first_column)


# Explain the first 350 rows; every column except the last one is a feature.
explain = data.iloc[:350, :-1]
# NOTE(review): `reference` and `sample_size` are defined here but not passed to
# shapFlex_plus below, so the class falls back to its own defaults.
reference = data.iloc[:, :-1]
sample_size = 60
target_features = pd.Series([
    "marital_status", "education", "relationship", "native_country",
    "age", "sex", "race", "hours_per_week",
])
# Causal diagram: each demographic cause points at each of the three effects.
causal = pd.DataFrame({
    "cause": pd.Series(["age", "sex", "race", "native_country"] * 3),
    "effect": pd.Series(np.repeat(["marital_status", "education", "relationship"], 4)),
})
exmpl_of_test = shapFlex_plus(
    explain,
    model,
    predict_function,
    target_features=target_features,
    causal=causal,
    causal_weights=[0.5] * len(causal),
)
data_predict = exmpl_of_test.loop_over_monte_carlo_samples()
data_predicted = exmpl_of_test.predict_shapFlex(data_predict)
print(data_predicted)

In [None]:
# 25/03/2022
# Line 134: removed the "+1" after the ":"
import numpy as np
import pandas as pd
import igraph
import itertools
from catboost import CatBoostClassifier

class shapFlex_plus:
    def __init__(self, explain,  model, predict_function, reference = None, target_features = None, \
                     causal = None, causal_weights = None, sample_size = None, use_future = None):
        self.explain = explain
        self.reference = reference if reference else explain
        self.model = model
        self.predict_function = predict_function
        self.target_features = target_features if isinstance(target_features, pd.core.series.Series) else explain.columns.tolist()
        self.causal = causal #if causal else None
        self.causal_weights = causal_weights #if causal_weights else None
        self.sample_size = sample_size if sample_size else 60
        self.use_future = use_future if isinstance(target_features, pd.core.series.Series) else False
        
        self.n_features = self.explain.shape[1]
        self.n_instances = self.reference.shape[0]

        self.causal_graph = igraph.Graph.DataFrame(self.causal, directed=True) if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v for v in self.causal_graph.vs] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.each_node_causes = {v['name']: [succ['name'] for succ in v.successors()] for v in self.nodes if v.successors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# надо уточнить, мб здесь не только "прямые" successors и predecessors ищутся 
        self.each_node_is_an_effect_from = {v['name']: [pred['name'] for pred in v.predecessors()] for v in self.nodes if v.predecessors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# но и вообще все
        # имена, кажется, уже прописаны автоматически
        self.causal_nodes = [v for v in self.each_node_causes.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.effect_nodes = [v for v in self.each_node_is_an_effect_from.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v['name'] for v in self.nodes] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]

    @staticmethod
    def unlist_df(data):
      unlisted_df = pd.Series(
                  data,
                  index=[
                  index_col + index_row for index_col, index_row in itertools.product(
                      [str(x) for x in range(data.shape[0])], 
                      [str(x) for x in data.columns])]
              )
      return unlisted_df
      
    def loop_over_monte_carlo_samples(self):
      i_size = self.sample_size
      j_size = len(self.target_features)
      data_sample = []

      for i in range(i_size):
        reference_index = np.random.choice(np.arange(0, self.n_features ), size=1, replace=False)
        feature_indices_random = np.random.choice(np.arange(0, self.n_features), size=self.n_features, replace=False)
        # r индексация стартует с 1 а питон с 0 поэтому нам нужно вычиать 1 или ставить по верхней границе индексы в зависимости от функции вызова
        feature_names_random = self.explain.columns[feature_indices_random].values
        reference_instance = self.reference.iloc[reference_index, feature_indices_random]
        #feature_indices_random это вектор индексов
        explain_instances = self.explain.iloc[:, feature_indices_random]
        data_sample_feature = []
        for j in range(j_size):
          target_feature_index =  self.explain.columns.get_loc(self.target_features[j])
          target_feature_index_shuffled = list(self.explain.columns.values[feature_indices_random]).index(self.target_features[j])
          #if True:
          #  print(target_feature_index)
          # target_feature_index = (self.explain.columns == self.target_features[j])
          # target_feature_index_shuffled = (self.explain.columns[feature_indices_random] == self.target_features[j])
          
          if self.target_features[j] in self.nodes:
            #unlist как я понял, вытягивает все данные в один длинный вектор, присваивает индексы как название колонки + название строки
            #предположу, что each_node_causes это pd.DataFrame()
            target_feature_causes_these_features =  [self.target_features[j]] + self.each_node_causes.get(self.target_features[j], []) 
                
            target_feature_is_caused_by =  [self.target_features[j]] + self.each_node_is_an_effect_from.get(self.target_features[j], []) 
            target_index = target_feature_index_shuffled
            #отмечаем те значения feature_names_random которые равны последнему значению 
            #target_feature_is_caused_by. target_feature_is_caused_by вроде как вектор
            #вернуться должно число. Если вдруг окажется, что датафрейм, -1 элемент будет строка, 
            #надо заменить на индексацию на iloc, == на .isin
            causes_indices = np.where(feature_names_random == target_feature_is_caused_by[1:])[0]
            effects_indices = np.where(feature_names_random == target_feature_causes_these_features[1:])[0]
            sample_indices = feature_indices_random[~np.isin(feature_indices_random, 
                np.concatenate([[target_index], causes_indices, effects_indices]))]
            #c() вроде как склеивает вектор(ы) и переменные
            sample_real_indices = sample_indices[sample_indices < target_index]  # Not in causal diagram, feature data from 'explain'.
            sample_fake_indices = sample_indices[sample_indices > target_index]  # Not in causal diagram, feature data from 'reference'.

            feature_indices_real_causes_real_effects = np.concatenate([sample_real_indices, causes_indices, effects_indices, [target_index], sample_fake_indices])
            feature_indices_real_causes_fake_effects = np.concatenate([sample_real_indices, causes_indices, [target_index], effects_indices, sample_fake_indices])
            feature_indices_fake_causes_real_effects = np.concatenate([sample_real_indices, effects_indices, [target_index], causes_indices, sample_fake_indices])
            feature_indices_fake_causes_fake_effects = np.concatenate([sample_real_indices, [target_index], causes_indices, effects_indices, sample_fake_indices])
          
          if not self.target_features[j] in self.nodes:
            explain_instance_real_target = explain_instances.copy()

            # Only create a Frankenstein instance if the target is not the last feature and there is actually
            # one or more features to the right of the target to replace with the reference.
            if (target_feature_index_shuffled < self.n_features):
              #x = reference_instance.iloc[:, target_feature_index_shuffled: ]
              explain_instance_real_target.iloc[:, target_feature_index_shuffled+1: ] =\
                 pd.concat([reference_instance.iloc[:, target_feature_index_shuffled+1: ]] * self.explain.shape[0], axis=0).reset_index(drop=True)
              
            # These instances are otherwise the same as the Frankenstein instance created above with the
            # exception that the target feature is now replaced with the target feature in the random reference
            # instance. The difference in model predictions between these two Frankenstein instances is
            # what gives us the stochastic Shapley value approximation.
            explain_instance_fake_target = explain_instance_real_target.copy()
            
            # ОНИ ПОЧЕМУ ТО ВЫШЛИ ОДИНАКОВЫЕ, ЭТО ОК?
            explain_instance_fake_target.iloc[:, [target_feature_index_shuffled]] =\
               pd.concat([reference_instance.iloc[:, [target_feature_index_shuffled]]]  * self.explain.shape[0], axis=0).reset_index(drop=True)
          
          else:

            if self.target_features[j] in self.causal_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              target_index_temp = explain_instance_real_causes_fake_effects_real_target.columns.get_loc(self.target_features[j])

              

              if target_index_temp < self.n_features:
                explain_instance_real_causes_fake_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]*self.explain.shape[0], axis=0).reset_index(drop=True)

              explain_instance_real_causes_fake_effects_fake_target = explain_instance_real_causes_fake_effects_real_target
              explain_instance_real_causes_fake_effects_fake_target.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]] * self.explain.shape[0], axis=0).reset_index(drop=True)
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target_cause = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = explain_instance_fake_causes_real_effects_real_target_cause.columns.get_loc(self.target_features[j])
              


              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target_cause.iloc[:, target_index_temp + 1:] =\
                pd.concat([reference_instance_fake_causes_real_effects[:, target_index_temp + 1:]] * self.explain.shape[0], axis=0).reset_index(drop=True)
              explain_instance_fake_causes_real_effects_fake_target_cause = explain_instance_fake_causes_real_effects_real_target_cause
              explain_instance_fake_causes_real_effects_fake_target_cause.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]] * self.explain.shape[0], axis=0).reset_index(drop=True)
            if self.target_features[j] in self.effect_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target_effect = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]

              target_index_temp = explain_instance_real_causes_fake_effects_real_target_effect.columns.get_loc(self.target_features[j])

              if (target_index_temp < self.n_features):
                explain_instance_real_causes_fake_effects_real_target_effect.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]*self.explain.shape[0], axis=0).reset_index(drop=True)

              explain_instance_real_causes_fake_effects_fake_target_effect = explain_instance_real_causes_fake_effects_real_target_effect
              explain_instance_real_causes_fake_effects_fake_target_effect.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]] * self.explain.shape[0], axis=0).reset_index(drop=True)
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              pd.concat([reference_instance.iloc[:, feature_indices_fake_causes_real_effects]]*self.explain.shape[0], axis=0).reset_index(drop=True)

              explain_instance_fake_causes_real_effects_real_target = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = explain_instance_fake_causes_real_effects_real_target.columns.get_loc(self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features + 1]] * self.explain.shape[0], axis=0).reset_index(drop=True)

              explain_instance_fake_causes_real_effects_fake_target = explain_instance_fake_causes_real_effects_real_target
              explain_instance_fake_causes_real_effects_fake_target.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]]*self.explain.shape[0], axis=0).reset_index(drop=True)

          if not self.target_features[j] in self.nodes:
            #магическим образом две нижеследующие строчки возвращают датафрейм к старому виду
            explain_instance_real_target = explain_instance_real_target.loc[:, self.explain.columns]
            explain_instance_fake_target = explain_instance_fake_target.loc[:, self.explain.columns]
            data_explain_instance = pd.concat([explain_instance_real_target, explain_instance_fake_target], axis=0).reset_index(drop=True)#, ignore_index=True)
            #вот тут не совсем понятно, индекс это число или строка, индексы в data_explain_instance это числа или строки? в любом случае, при запуске можно починить
            data_explain_instance['index'] = np.tile(np.arange(0, self.explain.shape[0]), 2) 
            data_explain_instance['feature_group'] = np.repeat(['real_target', 'fake_target'], repeats=self.explain.shape[0])
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 0
            data_explain_instance['causal_type'] = None

          else:
            if self.target_features[j] in self.causal_nodes:
              explain_instance_real_causes_fake_effects_real_target =\
              explain_instance_real_causes_fake_effects_real_target.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target =\
              explain_instance_real_causes_fake_effects_fake_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target_cause =\
              explain_instance_fake_causes_real_effects_real_target_cause.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target_cause =\
              explain_instance_fake_causes_real_effects_fake_target_cause.loc[:, self.explain.columns]

            if self.target_features[j] in self.effect_nodes:
              explain_instance_real_causes_fake_effects_real_target_effect =\
              explain_instance_real_causes_fake_effects_real_target_effect.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target_effect =\
              explain_instance_real_causes_fake_effects_fake_target_effect.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target =\
              explain_instance_fake_causes_real_effects_real_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target =\
              explain_instance_fake_causes_real_effects_fake_target.loc[:, self.explain.columns]

            if self.target_features[j] in self.causal_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause], axis=0
              ).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",
                                                          "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_a_cause"

            if self.target_features[j] in self.effect_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",
                                                          "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_an_effect"

            if (self.target_features[j] in self.causal_nodes) and (self.target_features[j] in self.effect_nodes):
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause,
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 8)  # Eight Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series([
                "real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",  # Target is a causal node.
                "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause",  # Target is a causal node.
                "real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",  # Target is an effect node.
                "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"  # Target is an effect node.
                ]),
              self.explain.shape[0])
              data_explain_instance['causal_type'] = np.tile(pd.Series([
                "target_is_a_cause", "target_is_a_cause", "target_is_a_cause", "target_is_a_cause",
                "target_is_an_effect", "target_is_an_effect", "target_is_an_effect", "target_is_an_effect"]
              ),
              self.explain.shape[0])
            
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 1

          data_explain_instance['sample'] = i
          data_sample_feature.append(data_explain_instance)

        data_sample.append(data_sample_feature)

      data_sample = pd.concat([pd.concat(data_sample_i, axis=0) for data_sample_i in data_sample], axis=0)
      return data_sample

    def predict_shapFlex(self, data_predict):
      """Score the Frankenstein instances and summarise Shapley effects.

      Reads self.model, self.reference and self.n_features, plus the prediction
      callable: self.predict_function when the instance has one, otherwise the
      module-level ``predict_function``.

      Parameters
      ----------
      data_predict : pandas.DataFrame
          Output of loop_over_monte_carlo_samples(). The first self.n_features
          columns are model inputs; the remaining columns are metadata
          ('index', 'feature_group', 'feature_name', 'causal', 'causal_type',
          'sample').

      Returns
      -------
      pandas.DataFrame
          One row per ('index', 'feature_name') pair with the columns
          ('shap_effect', 'std'), ('shap_effect', 'mean') and
          ('shap_effect', 'intercept').

      Notes
      -----
      Only non-causal targets get a Shapley effect (real_target minus
      fake_target). The causal weighting is not implemented yet, so rows with
      causal == 1 keep a missing effect and are dropped before aggregation.
      """
      # Prefer the callable stored on the instance; fall back to the
      # module-level function for instances whose __init__ did not store it.
      predict_fn = getattr(self, 'predict_function', None)
      if predict_fn is None:
        predict_fn = predict_function

      # The Monte Carlo frames are concatenated without resetting the index, so
      # labels repeat. Work positionally to keep predictions aligned to rows.
      data_predict = data_predict.reset_index(drop=True)
      data_model = data_predict.iloc[:, :self.n_features]
      data_long = data_predict.iloc[:, self.n_features:].copy()

      # np.asarray drops any index carried by the returned object, so the
      # assignment below is by position rather than by label.
      prediction_column = '_prediction'
      predictions = pd.DataFrame(np.asarray(predict_fn(self.model, data_model)))
      # As in the original code, the last prediction column is the one pivoted.
      data_long[prediction_column] = predictions.iloc[:, -1].to_numpy()

      # Baseline: mean prediction over the reference data (first output column).
      reference_predictions = pd.DataFrame(np.asarray(predict_fn(self.model, self.reference)))
      intercept = reference_predictions.iloc[:, 0].mean(skipna=True)

      # Every metadata column except 'feature_group' identifies a row of the
      # wide table; 'index' must be kept so instances are not averaged together.
      key_columns = [name for name in data_long.columns
                     if name not in ('feature_group', prediction_column)]
      if 'causal_type' in key_columns:
        # causal_type is None for non-causal targets, and the grouping inside
        # pivot_table drops rows with missing keys; use a placeholder instead.
        data_long['causal_type'] = data_long['causal_type'].where(
          data_long['causal_type'].notna(), 'none')

      # One column per feature group; reset_index turns the keys back into
      # ordinary columns so they can be filtered and selected below.
      data_wide = data_long.pivot_table(
        index=key_columns,
        columns='feature_group',
        values=prediction_column
      ).reset_index()
      data_wide.columns.name = None

      data_wide['shap_effect'] = np.nan
      is_non_causal = data_wide['causal'] == 0
      if is_non_causal.any():
        data_wide.loc[is_non_causal, 'shap_effect'] = (
          data_wide.loc[is_non_causal, 'real_target']
          - data_wide.loc[is_non_causal, 'fake_target']
        )

      # TODO: when self.causal is a DataFrame, combine the causal feature groups
      # using self.causal_weights. Until then causal rows have no shap_effect.

      data_shap = data_wide.loc[:, ['index', 'sample', 'feature_name', 'shap_effect']].dropna(axis=0)
      data_shap = data_shap.groupby(['index', 'feature_name']).agg({'shap_effect': ['std', 'mean']})
      data_shap[('shap_effect', 'intercept')] = intercept

      return data_shap

    def forward(self):
      """Run the full pipeline: build the Monte Carlo instances, then score them.

      Returns whatever predict_shapFlex() returns for the frame produced by
      loop_over_monte_carlo_samples().
      """
      return self.predict_shapFlex(self.loop_over_monte_carlo_samples())




import pandas as pd
import numpy as np
# Load the dataset; the first CSV column is used as the row index.
data = pd.read_csv('https://kolodezev.ru/download/data_adult.csv', index_col=0)
outcome_name = 'income'
# Position of the outcome column within data.columns.
outcome_col = pd.Series(data.columns)[data.columns==outcome_name].index[0]
X = data.drop(outcome_name, axis=1)
y = data[outcome_name].values
# Object-dtype columns are handed to CatBoost as categorical features.
cat_features = [name for name, dtype in X.dtypes.items() if dtype == 'object']
model = CatBoostClassifier()
model.fit(X, y, cat_features=cat_features, verbose=False)
def predict_function(model, data):
  """Return the first column of model.predict_proba(data) as a one-column DataFrame."""
  # Sanity check from the author: pd.DataFrame(model.predict_proba(X)).loc[:, 0][9]
  # gives 0.98, matching the output for row 9. Confidence that high, whatever
  # the algorithm, identifies the returned column unambiguously.
  probabilities = model.predict_proba(data)
  first_class = probabilities[:, [0]]
  return pd.DataFrame(first_class)


# Feature columns only (every column but the last); the first 350 rows are explained.
# NOTE(review): assumes the outcome is the last column of `data` - confirm.
explain = data.iloc[:350, :data.shape[1]-1]
# Prepared here but not passed to shapFlex_plus below, so the class falls back to `explain`.
reference = data.iloc[:, :data.shape[1]-1]
sample_size = 60
target_features = pd.Series(["marital_status", "education", "relationship",  "native_country",
                     "age", "sex", "race", "hours_per_week"])
# Edge list of the causal graph: each of the four causes points at each of the three effects.
causal = pd.DataFrame({
  'cause': pd.Series(["age", "sex", "race", "native_country"] * 3),
  'effect': pd.Series(np.repeat(["marital_status", "education", "relationship"], 4)),
})
exmpl_of_test = shapFlex_plus(
  explain,
  model,
  predict_function,
  target_features=target_features,
  causal=causal,
  causal_weights=[0.5] * len(causal),
)
data_predict = exmpl_of_test.loop_over_monte_carlo_samples()
data_predicted = exmpl_of_test.predict_shapFlex(data_predict)
print(data_predicted)

In [None]:
#28.03
#already fails inside predict_shapFlex because the causal branch is not implemented
import numpy as np
import pandas as pd
import igraph
import itertools
from catboost import CatBoostClassifier

class shapFlex_plus:
    def __init__(self, explain,  model, predict_function, reference = None, target_features = None, \
                     causal = None, causal_weights = None, sample_size = None, use_future = None):
        self.explain = explain
        self.reference = reference if reference else explain
        self.model = model
        self.predict_function = predict_function
        self.target_features = target_features if isinstance(target_features, pd.core.series.Series) else explain.columns.tolist()
        self.causal = causal #if causal else None
        self.causal_weights = causal_weights #if causal_weights else None
        self.sample_size = sample_size if sample_size else 60
        self.use_future = use_future if isinstance(target_features, pd.core.series.Series) else False
        
        self.n_features = self.explain.shape[1]
        self.n_instances = self.reference.shape[0]

        self.causal_graph = igraph.Graph.DataFrame(self.causal, directed=True) if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v for v in self.causal_graph.vs] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.each_node_causes = {v['name']: [succ['name'] for succ in v.successors()] for v in self.nodes if v.successors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# надо уточнить, мб здесь не только "прямые" successors и predecessors ищутся 
        self.each_node_is_an_effect_from = {v['name']: [pred['name'] for pred in v.predecessors()] for v in self.nodes if v.predecessors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# но и вообще все
        # имена, кажется, уже прописаны автоматически
        self.causal_nodes = [v for v in self.each_node_causes.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.effect_nodes = [v for v in self.each_node_is_an_effect_from.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v['name'] for v in self.nodes] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]

    @staticmethod
    def unlist_df(data):
      unlisted_df = pd.Series(
                  data,
                  index=[
                  index_col + index_row for index_col, index_row in itertools.product(
                      [str(x) for x in range(data.shape[0])], 
                      [str(x) for x in data.columns])]
              )
      return unlisted_df
      
    def loop_over_monte_carlo_samples(self):
      i_size = self.sample_size
      j_size = len(self.target_features)
      data_sample = []

      for i in range(i_size):
        reference_index = np.random.choice(np.arange(0, self.n_features ), size=1, replace=False)
        feature_indices_random = np.random.choice(np.arange(0, self.n_features), size=self.n_features, replace=False)
        # r индексация стартует с 1 а питон с 0 поэтому нам нужно вычиать 1 или ставить по верхней границе индексы в зависимости от функции вызова
        feature_names_random = self.explain.columns[feature_indices_random].values
        reference_instance = self.reference.iloc[reference_index, feature_indices_random]
        #feature_indices_random это вектор индексов
        explain_instances = self.explain.iloc[:, feature_indices_random]
        data_sample_feature = []
        for j in range(j_size):
          target_feature_index =  self.explain.columns.get_loc(self.target_features[j])
          target_feature_index_shuffled = list(self.explain.columns.values[feature_indices_random]).index(self.target_features[j])
          #if True:
          #  print(target_feature_index)
          # target_feature_index = (self.explain.columns == self.target_features[j])
          # target_feature_index_shuffled = (self.explain.columns[feature_indices_random] == self.target_features[j])
          
          if self.target_features[j] in self.nodes:
            #unlist как я понял, вытягивает все данные в один длинный вектор, присваивает индексы как название колонки + название строки
            #предположу, что each_node_causes это pd.DataFrame()
            target_feature_causes_these_features =  [self.target_features[j]] + self.each_node_causes.get(self.target_features[j], []) 
            target_feature_is_caused_by =  [self.target_features[j]] + self.each_node_is_an_effect_from.get(self.target_features[j], []) 
            target_index = target_feature_index_shuffled
            #отмечаем те значения feature_names_random которые равны последнему значению 
            #target_feature_is_caused_by. target_feature_is_caused_by вроде как вектор
            #вернуться должно число. Если вдруг окажется, что датафрейм, -1 элемент будет строка, 
            #надо заменить на индексацию на iloc, == на .isin
            causes_indices = np.where(np.in1d(feature_names_random, target_feature_is_caused_by[1:]))[0]
            effects_indices  = np.where(np.in1d(feature_names_random, target_feature_causes_these_features[1:]))[0]
            sample_indices = feature_indices_random[~np.isin(feature_indices_random, 
                np.concatenate([[target_index], causes_indices, effects_indices]))]
            #c() вроде как склеивает вектор(ы) и переменные
            sample_real_indices = sample_indices[sample_indices < target_index]  # Not in causal diagram, feature data from 'explain'.
            sample_fake_indices = sample_indices[sample_indices > target_index]  # Not in causal diagram, feature data from 'reference'.

            feature_indices_real_causes_real_effects = np.concatenate([sample_real_indices, causes_indices, effects_indices, [target_index], sample_fake_indices])
            feature_indices_real_causes_fake_effects = np.concatenate([sample_real_indices, causes_indices, [target_index], effects_indices, sample_fake_indices])
            feature_indices_fake_causes_real_effects = np.concatenate([sample_real_indices, effects_indices, [target_index], causes_indices, sample_fake_indices])
            feature_indices_fake_causes_fake_effects = np.concatenate([sample_real_indices, [target_index], causes_indices, effects_indices, sample_fake_indices])
          
          if not self.target_features[j] in self.nodes:
            explain_instance_real_target = explain_instances.copy()

            # Only create a Frankenstein instance if the target is not the last feature and there is actually
            # one or more features to the right of the target to replace with the reference.
            if (target_feature_index_shuffled < self.n_features):
              #x = reference_instance.iloc[:, target_feature_index_shuffled: ]
              explain_instance_real_target.iloc[:, target_feature_index_shuffled+1: ] =\
                 pd.concat([reference_instance.iloc[:, target_feature_index_shuffled+1: ]] * self.explain.shape[0], axis=0).reset_index(drop=True)
              
            # These instances are otherwise the same as the Frankenstein instance created above with the
            # exception that the target feature is now replaced with the target feature in the random reference
            # instance. The difference in model predictions between these two Frankenstein instances is
            # what gives us the stochastic Shapley value approximation.
            explain_instance_fake_target = explain_instance_real_target.copy()
            
            # ОНИ ПОЧЕМУ ТО ВЫШЛИ ОДИНАКОВЫЕ, ЭТО ОК?
            explain_instance_fake_target.iloc[:, [target_feature_index_shuffled]] =\
               pd.concat([reference_instance.iloc[:, [target_feature_index_shuffled]]]  * self.explain.shape[0], axis=0).reset_index(drop=True)
          
          else:

            if self.target_features[j] in self.causal_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              target_index_temp = explain_instance_real_causes_fake_effects_real_target.columns.get_loc(self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_real_causes_fake_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]  * self.explain.shape[0], axis=0).reset_index(drop=True)
                

              explain_instance_real_causes_fake_effects_fake_target = explain_instance_real_causes_fake_effects_real_target
              explain_instance_real_causes_fake_effects_fake_target.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]]  * self.explain.shape[0], axis=0).reset_index(drop=True)

              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target_cause = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = explain_instance_real_causes_fake_effects_real_target.columns.get_loc(self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target_cause.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features+1]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
              
              explain_instance_fake_causes_real_effects_fake_target_cause = explain_instance_fake_causes_real_effects_real_target_cause
              explain_instance_fake_causes_real_effects_fake_target_cause.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)

            if self.target_features[j] in self.effect_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target_effect = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]

              target_index_temp = explain_instance_real_causes_fake_effects_real_target_effect.columns.get_loc(self.target_features[j])

              if (target_index_temp < self.n_features):
                explain_instance_real_causes_fake_effects_real_target_effect.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
              
              explain_instance_real_causes_fake_effects_fake_target_effect = explain_instance_real_causes_fake_effects_real_target_effect
              explain_instance_real_causes_fake_effects_fake_target_effect.iloc[:, target_index_temp] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = explain_instance_fake_causes_real_effects_real_target.columns.get_loc(self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
                

              explain_instance_fake_causes_real_effects_fake_target = explain_instance_fake_causes_real_effects_real_target
              explain_instance_fake_causes_real_effects_fake_target.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)

          if not self.target_features[j] in self.nodes:
            #магическим образом две нижеследующие строчки возвращают датафрейм к старому виду
            explain_instance_real_target = explain_instance_real_target.loc[:, self.explain.columns]
            explain_instance_fake_target = explain_instance_fake_target.loc[:, self.explain.columns]
            data_explain_instance = pd.concat([explain_instance_real_target, explain_instance_fake_target], axis=0).reset_index(drop=True)#, ignore_index=True)
            #вот тут не совсем понятно, индекс это число или строка, индексы в data_explain_instance это числа или строки? в любом случае, при запуске можно починить
            data_explain_instance['index'] = np.tile(np.arange(0, self.explain.shape[0]), 2) 
            data_explain_instance['feature_group'] = np.repeat(['real_target', 'fake_target'], repeats=self.explain.shape[0])
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 0
            data_explain_instance['causal_type'] = None

          else:
            if self.target_features[j] in self.causal_nodes:
              explain_instance_real_causes_fake_effects_real_target =\
              explain_instance_real_causes_fake_effects_real_target.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target =\
              explain_instance_real_causes_fake_effects_fake_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target_cause =\
              explain_instance_fake_causes_real_effects_real_target_cause.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target_cause =\
              explain_instance_fake_causes_real_effects_fake_target_cause.loc[:, self.explain.columns]

            if self.target_features[j] in self.effect_nodes:
              explain_instance_real_causes_fake_effects_real_target_effect =\
              explain_instance_real_causes_fake_effects_real_target_effect.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target_effect =\
              explain_instance_real_causes_fake_effects_fake_target_effect.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target =\
              explain_instance_fake_causes_real_effects_real_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target =\
              explain_instance_fake_causes_real_effects_fake_target.loc[:, self.explain.columns]

            if self.target_features[j] in self.causal_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause], axis=0
              ).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",
                                                          "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_a_cause"

            if self.target_features[j] in self.effect_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",
                                                          "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_an_effect"

            if (self.target_features[j] in self.causal_nodes) and (self.target_features[j] in self.effect_nodes):
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause,
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 8)  # Eight Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series([
                "real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",  # Target is a causal node.
                "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause",  # Target is a causal node.
                "real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",  # Target is an effect node.
                "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"  # Target is an effect node.
                ]),
              self.explain.shape[0])
              data_explain_instance['causal_type'] = np.tile(pd.Series([
                "target_is_a_cause", "target_is_a_cause", "target_is_a_cause", "target_is_a_cause",
                "target_is_an_effect", "target_is_an_effect", "target_is_an_effect", "target_is_an_effect"]
              ),
              self.explain.shape[0])
            
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 1

          data_explain_instance['sample'] = i
          data_sample_feature.append(data_explain_instance)

        data_sample.append(data_sample_feature)

      data_sample = pd.concat([pd.concat(data_sample_i, axis=0) for data_sample_i in data_sample], axis=0)
      return data_sample

    def predict_shapFlex(self, data_predict):
      """Score the Frankenstein instances and aggregate them into Shapley estimates.

      Uses self.reference, self.model, self.predict_function and self.n_features.

      Parameters
      ----------
      data_predict : pandas.DataFrame
          The first ``self.n_features`` columns are model inputs; the remaining
          columns are meta data and must include 'index', 'sample',
          'feature_name', 'feature_group' and 'causal'.

      Returns
      -------
      pandas.DataFrame
          Indexed by ('index', 'feature_name') with the columns
          ('shap_effect', 'std'), ('shap_effect', 'mean') and
          ('shap_effect', 'intercept').
      """
      # Row labels may repeat after concatenating samples; predictions are
      # matched back to rows by label, so the labels have to be unique.
      data_predict = data_predict.reset_index(drop=True)
      data_model = data_predict.iloc[:, :self.n_features].copy()
      data_meta = data_predict.iloc[:, self.n_features:].copy()
      data_predicted = pd.DataFrame(self.predict_function(self.model, data_model), index=data_model.index)
      data_predicted = pd.concat([data_meta, data_predicted], axis=1)

      # The intercept is the mean of the first prediction column over the reference data.
      reference_predictions = pd.DataFrame(self.predict_function(self.model, self.reference))
      intercept = reference_predictions.iloc[:, 0].mean(skipna=True)
      # The prediction column is the last one after the concat above.
      user_fun_y_pred_name = data_predicted.columns[-1]

      # pivot_table drops rows whose key columns are missing, so rows without
      # a causal type get an explicit label first.
      if 'causal_type' not in data_predicted.columns:
        data_predicted['causal_type'] = None
      data_predicted['causal_type'] = data_predicted['causal_type'].fillna('non_causal')

      # One row per (instance, sample, feature); one column per feature group.
      # 'index' has to be a key, otherwise instances are averaged together.
      data_predicted = data_predicted.pivot_table(
        index=['index', 'sample', 'causal', 'causal_type', 'feature_name'],
        columns=['feature_group'],
        values=user_fun_y_pred_name
      ).reset_index()

      data_predicted['shap_effect'] = np.nan
      non_causal = data_predicted['causal'] == 0
      if non_causal.any():
        data_predicted.loc[non_causal, 'shap_effect'] = (
          data_predicted.loc[non_causal, 'real_target'] - data_predicted.loc[non_causal, 'fake_target']
        )
      # TODO: the weighted causal effects are not implemented yet; causal rows
      # keep a missing shap_effect and are removed by dropna() below.

      data_predicted = data_predicted.loc[:, ['index', 'sample', 'feature_name', 'shap_effect']]
      data_predicted = data_predicted.dropna(axis=0).groupby(['index', 'feature_name']).agg({'shap_effect': ['std', 'mean']})
      data_predicted[('shap_effect', 'intercept')] = intercept

      return data_predicted

    def forward(self):
      """Run the full pipeline: build the Frankenstein instances, then score them."""
      return self.predict_shapFlex(self.loop_over_monte_carlo_samples())




import pandas as pd
import numpy as np
# Load the adult-income data set and fit a CatBoost classifier on all columns
# except the outcome.
data = pd.read_csv('https://kolodezev.ru/download/data_adult.csv', index_col=0)
outcome_name = 'income'
# Position of the outcome column inside `data`.
outcome_col = pd.Series(data.columns)[data.columns==outcome_name].index[0]
y = data[outcome_name].values
X = data.drop(outcome_name, axis=1)
# Object-typed columns are passed to CatBoost as categorical features.
cat_features = [column for column, dtype in X.dtypes.items() if dtype == 'object']
model = CatBoostClassifier()
model.fit(X, y, cat_features=cat_features, verbose=False)
def predict_function(model, data):
  """Return the first column of ``model.predict_proba(data)`` as a one-column DataFrame."""
  # Running pd.DataFrame(model.predict_proba(X)).loc[:, 0][9] gives 0.98, which
  # matches the output for row 9; such a high confidence, whatever the algorithm,
  # identifies the returned column unambiguously.
  class_probabilities = model.predict_proba(data)
  return pd.DataFrame(class_probabilities[:, [0]])


# Explain the first 350 rows; every row, minus the last (outcome) column, is
# kept as reference data.
reference = data.iloc[:, :-1]
explain = data.iloc[:350, :-1]
sample_size = 60
target_features = pd.Series([
  "marital_status", "education", "relationship", "native_country",
  "age", "sex", "race", "hours_per_week",
])
# Edge list of the causal diagram: each of the four demographic attributes is a
# cause of each of the three effect features.
causal = pd.DataFrame({
  "cause": pd.Series(["age", "sex", "race", "native_country"] * 3),
  "effect": pd.Series(np.repeat(["marital_status", "education", "relationship"], 4)),
})
exmpl_of_test = shapFlex_plus(
  explain, model, predict_function,
  target_features=target_features,
  causal=causal,
  causal_weights=[0.5] * len(causal),
)
data_predict = exmpl_of_test.loop_over_monte_carlo_samples()
data_predicted = exmpl_of_test.predict_shapFlex(data_predict)
print(data_predicted)

In [None]:
# Load the predictions exported from an earlier run and summarise the columns.
df = pd.read_csv("/content/data_predicted.csv")
df.info()
#  data_predicted.pivot_table(
#         index=set(data_predicted.columns) - set(['index', 'feature_group', user_fun_y_pred_name]),
#         columns=['feature_group'],
#         values=user_fun_y_pred_name
#       )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Unnamed: 0       630000 non-null  int64  
 1   index_in_sample  630000 non-null  int64  
 2   feature_group    630000 non-null  object 
 3   causal_type      588000 non-null  object 
 4   feature_name     630000 non-null  object 
 5   causal           630000 non-null  int64  
 6   sample           630000 non-null  int64  
 7   0                630000 non-null  float64
dtypes: float64(1), int64(4), object(3)
memory usage: 38.5+ MB


In [None]:
# Fill missing values with 0 before pivoting, presumably so that rows without a
# causal_type are kept as pivot keys; then spread the predictions so that each
# feature group becomes its own column.
df2 = df.fillna(0).pivot_table(
  index=['causal', 'sample', 'causal_type', 'feature_name', 'index_in_sample'],
  columns=['feature_group'],
  values='0',
)
df2.reset_index()

feature_group,causal,sample,causal_type,feature_name,index_in_sample,fake_causes_real_effects_fake_target,fake_causes_real_effects_fake_target_cause,fake_causes_real_effects_real_target,fake_causes_real_effects_real_target_cause,fake_target,real_causes_fake_effects_fake_target,real_causes_fake_effects_fake_target_effect,real_causes_fake_effects_real_target,real_causes_fake_effects_real_target_effect,real_target
0,0,0,0,hours_per_week,0,,,,,0.010056,,,,,0.023626
1,0,0,0,hours_per_week,1,,,,,0.004665,,,,,0.020725
2,0,0,0,hours_per_week,2,,,,,0.043046,,,,,0.116128
3,0,0,0,hours_per_week,3,,,,,0.015118,,,,,0.021263
4,0,0,0,hours_per_week,4,,,,,0.006803,,,,,0.006771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167995,1,59,target_is_an_effect,relationship,346,0.065126,,,,,,0.065126,,,
167996,1,59,target_is_an_effect,relationship,347,,,0.034478,,,,,,0.034478,
167997,1,59,target_is_an_effect,relationship,348,0.094700,,,,,,0.094700,,,
167998,1,59,target_is_an_effect,relationship,349,,,0.043508,,,,,,0.043508,


In [None]:
#30.03

import numpy as np
import pandas as pd
import igraph
import itertools
from catboost import CatBoostClassifier

class shapFlex_plus:
    def __init__(self, explain,  model, predict_function, reference = None, target_features = None, \
                     causal = None, causal_weights = None, sample_size = None, use_future = None):
        self.explain = explain
        self.reference = reference if reference else explain
        self.model = model
        self.predict_function = predict_function
        self.target_features = target_features if isinstance(target_features, pd.core.series.Series) else explain.columns.tolist()
        self.causal = causal #if causal else None
        self.causal_weights = causal_weights #if causal_weights else None
        self.sample_size = sample_size if sample_size else 60
        self.use_future = use_future if isinstance(target_features, pd.core.series.Series) else False
        
        self.n_features = self.explain.shape[1]
        self.n_instances = self.reference.shape[0]

        self.causal_graph = igraph.Graph.DataFrame(self.causal, directed=True) if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v for v in self.causal_graph.vs] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.each_node_causes = {v['name']: [succ['name'] for succ in v.successors()] for v in self.nodes if v.successors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# надо уточнить, мб здесь не только "прямые" successors и predecessors ищутся 
        self.each_node_is_an_effect_from = {v['name']: [pred['name'] for pred in v.predecessors()] for v in self.nodes if v.predecessors()} if isinstance(self.causal, pd.core.frame.DataFrame) else [None]# но и вообще все
        # имена, кажется, уже прописаны автоматически
        self.causal_nodes = [v for v in self.each_node_causes.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.effect_nodes = [v for v in self.each_node_is_an_effect_from.keys()] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]
        self.nodes = [v['name'] for v in self.nodes] if isinstance(self.causal, pd.core.frame.DataFrame) else [None]

    @staticmethod
    def unlist_df(data):
      unlisted_df = pd.Series(
                  data,
                  index=[
                  index_col + index_row for index_col, index_row in itertools.product(
                      [str(x) for x in range(data.shape[0])], 
                      [str(x) for x in data.columns])]
              )
      return unlisted_df
      
    def loop_over_monte_carlo_samples(self):
      i_size = self.sample_size
      j_size = len(self.target_features)
      data_sample = []

      for i in range(i_size):
        reference_index = np.random.choice(np.arange(0, self.n_features ), size=1, replace=False)
        feature_indices_random = np.random.choice(np.arange(0, self.n_features), size=self.n_features, replace=False)
        # r индексация стартует с 1 а питон с 0 поэтому нам нужно вычиать 1 или ставить по верхней границе индексы в зависимости от функции вызова
        feature_names_random = self.explain.columns[feature_indices_random].values
        reference_instance = self.reference.iloc[reference_index, feature_indices_random]
        #feature_indices_random это вектор индексов
        explain_instances = self.explain.iloc[:, feature_indices_random]
        data_sample_feature = []
        for j in range(j_size):
          target_feature_index =  self.explain.columns.get_loc(self.target_features[j])
          target_feature_index_shuffled = list(self.explain.columns.values[feature_indices_random]).index(self.target_features[j])
          #if True:
          #  print(target_feature_index)
          # target_feature_index = (self.explain.columns == self.target_features[j])
          # target_feature_index_shuffled = (self.explain.columns[feature_indices_random] == self.target_features[j])
          
          if self.target_features[j] in self.nodes:
            #unlist как я понял, вытягивает все данные в один длинный вектор, присваивает индексы как название колонки + название строки
            #предположу, что each_node_causes это pd.DataFrame()
            target_feature_causes_these_features =  [self.target_features[j]] + self.each_node_causes.get(self.target_features[j], []) 
            target_feature_is_caused_by =  [self.target_features[j]] + self.each_node_is_an_effect_from.get(self.target_features[j], []) 
            target_index = target_feature_index_shuffled
            #отмечаем те значения feature_names_random которые равны последнему значению 
            #target_feature_is_caused_by. target_feature_is_caused_by вроде как вектор
            #вернуться должно число. Если вдруг окажется, что датафрейм, -1 элемент будет строка, 
            #надо заменить на индексацию на iloc, == на .isin
            causes_indices = np.where(np.in1d(feature_names_random, target_feature_is_caused_by[1:]))[0]
            effects_indices  = np.where(np.in1d(feature_names_random, target_feature_causes_these_features[1:]))[0]
            sample_indices = feature_indices_random[~np.isin(feature_indices_random, 
                np.concatenate([[target_index], causes_indices, effects_indices]))]
            #c() вроде как склеивает вектор(ы) и переменные
            sample_real_indices = sample_indices[sample_indices < target_index]  # Not in causal diagram, feature data from 'explain'.
            sample_fake_indices = sample_indices[sample_indices > target_index]  # Not in causal diagram, feature data from 'reference'.

            feature_indices_real_causes_real_effects = np.concatenate([sample_real_indices, causes_indices, effects_indices, [target_index], sample_fake_indices])
            feature_indices_real_causes_fake_effects = np.concatenate([sample_real_indices, causes_indices, [target_index], effects_indices, sample_fake_indices])
            feature_indices_fake_causes_real_effects = np.concatenate([sample_real_indices, effects_indices, [target_index], causes_indices, sample_fake_indices])
            feature_indices_fake_causes_fake_effects = np.concatenate([sample_real_indices, [target_index], causes_indices, effects_indices, sample_fake_indices])
          
          if not self.target_features[j] in self.nodes:
            explain_instance_real_target = explain_instances.copy()

            # Only create a Frankenstein instance if the target is not the last feature and there is actually
            # one or more features to the right of the target to replace with the reference.
            if (target_feature_index_shuffled < self.n_features):
              #x = reference_instance.iloc[:, target_feature_index_shuffled: ]
              explain_instance_real_target.iloc[:, target_feature_index_shuffled+1: ] =\
                 pd.concat([reference_instance.iloc[:, target_feature_index_shuffled+1: ]] * self.explain.shape[0], axis=0).reset_index(drop=True)
              
            # These instances are otherwise the same as the Frankenstein instance created above with the
            # exception that the target feature is now replaced with the target feature in the random reference
            # instance. The difference in model predictions between these two Frankenstein instances is
            # what gives us the stochastic Shapley value approximation.
            explain_instance_fake_target = explain_instance_real_target.copy()
            
            # ОНИ ПОЧЕМУ ТО ВЫШЛИ ОДИНАКОВЫЕ, ЭТО ОК?
            explain_instance_fake_target.iloc[:, [target_feature_index_shuffled]] =\
               pd.concat([reference_instance.iloc[:, [target_feature_index_shuffled]]]  * self.explain.shape[0], axis=0).reset_index(drop=True)
          
          else:

            if self.target_features[j] in self.causal_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              target_index_temp = explain_instance_real_causes_fake_effects_real_target.columns.get_loc(self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_real_causes_fake_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]  * self.explain.shape[0], axis=0).reset_index(drop=True)
                

              explain_instance_real_causes_fake_effects_fake_target = explain_instance_real_causes_fake_effects_real_target
              explain_instance_real_causes_fake_effects_fake_target.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]]  * self.explain.shape[0], axis=0).reset_index(drop=True)

              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target_cause = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = explain_instance_real_causes_fake_effects_real_target.columns.get_loc(self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target_cause.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features+1]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
              
              explain_instance_fake_causes_real_effects_fake_target_cause = explain_instance_fake_causes_real_effects_real_target_cause
              explain_instance_fake_causes_real_effects_fake_target_cause.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)

            if self.target_features[j] in self.effect_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target_effect = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]

              target_index_temp = explain_instance_real_causes_fake_effects_real_target_effect.columns.get_loc(self.target_features[j])

              if (target_index_temp < self.n_features):
                explain_instance_real_causes_fake_effects_real_target_effect.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
              
              explain_instance_real_causes_fake_effects_fake_target_effect = explain_instance_real_causes_fake_effects_real_target_effect
              explain_instance_real_causes_fake_effects_fake_target_effect.iloc[:, target_index_temp] =\
                pd.concat([reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = explain_instance_fake_causes_real_effects_real_target.columns.get_loc(self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features + 1]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)
                

              explain_instance_fake_causes_real_effects_fake_target = explain_instance_fake_causes_real_effects_real_target
              explain_instance_fake_causes_real_effects_fake_target.iloc[:, target_index_temp] =\
              pd.concat([reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]]*self.explain.shape[0],
                axis=0).reset_index(drop=True)

          if not self.target_features[j] in self.nodes:
            #магическим образом две нижеследующие строчки возвращают датафрейм к старому виду
            explain_instance_real_target = explain_instance_real_target.loc[:, self.explain.columns]
            explain_instance_fake_target = explain_instance_fake_target.loc[:, self.explain.columns]
            data_explain_instance = pd.concat([explain_instance_real_target, explain_instance_fake_target], axis=0).reset_index(drop=True)#, ignore_index=True)
            #вот тут не совсем понятно, индекс это число или строка, индексы в data_explain_instance это числа или строки? в любом случае, при запуске можно починить
            data_explain_instance['index_in_sample'] = np.tile(np.arange(0, self.explain.shape[0]), 2) 
            data_explain_instance['feature_group'] = np.repeat(['real_target', 'fake_target'], repeats=self.explain.shape[0])
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 0
            data_explain_instance['causal_type'] = None

          else:
            if self.target_features[j] in self.causal_nodes:
              explain_instance_real_causes_fake_effects_real_target =\
              explain_instance_real_causes_fake_effects_real_target.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target =\
              explain_instance_real_causes_fake_effects_fake_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target_cause =\
              explain_instance_fake_causes_real_effects_real_target_cause.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target_cause =\
              explain_instance_fake_causes_real_effects_fake_target_cause.loc[:, self.explain.columns]

            if self.target_features[j] in self.effect_nodes:
              explain_instance_real_causes_fake_effects_real_target_effect =\
              explain_instance_real_causes_fake_effects_real_target_effect.loc[:, self.explain.columns]
              explain_instance_real_causes_fake_effects_fake_target_effect =\
              explain_instance_real_causes_fake_effects_fake_target_effect.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_real_target =\
              explain_instance_fake_causes_real_effects_real_target.loc[:, self.explain.columns]
              explain_instance_fake_causes_real_effects_fake_target =\
              explain_instance_fake_causes_real_effects_fake_target.loc[:, self.explain.columns]

            if self.target_features[j] in self.causal_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause], axis=0
              ).reset_index(drop=True)
              data_explain_instance['index_in_sample'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",
                                                          "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_a_cause"

            if self.target_features[j] in self.effect_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index_in_sample'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series(["real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",
                                                          "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"]),
                                                        self.explain.shape[0])
              data_explain_instance['causal_type'] = "target_is_an_effect"

            if (self.target_features[j] in self.causal_nodes) and (self.target_features[j] in self.effect_nodes):
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause,
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0).reset_index(drop=True)
              data_explain_instance['index_in_sample'] = np.tile(np.arange(1, self.explain.shape[0] + 1), 8)  # Eight Frankenstein instances per explained instance.
              data_explain_instance['feature_group'] = np.tile(pd.Series([
                "real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",  # Target is a causal node.
                "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause",  # Target is a causal node.
                "real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",  # Target is an effect node.
                "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"  # Target is an effect node.
                ]),
              self.explain.shape[0])
              data_explain_instance['causal_type'] = np.tile(pd.Series([
                "target_is_a_cause", "target_is_a_cause", "target_is_a_cause", "target_is_a_cause",
                "target_is_an_effect", "target_is_an_effect", "target_is_an_effect", "target_is_an_effect"]
              ),
              self.explain.shape[0])
            
            data_explain_instance['feature_name'] = self.target_features[j]
            data_explain_instance['causal'] = 1

          data_explain_instance['sample'] = i
          data_sample_feature.append(data_explain_instance)

        data_sample.append(data_sample_feature)

      data_sample = pd.concat([pd.concat(data_sample_i, axis=0) for data_sample_i in data_sample], axis=0).reset_index(drop=True)
      return data_sample

    def predict_shapFlex(self, data_predict):
      '''есть self.reference, self.model, self.predict_function, self.n_features, self.causal, self.causal_weights'''
      data_model = data_predict.iloc[:, :self.n_features].copy()
      data_meta = data_predict.iloc[:, self.n_features:].copy()
      data_predicted = pd.DataFrame(predict_function(self.model, data_model), index=data_model.index)
      data_predicted = pd.concat([data_meta, data_predicted], axis=1)
      #мб придется править, в зависимости от формата входных данных (вектор-строка/-столбец), пока результат по всем измерениям, скаляр
      intercept = predict_function(self.model, self.reference).mean(skipna=True)
      #вмест data.shape[1] взял -1
      #костыль, не понимаю, что тут должно быть пока
      user_fun_y_pred_name = data_predicted.columns[-1]
      #тут нюанс: у них перед вэлью !! стоит, что значит значение которое за ними следует, это не значение, а expression, что бы это 
      # ни значило, соответсвенно, может беда быть
      #data_predicted = pd.concat([
      #  data_predicted.drop('feature_group', axis=1), 
      #  data_predicted.reset_index().pivot_table(index='index', columns=[ 'feature_group'], values=user_fun_y_pred_name)
      #  ], axis=1)

      data_predicted = data_predicted.pivot_table(
        index=['causal_type', 'sample', 'causal', 'feature_name'],#set(data_predicted.columns) - set(['index', 'feature_group', user_fun_y_pred_name]),
        columns=['feature_group'],
        values=user_fun_y_pred_name
      ).reset_index()
      data_non_causal = data_predicted.loc[data_predicted['causal']==0]
      data_non_causal['shap_effect'] = data_non_causal['real_target'] - data_non_causal['fake_target']
      data_causal = data_predicted.loc[data_predicted['causal']==1]

      if isinstance(self.causal, pd.core.frame.DataFrame):
        pass

      data_predicted = pd.concat([data_causal, data_non_causal], ignore_index=True, axis=0)
      data_predicted = data_predicted.loc[:, ['index_in_sample', 'sample', 'feature_name', 'shap_effect']]

      data_predicted = data_predicted.reset_index().dropna(axis=0).groupby(['index_in_sample', 'feature_name']).agg({'shap_effect': [np.std, np.mean]})
      data_predicted[('shap_effect', 'intercept')] = intercept[0]

      return data_predicted

    def forward(self):
      data_predict = self.loop_over_monte_carlo_samples()
      data_predicted = self.predict_shapFlex(data_predict)
      return data_predicted




import pandas as pd
import numpy as np
# Download the data set (first CSV column is the row index) and fit a CatBoost classifier on it.
data = pd.read_csv('https://kolodezev.ru/download/data_adult.csv', index_col=0)
outcome_name = 'income'
# Position of the outcome column among the data columns.
outcome_col = pd.Series(data.columns).index[data.columns == outcome_name][0]
X = data.drop(outcome_name, axis=1)
y = data[outcome_name].values
# Columns with object dtype are handed to CatBoost as categorical features.
cat_features = X.columns[X.dtypes == 'object'].tolist()
model = CatBoostClassifier()
model.fit(X, y, cat_features=cat_features, verbose=False)
def predict_function(model, data):
  """Return column 0 of ``model.predict_proba(data)`` as a one-column DataFrame.

  Author's note (translated): running
  ``pd.DataFrame(model.predict_proba(X)).loc[:, 0][9]`` gives 0.98, which
  matches the output for row 9; whatever the algorithm, such a high confidence
  identifies the returned column unambiguously.
  """
  probabilities = model.predict_proba(data)
  first_column = probabilities[:, [0]]  # list index keeps the result two-dimensional
  return pd.DataFrame(first_column)


# Rows to explain: the first 300; reference data: every row. Both drop the last column.
explain = data.iloc[:300, :data.shape[1]-1]
reference = data.iloc[:, :data.shape[1]-1]
# NOTE(review): `reference` and `sample_size` are defined here but are not passed to shapFlex_plus below.
sample_size = 10
target_features = pd.Series(["marital_status", "education", "relationship",  "native_country",
                     "age", "sex", "race", "hours_per_week"])
# Edge list of the causal graph: each of the four causes points at each of the three effects.
causal = pd.DataFrame({
  'cause': pd.Series(["age", "sex", "race", "native_country"] * 3),
  'effect': pd.Series(np.repeat(["marital_status", "education", "relationship"], 4)),
})
exmpl_of_test = shapFlex_plus(
  explain, model, predict_function,
  target_features=target_features,
  causal=causal,
  causal_weights=[0.5] * len(causal),  # one weight per causal edge
)
data_predict = exmpl_of_test.loop_over_monte_carlo_samples()
data_predicted = exmpl_of_test.predict_shapFlex(data_predict)
#print(data_predicted)

In [None]:
#06.04
import numpy as np
import pandas as pd
import igraph
import itertools
from catboost import CatBoostClassifier

class shapFlex_plus:
    """Approximate Shapley values for a single model by Monte Carlo sampling.

    For every Monte Carlo draw and every target feature, "Frankenstein" rows are
    built that mix feature values of the explained rows with those of one
    randomly drawn reference row.  Two such rows that differ only in the target
    feature are sent through the model; the difference of the two predictions
    is one stochastic estimate of the target feature's Shapley value.

    When a causal edge list is supplied, features that appear in it are
    evaluated under two feature orderings (causes placed before the target and
    effects after it, and the other way round) and the two estimates are
    blended with ``causal_weights``.
    """

    # Internal column label that holds the model output while predictions are reshaped.
    _PREDICTION = '_prediction'
    # Columns that identify one (explained row, draw, feature) cell of the long table.
    _KEYS = ['index_in_sample', 'sample', 'feature_name', 'causal', 'causal_type']
    # Stand-in for the missing ``causal_type`` of non-causal rows; pivoting drops NaN keys.
    _NOT_CAUSAL = 'not_causal'

    def __init__(self, explain, model, predict_function, reference=None, target_features=None,
                 causal=None, causal_weights=None, sample_size=None, use_future=None):
        """Store the inputs and pre-compute the causal look-up tables.

        Args:
            explain: DataFrame with the rows to explain (feature columns only).
            model: Fitted model; only ever passed on to ``predict_function``.
            predict_function: Callable ``(model, data)`` returning the predictions
                for ``data``; the last column of its output is used.
            reference: DataFrame to draw reference rows from.  Columns are
                accessed by position, so they must be laid out like ``explain``.
                Defaults to ``explain``.
            target_features: Names of the features to compute values for (Series,
                list or single name).  Defaults to every column of ``explain``.
            causal: Optional two-column DataFrame of edges (cause, effect).
            causal_weights: One weight per row of ``causal``.
            sample_size: Number of Monte Carlo draws; 60 when falsy.
            use_future: Stored as a flag; not used by this class.
        """
        self.explain = explain
        # `reference if reference else explain` raised ValueError for any DataFrame,
        # because the truth value of a DataFrame is ambiguous.
        self.reference = explain if reference is None else reference
        self.model = model
        self.predict_function = predict_function
        if target_features is None:
            self.target_features = explain.columns.tolist()
        elif isinstance(target_features, pd.Series):
            self.target_features = target_features
        elif isinstance(target_features, str):
            self.target_features = [target_features]
        else:
            # Lists/tuples used to be ignored silently in favour of all columns.
            self.target_features = list(target_features)
        self.causal = causal
        self.causal_weights = causal_weights
        self.sample_size = sample_size if sample_size else 60
        # Previously gated on the type of `target_features` (copy-paste slip).
        self.use_future = False if use_future is None else use_future

        self.n_features = self.explain.shape[1]
        self.n_instances = self.reference.shape[0]

        if isinstance(self.causal, pd.DataFrame):
            self.causal_graph = igraph.Graph.DataFrame(self.causal, directed=True)
            vertices = list(self.causal_graph.vs)
            # Direct successors / predecessors only, keyed by vertex name.
            self.each_node_causes = {
                v['name']: [succ['name'] for succ in v.successors()]
                for v in vertices if v.successors()
            }
            self.each_node_is_an_effect_from = {
                v['name']: [pred['name'] for pred in v.predecessors()]
                for v in vertices if v.predecessors()
            }
            self.causal_nodes = list(self.each_node_causes)
            self.effect_nodes = list(self.each_node_is_an_effect_from)
            self.nodes = [v['name'] for v in vertices]
        else:
            # Placeholders kept from the original implementation: no feature name
            # ever equals None, so every target takes the non-causal path.
            self.causal_graph = [None]
            self.each_node_causes = [None]
            self.each_node_is_an_effect_from = [None]
            self.causal_nodes = [None]
            self.effect_nodes = [None]
            self.nodes = [None]

    @staticmethod
    def unlist_df(data):
        """Flatten a DataFrame row by row into one Series.

        Each element is labelled with its row position followed by its column
        name (``'0a'``, ``'0b'``, ``'1a'``, ...).  The original passed the 2-D
        frame straight to ``pd.Series``, which fails; the values are flattened
        explicitly here, in the same order in which the labels are generated.
        """
        labels = [
            str(row) + str(column)
            for row, column in itertools.product(range(data.shape[0]), data.columns)
        ]
        return pd.Series(data.to_numpy().ravel(), index=labels)

    def _frankenstein(self, explain_part, reference_part, n_real):
        """Take the first ``n_real`` columns from ``explain_part`` and the rest from
        ``reference_part``, then restore the column order of ``self.explain``.

        Both inputs must share the same index and the same column order.
        """
        pieces = [explain_part.iloc[:, :n_real], reference_part.iloc[:, n_real:]]
        pieces = [piece for piece in pieces if piece.shape[1] > 0]
        mixed = pieces[0] if len(pieces) == 1 else pd.concat(pieces, axis=1)
        return mixed.loc[:, self.explain.columns]

    def _non_causal_frames(self, target, explain_instances, reference_rows):
        """Build the real-target / fake-target pair for a feature outside the causal graph."""
        position = explain_instances.columns.get_loc(target)
        return [
            # Target and everything to its left come from `explain`.
            ('real_target', None,
             self._frankenstein(explain_instances, reference_rows, position + 1)),
            # Same, except that the target itself now also comes from the reference row.
            ('fake_target', None,
             self._frankenstein(explain_instances, reference_rows, position)),
        ]

    def _causal_frames(self, target, explain_instances, reference_rows,
                       feature_names_random, feature_indices_random):
        """Build the labelled Frankenstein frames for a feature inside the causal graph.

        Two column orderings are used: causes before the target and effects after
        it ("real causes, fake effects"), and effects before the target and
        causes after it ("fake causes, real effects").  A target that is a cause
        contributes four labelled frames, a target that is an effect another
        four (built from the same two orderings, under different labels).
        """
        target_index = explain_instances.columns.get_loc(target)
        caused_by = self.each_node_is_an_effect_from.get(target, [])
        causes_these = self.each_node_causes.get(target, [])
        causes_indices = np.flatnonzero(np.isin(feature_names_random, caused_by))
        effects_indices = np.flatnonzero(np.isin(feature_names_random, causes_these))

        # Remaining positions, split around the target exactly as in the original port.
        taken = np.concatenate([[target_index], causes_indices, effects_indices])
        sample_indices = feature_indices_random[~np.isin(feature_indices_random, taken)]
        sample_real_indices = sample_indices[sample_indices < target_index]
        sample_fake_indices = sample_indices[sample_indices > target_index]

        real_causes_fake_effects = np.concatenate(
            [sample_real_indices, causes_indices, [target_index],
             effects_indices, sample_fake_indices]).astype(int)
        fake_causes_real_effects = np.concatenate(
            [sample_real_indices, effects_indices, [target_index],
             causes_indices, sample_fake_indices]).astype(int)

        def real_and_fake(ordering):
            explain_part = explain_instances.iloc[:, ordering]
            reference_part = reference_rows.iloc[:, ordering]
            # Position of the target in THIS ordering (the original reused the
            # position from the other ordering for the second pair).
            position = explain_part.columns.get_loc(target)
            # Two independent frames; the original aliased them, which made every
            # real/fake difference zero.
            return (self._frankenstein(explain_part, reference_part, position + 1),
                    self._frankenstein(explain_part, reference_part, position))

        rcfe_real, rcfe_fake = real_and_fake(real_causes_fake_effects)
        fcre_real, fcre_fake = real_and_fake(fake_causes_real_effects)

        labelled = []
        if target in self.causal_nodes:
            labelled += [
                ('real_causes_fake_effects_real_target', 'target_is_a_cause', rcfe_real),
                ('real_causes_fake_effects_fake_target', 'target_is_a_cause', rcfe_fake),
                ('fake_causes_real_effects_real_target_cause', 'target_is_a_cause', fcre_real),
                ('fake_causes_real_effects_fake_target_cause', 'target_is_a_cause', fcre_fake),
            ]
        if target in self.effect_nodes:
            labelled += [
                ('real_causes_fake_effects_real_target_effect', 'target_is_an_effect', rcfe_real),
                ('real_causes_fake_effects_fake_target_effect', 'target_is_an_effect', rcfe_fake),
                ('fake_causes_real_effects_real_target', 'target_is_an_effect', fcre_real),
                ('fake_causes_real_effects_fake_target', 'target_is_an_effect', fcre_fake),
            ]
        return labelled

    def _stack(self, labelled, target, causal_flag):
        """Stack labelled frames vertically and attach the bookkeeping columns.

        The frames are stacked block-wise, so the group labels are repeated
        block-wise (``np.repeat``) while the row position restarts for every
        block (``np.tile``).  Row positions are 0-based on every path.
        """
        n_rows = self.explain.shape[0]
        stacked = pd.concat([frame for _, _, frame in labelled], axis=0).reset_index(drop=True)
        stacked['index_in_sample'] = np.tile(np.arange(n_rows), len(labelled))
        stacked['feature_group'] = np.repeat([group for group, _, _ in labelled], n_rows)
        stacked['feature_name'] = target
        stacked['causal'] = causal_flag
        stacked['causal_type'] = np.repeat(
            np.array([kind for _, kind, _ in labelled], dtype=object), n_rows)
        return stacked

    def loop_over_monte_carlo_samples(self):
        """Build the long table of Frankenstein rows for all draws and target features.

        Returns:
            DataFrame whose first ``n_features`` columns are model inputs (in the
            column order of ``explain``) followed by ``index_in_sample``,
            ``feature_group``, ``feature_name``, ``causal``, ``causal_type`` and
            ``sample``.
        """
        n_rows = self.explain.shape[0]
        frames = []
        for sample in range(self.sample_size):
            # One reference ROW per draw.  The original drew from range(n_features),
            # which restricted or broke the draw whenever n_features != n_instances.
            reference_index = np.random.choice(
                np.arange(0, self.n_instances), size=1, replace=False)
            feature_indices_random = np.random.choice(
                np.arange(0, self.n_features), size=self.n_features, replace=False)
            feature_names_random = self.explain.columns[feature_indices_random].values
            # Shuffled columns; both frames get the same positional row index so
            # that they can be combined column-wise without alignment surprises.
            explain_instances = self.explain.iloc[:, feature_indices_random].reset_index(drop=True)
            reference_rows = self.reference.iloc[
                np.repeat(reference_index, n_rows), feature_indices_random
            ].reset_index(drop=True)

            for target in self.target_features:
                if target in self.nodes:
                    labelled = self._causal_frames(
                        target, explain_instances, reference_rows,
                        feature_names_random, feature_indices_random)
                    causal_flag = 1
                else:
                    labelled = self._non_causal_frames(target, explain_instances, reference_rows)
                    causal_flag = 0
                instance = self._stack(labelled, target, causal_flag)
                instance['sample'] = sample
                frames.append(instance)

        return pd.concat(frames, axis=0).reset_index(drop=True)

    def _predict(self, data):
        """Call the user's prediction function and return its last output column as an array."""
        predicted = pd.DataFrame(self.predict_function(self.model, data))
        return predicted.iloc[:, -1].to_numpy()

    def _causal_effects(self, data_causal):
        """Blend the two ordering-specific estimates of every causal feature.

        Returns a frame with ``index_in_sample``, ``sample``, ``feature_name`` and
        ``shap_effect`` (one row per cell), or ``None`` when there is nothing to blend.
        """
        # One weight per edge, attached by position.
        # NOTE(review): without `causal_weights` every weight is NaN, so all causal
        # estimates become NaN and are dropped later - confirm the intended default.
        weights = self.causal.iloc[:, :2].copy()
        weights.columns = ['target_is_a_cause', 'target_is_an_effect']
        weights['weight'] = (np.nan if self.causal_weights is None
                             else np.asarray(self.causal_weights, dtype=float))
        # Long format: one row per (role, feature); average over the feature's edges.
        weights = weights.melt(id_vars='weight', var_name='causal_type', value_name='feature_name')
        weights = weights.groupby(['causal_type', 'feature_name'], as_index=False)['weight'].mean()

        column_pairs = {
            'target_is_a_cause': (
                ('real_causes_fake_effects_real_target',
                 'real_causes_fake_effects_fake_target'),
                ('fake_causes_real_effects_real_target_cause',
                 'fake_causes_real_effects_fake_target_cause'),
            ),
            'target_is_an_effect': (
                ('real_causes_fake_effects_real_target_effect',
                 'real_causes_fake_effects_fake_target_effect'),
                ('fake_causes_real_effects_real_target',
                 'fake_causes_real_effects_fake_target'),
            ),
        }
        pieces = []
        for causal_type, (pair_12, pair_21) in column_pairs.items():
            rows = data_causal.loc[data_causal['causal_type'] == causal_type]
            if rows.empty:
                # No target plays this role, so its prediction columns do not exist.
                continue
            rows = rows.merge(weights, on=['causal_type', 'feature_name'], how='left')
            effect_12 = rows[pair_12[0]] - rows[pair_12[1]]
            effect_21 = rows[pair_21[0]] - rows[pair_21[1]]
            rows['shap_effect'] = rows['weight'] * effect_12 + (1 - rows['weight']) * effect_21
            pieces.append(rows[['index_in_sample', 'sample', 'feature_name', 'shap_effect']])
        if not pieces:
            return None
        # A feature that is both a cause and an effect has two rows per cell: average them.
        combined = pd.concat(pieces, ignore_index=True)
        return combined.groupby(
            ['index_in_sample', 'sample', 'feature_name'], as_index=False)['shap_effect'].mean()

    def predict_shapFlex(self, data_predict):
        """Turn the sampled Frankenstein rows into Shapley value estimates.

        Args:
            data_predict: Output of ``loop_over_monte_carlo_samples``.

        Returns:
            DataFrame indexed by ``(index_in_sample, feature_name)`` with the
            columns ``('shap_effect', 'std')``, ``('shap_effect', 'mean')`` and
            ``('shap_effect', 'intercept')``; the intercept is the mean
            prediction over the reference data.
        """
        data_model = data_predict.iloc[:, :self.n_features]
        data_meta = data_predict.iloc[:, self.n_features:].copy()
        # Uses the function given to the constructor (the original reached for a
        # module-level `predict_function`).  Values are attached by position.
        data_meta[self._PREDICTION] = self._predict(data_model)
        intercept = pd.Series(self._predict(self.reference)).mean(skipna=True)

        # Give non-causal rows a real key value; rows with a missing key would
        # otherwise vanish in the pivot below.
        causal_type = data_meta['causal_type']
        data_meta['causal_type'] = causal_type.where(causal_type.notna(), self._NOT_CAUSAL)

        # One row per (explained row, draw, feature, role), one column per feature group.
        wide = data_meta.pivot_table(
            index=self._KEYS, columns='feature_group', values=self._PREDICTION
        ).reset_index()

        parts = []
        if isinstance(self.causal, pd.DataFrame):
            data_causal = self._causal_effects(wide.loc[wide['causal'] == 1])
            if data_causal is not None:
                parts.append(data_causal)

        data_non_causal = wide.loc[wide['causal'] == 0].copy()
        if not data_non_causal.empty:
            data_non_causal['shap_effect'] = (
                data_non_causal['real_target'] - data_non_causal['fake_target'])
            parts.append(data_non_causal)

        effect_columns = ['index_in_sample', 'sample', 'feature_name', 'shap_effect']
        effects = pd.concat([part.loc[:, effect_columns] for part in parts], ignore_index=True)

        # Spread and centre of the per-draw estimates for every explained row and feature.
        summary = effects.dropna(axis=0).groupby(
            ['index_in_sample', 'feature_name']).agg({'shap_effect': ['std', 'mean']})
        summary[('shap_effect', 'intercept')] = intercept
        return summary

    def forward(self):
        """Sample the Frankenstein rows and return the resulting Shapley estimates."""
        data_predict = self.loop_over_monte_carlo_samples()
        return self.predict_shapFlex(data_predict)




import pandas as pd
import numpy as np
# Download the data set (first CSV column is the row index) and fit a CatBoost classifier on it.
data = pd.read_csv('https://kolodezev.ru/download/data_adult.csv', index_col=0)
outcome_name = 'income'
# Position of the outcome column among the data columns.
outcome_col = pd.Series(data.columns).index[data.columns == outcome_name][0]
X = data.drop(outcome_name, axis=1)
y = data[outcome_name].values
# Columns with object dtype are handed to CatBoost as categorical features.
cat_features = X.columns[X.dtypes == 'object'].tolist()
model = CatBoostClassifier()
model.fit(X, y, cat_features=cat_features, verbose=False)
def predict_function(model, data):
  """Return column 0 of ``model.predict_proba(data)`` as a one-column DataFrame.

  Author's note (translated): running
  ``pd.DataFrame(model.predict_proba(X)).loc[:, 0][9]`` gives 0.98, which
  matches the output for row 9; whatever the algorithm, such a high confidence
  identifies the returned column unambiguously.
  """
  probabilities = model.predict_proba(data)
  first_column = probabilities[:, [0]]  # list index keeps the result two-dimensional
  return pd.DataFrame(first_column)


# Rows to explain: the first 300; reference data: every row. Both drop the last column.
explain = data.iloc[:300, :data.shape[1]-1]
reference = data.iloc[:, :data.shape[1]-1]
# NOTE(review): `reference` and `sample_size` are defined here but are not passed to shapFlex_plus below.
sample_size = 10
target_features = pd.Series(["marital_status", "education", "relationship",  "native_country",
                     "age", "sex", "race", "hours_per_week"])
# Edge list of the causal graph: each of the four causes points at each of the three effects.
causal = pd.DataFrame({
  'cause': pd.Series(["age", "sex", "race", "native_country"] * 3),
  'effect': pd.Series(np.repeat(["marital_status", "education", "relationship"], 4)),
})
exmpl_of_test = shapFlex_plus(
  explain, model, predict_function,
  target_features=target_features,
  causal=causal,
  causal_weights=[0.5] * len(causal),  # one weight per causal edge
)
data_predict = exmpl_of_test.loop_over_monte_carlo_samples()
data_predicted = exmpl_of_test.predict_shapFlex(data_predict)
#print(data_predicted)