<a href="https://colab.research.google.com/github/gregory-ch/shap_flex_porting/blob/main/demo_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
This module uses user-defined trained models and prediction functions to compute approximate Shapley values for
single models.
"""

import numpy as np
import pandas as pd
import random
import csv
import itertools

In [None]:
class shapFlex_plus:
    def __init__(self, explain,  model, predict_function, reference = None, target_features = None, \
                     causal = None, causal_weights = None, sample_size = None, use_future = None):
        self.explain = explain
        self.reference = reference if reference else explain
        self.model = model
        predict_function = predict_function
        self.target_features = target_features if target_features else explain.columns.tolist()
        self.causal = causal if causal else None
        self.causal_weights = causal_weights if causal_weights else None
        self.sample_size = sample_size if sample_size else 60
        self.use_future = use_future if target_features else False
        
        self.n_features = self.explain.shape[1]
        self.n_instances = self.reference.shape[0]

        self.nodes = None

    @staticmethod
    def ulist_df(data):
      unlisted_df = pd.Series(
                  data,
                  index=[
                  index_col + index_row for index_col, index_row in itertools.product(
                      [str(x) for x in range(data.shape[0])], 
                      [str(x) for x in data.columns])]
              )
      return unlisted_df
      
    def loop_over_monte_carlo_samples(self):
      """Work-in-progress Monte Carlo sampling loop (port of R code, per the inline comments).

      For each of ``self.sample_size`` iterations the method draws one random
      row position and one random permutation of the feature positions, then,
      for every entry of ``self.target_features``, assembles "Frankenstein"
      instances that mix columns taken from the explained instances with
      columns taken from the reference instance.

      The method contains no ``return`` statement. Several names it uses are not
      defined inside it; see the NOTE(review) comments below.
      """
      i_size = self.sample_size
      j_size = len(self.target_features)
      for i in range(i_size):
        # NOTE(review): the draw is over range(n_features) although the result is used
        # as a row position of the reference frame -- looks like n_instances was meant; confirm.
        reference_index = np.random.choice(np.arange(0, self.n_features ), size=1, replace=False)
        feature_indices_random = np.random.choice(np.arange(0, self.n_features), size=self.n_features, replace=False)
        # R indexing starts at 1 and Python at 0, so we need to subtract 1 or use the upper bound for indices, depending on the function being called
        # reference is a pd dataframe
        feature_names_random = self.explain.columns[feature_indices_random].values
        print(reference_index, feature_indices_random)
        # NOTE(review): `reference` (and `explain` below) are bare names, not self.reference /
        # self.explain, so they are resolved from the enclosing (notebook) scope.
        reference_instance = reference.iloc[reference_index, feature_indices_random]
        # feature_indices_random is a vector of indices
        explain_instances = explain.iloc[:, feature_indices_random]

        for j in range(j_size):
          target_feature_index =  self.explain.columns.get_loc(self.target_features[j])
          target_feature_index_shuffled = list(self.explain.columns.values[feature_indices_random]).index(self.target_features[j])
          # target_feature_index = (self.explain.columns == self.target_features[j])
          # target_feature_index_shuffled = (self.explain.columns[feature_indices_random] == self.target_features[j])

          # NOTE(review): this membership test needs self.nodes to be a container; a None value raises TypeError.
          if self.target_features[j] in self.nodes:
            # unlist, as I understand it, flattens all the data into one long vector and assigns indices as column name + row name
            # I assume each_node_causes is a pd.DataFrame()
            # NOTE(review): confirm the class exposes a helper under the name `unlist_df`;
            # each_node_causes / each_node_is_an_effect_from are not defined in this method.
            target_feature_causes_these_features = self.unlist_df(
                # loc because target_features does not seem to be an index
                each_node_causes.loc[:, self.target_features[j]]
                )
            target_feature_is_caused_by = self.unlist_df(
                each_node_is_an_effect_from.loc[:, self.target_features[j]]
                )

            target_index = target_feature_index_shuffled
            # mark the values of feature_names_random that equal the last value of
            # target_feature_is_caused_by. target_feature_is_caused_by seems to be a vector,
            # so a number should come back. If it turns out to be a dataframe, element -1 will be a row,
            # and the indexing must be switched to iloc and == to .isin
            # NOTE(review): in R, x[-1] drops the first element rather than selecting the last -- confirm intent.
            causes_indices = (feature_names_random == target_feature_is_caused_by[-1])
            effects_indices = (feature_names_random == target_feature_causes_these_features[-1])
            # NOTE(review): feature_indices_random comes from np.random.choice (a NumPy array);
            # confirm that `.isin` is available on it.
            sample_indices = feature_indices_random[~feature_indices_random.isin(
                np.concatenate([target_index, causes_indices, effects_indices]))]
            # c() apparently concatenates vector(s) and variables
            sample_real_indices = sample_indices[sample_indices < target_index]  # Not in causal diagram, feature data from 'explain'.
            sample_fake_indices = sample_indices[sample_indices > target_index]  # Not in causal diagram, feature data from 'reference'.

            feature_indices_real_causes_real_effects = np.concatenate([sample_real_indices, causes_indices, effects_indices, target_index, sample_fake_indices])
            feature_indices_real_causes_fake_effects = np.concatenate([sample_real_indices, causes_indices, target_index, effects_indices, sample_fake_indices])
            feature_indices_fake_causes_real_effects = np.concatenate([sample_real_indices, effects_indices, target_index, causes_indices, sample_fake_indices])
            feature_indices_fake_causes_fake_effects = np.concatenate([sample_real_indices, target_index, causes_indices, effects_indices, sample_fake_indices])

          if not self.target_features[j] in self.nodes:
            # NOTE(review): plain assignment binds the same object, so the writes below also
            # modify explain_instances -- confirm whether a copy is intended.
            explain_instance_real_target = explain_instances

            # Only create a Frankenstein instance if the target is not the last feature and there is actually
            # one or more features to the right of the target to replace with the reference.
            if (target_feature_index_shuffled < self.n_features):
              explain_instance_real_target.iloc[:, target_feature_index_shuffled + 1: self.n_features + 1] =\
              reference_instance.iloc[:, target_feature_index_shuffled + 1: self.n_features + 1]

            # These instances are otherwise the same as the Frankenstein instance created above with the
            # exception that the target feature is now replaced with the target feature in the random reference
            # instance. The difference in model predictions between these two Frankenstein instances is
            # what gives us the stochastic Shapley value approximation.
            # NOTE(review): same-object binding again -- real_target and fake_target refer to one frame.
            explain_instance_fake_target = explain_instance_real_target
            # NOTE(review): the right-hand side indexes the frame with [:, k] directly, unlike the
            # `.iloc[:, k]` used elsewhere in this method -- confirm.
            explain_instance_fake_target.iloc[:, target_feature_index_shuffled] = reference_instance[:, target_feature_index_shuffled]

          else:

            # NOTE(review): self.causal_nodes / self.effect_nodes must be set on the instance before this runs.
            if self.target_features[j] in self.causal_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              # NOTE(review): this is an element-wise comparison (boolean array), later used as if it
              # were a single integer position -- confirm.
              target_index_temp = (explain_instance_real_causes_fake_effects_real_target.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_real_causes_fake_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]

              explain_instance_real_causes_fake_effects_fake_target = explain_instance_real_causes_fake_effects_real_target
              explain_instance_real_causes_fake_effects_fake_target.iloc[:, target_index_temp] =\
              reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target_cause = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = (explain_instance_fake_causes_real_effects_real_target_cause.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                # NOTE(review): the right-hand side lacks `.iloc` and its slice stops at n_features,
                # not n_features + 1 as on the left -- confirm.
                explain_instance_fake_causes_real_effects_real_target_cause.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_fake_causes_real_effects[:, target_index_temp + 1: self.n_features]

              explain_instance_fake_causes_real_effects_fake_target_cause = explain_instance_fake_causes_real_effects_real_target_cause
              explain_instance_fake_causes_real_effects_fake_target_cause.iloc[:, target_index_temp] =\
              reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]

            if self.target_features[j] in self.effect_nodes:
              reference_instance_real_causes_fake_effects = reference_instance.iloc[:, feature_indices_real_causes_fake_effects]
              explain_instance_real_causes_fake_effects_real_target_effect = explain_instances.iloc[:, feature_indices_real_causes_fake_effects]
              target_index_temp = (explain_instance_real_causes_fake_effects_real_target_effect.columns.values == self.target_features[j])

              if (target_index_temp < self.n_features):
                explain_instance_real_causes_fake_effects_real_target_effect.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_real_causes_fake_effects.iloc[:, target_index_temp + 1: self.n_features + 1]

              explain_instance_real_causes_fake_effects_fake_target_effect = explain_instance_real_causes_fake_effects_real_target_effect
              explain_instance_real_causes_fake_effects_fake_target_effect.iloc[:, target_index_temp] =\
              reference_instance_real_causes_fake_effects.iloc[:, target_index_temp]
              reference_instance_fake_causes_real_effects = reference_instance.iloc[:, feature_indices_fake_causes_real_effects]
              explain_instance_fake_causes_real_effects_real_target = explain_instances.iloc[:, feature_indices_fake_causes_real_effects]
              target_index_temp = (explain_instance_fake_causes_real_effects_real_target.columns.values == self.target_features[j])

              if target_index_temp < self.n_features:
                explain_instance_fake_causes_real_effects_real_target.iloc[:, target_index_temp + 1: self.n_features + 1] =\
                reference_instance_fake_causes_real_effects.iloc[:, target_index_temp + 1: self.n_features + 1]

              explain_instance_fake_causes_real_effects_fake_target = explain_instance_fake_causes_real_effects_real_target
              explain_instance_fake_causes_real_effects_fake_target.iloc[:, target_index_temp] =\
              reference_instance_fake_causes_real_effects.iloc[:, target_index_temp]

          # Second pass over the same branches: restore the original column order and
          # stack the instances into one frame with bookkeeping columns.
          if not self.target_features[j] in self.nodes:
            explain_instance_real_target = explain_instance_real_target.loc[:, explain.columns]
            explain_instance_fake_target = explain_instance_fake_target.loc[:, explain.columns]
            data_explain_instance = pd.concat([explain_instance_real_target, explain_instance_fake_target], axis=0)
            # it is not quite clear here whether index is a number or a string, and whether the indices in data_explain_instance are numbers or strings; either way, this can be fixed when running
            # NOTE(review): index, feature_group, feature_name, causal, causal_type (and `sample`
            # further down) are bare names used as column keys and are not defined in this method;
            # `target_features` is likewise a bare name rather than self.target_features.
            # NOTE(review): this branch tiles over explain.shape[1]; the causal branches use shape[0] -- confirm.
            data_explain_instance[index] = np.tile(np.arange(1, explain.shape[1] + 1), 2) 
            data_explain_instance[feature_group] = np.tile(pd.Series(['real_target', 'fake_target']), explain.shape[0])
            data_explain_instance[feature_name] = target_features[j]
            data_explain_instance[causal] = 0
            data_explain_instance[causal_type] = None

          else:
            if self.target_features[j] in self.causal_nodes:
              explain_instance_real_causes_fake_effects_real_target =\
              explain_instance_real_causes_fake_effects_real_target.loc[:, explain.columns]
              explain_instance_real_causes_fake_effects_fake_target =\
              explain_instance_real_causes_fake_effects_fake_target.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_real_target_cause =\
              explain_instance_fake_causes_real_effects_real_target_cause.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_fake_target_cause =\
              explain_instance_fake_causes_real_effects_fake_target_cause.loc[:, explain.columns]

            if self.target_features[j] in self.effect_nodes:
              explain_instance_real_causes_fake_effects_real_target_effect =\
              explain_instance_real_causes_fake_effects_real_target_effect.loc[:, explain.columns]
              explain_instance_real_causes_fake_effects_fake_target_effect =\
              explain_instance_real_causes_fake_effects_fake_target_effect.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_real_target =\
              explain_instance_fake_causes_real_effects_real_target.loc[:, explain.columns]
              explain_instance_fake_causes_real_effects_fake_target =\
              explain_instance_fake_causes_real_effects_fake_target.loc[:, explain.columns]

            if self.target_features[j] in self.causal_nodes:
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause], axis=0
              )
              data_explain_instance[index] = np.tile(np.arange(1, explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance[feature_group] = np.tile(pd.Series(["real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",
                                                          "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause"]),
                                                        explain.shape[0])
              data_explain_instance[causal_type] = "target_is_a_cause"

            if self.target_features[j] in self.effect_nodes:
              # NOTE(review): `<-` is parsed by Python as `<` followed by unary minus, so this line is
              # a comparison whose result is discarded, not an assignment.
              data_explain_instance <- pd.concat([
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0)
              data_explain_instance[index] = np.tile(np.arange(1, explain.shape[0] + 1), 4)  # Four Frankenstein instances per explained instance.
              data_explain_instance[feature_group] = np.tile(pd.Series(["real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",
                                                          "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"]),
                                                        explain.shape[0])
              data_explain_instance[causal_type] = "target_is_an_effect"

            if (self.target_features[j] in self.causal_nodes) and (self.target_features[j] in self.effect_nodes):
              data_explain_instance = pd.concat([
                explain_instance_real_causes_fake_effects_real_target,
                explain_instance_real_causes_fake_effects_fake_target,
                explain_instance_fake_causes_real_effects_real_target_cause,
                explain_instance_fake_causes_real_effects_fake_target_cause,
                explain_instance_real_causes_fake_effects_real_target_effect,
                explain_instance_real_causes_fake_effects_fake_target_effect,
                explain_instance_fake_causes_real_effects_real_target,
                explain_instance_fake_causes_real_effects_fake_target
              ], axis=0)
              data_explain_instance[index] = np.tile(np.arange(1, explain.shape[0] + 1), 8)  # Eight Frankenstein instances per explained instance.
              data_explain_instance[feature_group] = np.tile(pd.Series([
                "real_causes_fake_effects_real_target", "real_causes_fake_effects_fake_target",  # Target is a causal node.
                "fake_causes_real_effects_real_target_cause", "fake_causes_real_effects_fake_target_cause",  # Target is a causal node.
                "real_causes_fake_effects_real_target_effect", "real_causes_fake_effects_fake_target_effect",  # Target is an effect node.
                "fake_causes_real_effects_real_target", "fake_causes_real_effects_fake_target"  # Target is an effect node.
                ]),
              explain.shape[0])
              data_explain_instance[causal_type] = np.tile(pd.Series([
                "target_is_a_cause", "target_is_a_cause", "target_is_a_cause", "target_is_a_cause",
                "target_is_an_effect", "target_is_an_effect", "target_is_an_effect", "target_is_an_effect"]
              ),
              explain.shape[0])

            data_explain_instance[feature_name] = target_features[j]
            data_explain_instance[causal] = 1

          data_explain_instance[sample] = i
          # NOTE(review): bare expression statement -- the per-feature frame is neither collected nor returned.
          data_explain_instance

        # NOTE(review): data_sample_feature is never assigned before this line within the method.
        data_sample_feature = pd.concat(data_sample_feature, axis=0)
        data_sample_feature

                        



1) Текущий раздел работы — строки 125–397: в них инициализируется функция сэмплирования, которая проходит по двум вложенным циклам. [закончено]
2) Начало цикла [закончено]

3) Создание экземпляров «Франкенштейна», стр. 172–270 [закончено]
 
4) Цикл i loop, j loop  стр 397 [закончено]

5) написан код для инициализации объектов на вход в класс: модель, обработку данных, предикт функцию. Датасет в csv на гугл-диск кинул: https://drive.google.com/file/d/1ADJ2yNZum-quPW3bRWJ4iyEa2OoqlS18/view?usp=sharing, пока для простоты складывается в файлы колаба через drag-and-drop.

6) Инициализация графа [закончено]

7) В основном разделе R/shapFlex дошли до вызова функции predict_shapFlex на стр. 401, проверили инициализацию класса shapFlex_plus и начали отладку запуска loop_over_monte_carlo_samples().



In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# Load the dataset, one-hot encode the predictors and fit the classifier to be explained.
outcome_name = 'income'
data = pd.read_csv('/content/data_adult.csv', index_col=0)
encoder = OneHotEncoder()  # created here but not used by the code in this cell
# Positional index of the outcome column within `data`.
outcome_col = np.flatnonzero(data.columns == outcome_name)[0]

# Predictors: everything except the outcome, dummy-encoded with the first level dropped.
X = pd.get_dummies(data.drop(columns=outcome_name), drop_first=True)
# Binary target: 1 for '<=50K', 0 for every other label.
y = np.array([int(label == '<=50K') for label in data[outcome_name].values]).ravel()

model = RandomForestClassifier(n_estimators=300, random_state=42).fit(X, y)

def predict_function(model, data):
    """Score ``data`` with ``model`` and return the first ``predict_proba`` column.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    data : pandas.DataFrame
        Raw (not yet dummy-encoded) rows. The outcome column named by the
        module-level ``outcome_name`` may be present or absent.

    Returns
    -------
    tuple
        ``(probabilities, X, y)``: column 0 of ``model.predict_proba(X)`` as a
        Series, the encoded design matrix ``X``, and the 0/1 target array ``y``
        (``None`` when ``data`` has no outcome column).
    """
    # Original author's note: pd.DataFrame(model.predict_proba(X)).loc[:, 0][9] yields 0.98,
    # which matches the reference output for row 9; a confidence that high identifies
    # the reported probability column unambiguously, whatever the algorithm.
    has_outcome = outcome_name in data.columns
    if has_outcome:
        features = data.drop(outcome_name, axis=1)
        y = np.array([1 if label == '<=50K' else 0 for label in data[outcome_name].values]).ravel()
    else:
        # Frames without the outcome column would otherwise raise KeyError in drop().
        features = data
        y = None

    trained_columns = getattr(model, 'feature_names_in_', None)
    if trained_columns is None:
        # No record of the training columns: keep the original encoding.
        X = pd.get_dummies(features, drop_first=True)
    else:
        # Encode every level, then align to the training columns. With drop_first=True
        # the dropped level depends on which categories happen to be present in `data`,
        # so a subset could produce a different design matrix than the one used in fit().
        # NOTE(review): this relies on numeric columns keeping a numeric dtype; an
        # object-typed numeric column would be dummy-encoded and then zero-filled.
        X = pd.get_dummies(features).reindex(columns=trained_columns, fill_value=0)
    return pd.DataFrame(model.predict_proba(X)).loc[:, 0], X, y

# Inputs for the explainer: every column except the last one.
explain = data.iloc[:300, :-1]   # first 300 rows are the instances to explain
reference = data.iloc[:, :-1]    # all rows serve as the reference population
sample_size = 60
target_features = pd.Series([
    "marital_status", "education", "relationship", "native_country",
    "age", "sex", "race", "hours_per_week",
])
# Causal edge list: each of the four causes points at each of the three effects.
causal = pd.DataFrame({
    "cause": pd.Series(["age", "sex", "race", "native_country"] * 3),
    "effect": pd.Series(np.repeat(["marital_status", "education", "relationship"], 4)),
})

# Новый раздел

In [None]:
!pip install igraph
import igraph

Collecting igraph
  Downloading igraph-0.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.2 MB/s 
[?25hCollecting texttable>=1.6.2
  Downloading texttable-1.6.4-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.9.9 texttable-1.6.4


In [None]:
# Build a directed graph from the cause -> effect edge list.
causal_graph = igraph.Graph.DataFrame(causal, directed=True)
nodes = list(causal_graph.vs)
# TODO (original author): check whether only direct successors / predecessors are
# wanted here, or every reachable node.
each_node_causes = {
    vertex: children
    for vertex, children in ((v, v.successors()) for v in nodes)
    if children
}
each_node_is_an_effect_from = {
    vertex: parents
    for vertex, parents in ((v, v.predecessors()) for v in nodes)
    if parents
}
# Vertex names appear to be filled in automatically.
causal_nodes = [vertex['name'] for vertex in each_node_causes]
effect_nodes = [vertex['name'] for vertex in each_node_is_an_effect_from]

In [None]:
# Construct the explainer with only the required arguments; every optional argument keeps its default.
exmpl_of_test = shapFlex_plus(explain,  model, predict_function)
# exmpl_of_test.explain.columns

Index(['age', 'workclass', 'education', 'education_num', 'marital_status',
       'occupation', 'relationship', 'race', 'sex', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country'],
      dtype='object')

In [None]:
# Re-create the explainer and run the sampling loop; the recorded output below ends in a TypeError.
exmpl_of_test = shapFlex_plus(explain,  model, predict_function)
exmpl_of_test.loop_over_monte_carlo_samples()

[3] [12  6 11  7  2  0  1  9  5 10  3  8  4]


TypeError: ignored