In [None]:
import numpy as np
import pandas as pd

import xgboost as xgb

from shapflow.flow import CausalLinks
from shapflow.flow import build_feature_graph
from shapflow.flow import GraphExplainer
from shapflow.flow import edge_credits2edge_credit
from shapflow.flow import translator, create_xgboost_f

from shap_flow_util import read_csv_incl_timeindex

import time
import dill
import tqdm
import multiprocess as mp

In [None]:
# Compute Shapley-flow edge credits for one XGBoost model per training period
# and store the resulting credit-flow object on disk.
import os

from shap_flow_util import calculate_edge_credit

periods = [('2015-01-08', '2021-09-30'),
           ('2021-10-01', '2023-12-31'),
           ('2015-01-08', '2023-12-31')]
version = 'v5'

# --- run configuration (identical for every period) -------------------------
seed = 7
n_bg = 100       # number of sampled background samples (paper: 100)
nsamples = 1000  # number of foreground samples to explain (paper: 1,000)
nruns = 500
# change this to a suitable value, depending on machine
# (e.g. 6, 12; on cluster 20)
num_processes = 10

target_name = 'price_da'
categorical_feature_names = []
calendar_features = ['month', 'dayofyear_sin', 'dayofyear_cos',
                     'hour_sin', 'hour_cos', 'day_of_week']

# Make sure the output directory exists *before* the expensive computation,
# so a missing folder cannot waste a whole run at the final dump.
os.makedirs('./credit_flow/{}'.format(version), exist_ok=True)

for start_date, end_date in periods:
    model_name = 'xgb_start_{}_end_{}'.format(start_date, end_date)
    X_test = read_csv_incl_timeindex(
        './data/{}/X_test_{}.csv'.format(version, model_name))

    model = xgb.Booster()
    model.load_model("./models/{}/{}_best.json".format(version, model_name))

    # NOTE(review): bg and fg are sampled from the same frame with the same
    # random_state, so the two samples may overlap -- confirm this is intended.
    bg = X_test.sample(n=n_bg, random_state=seed)      # background samples
    fg = X_test.sample(n=nsamples, random_state=seed)  # samples to explain

    # Persist the samples alongside the test data.
    bg.to_csv('./data/{}/bg_{}.csv'.format(version, model_name),
              sep=',', index=True)
    fg.to_csv('./data/{}/fg_{}.csv'.format(version, model_name),
              sep=',', index=True)

    # --- causal graph -------------------------------------------------------
    causal_links = CausalLinks()
    display_translator = translator(X_test.columns, X_test, X_test)
    feature_names = list(X_test.columns)

    # Order-preserving filter (instead of a set difference) so the feature
    # order handed to the graph is the same on every run.
    calendar_set = set(calendar_features)
    non_calendar_features = [name for name in feature_names
                             if name not in calendar_set]
    # Arguments are passed as (causes, effects), following the method name.
    causal_links.add_causes_effects(calendar_features, non_calendar_features)

    intermediate = ['temp_mean_7d_avg', 'temp_mean']
    upstream = ['load_da', 'nuclear_avail', 'solar_da']
    causal_links.add_causes_effects(intermediate, upstream)

    # Every feature feeds the target through the loaded XGBoost model.
    causal_links.add_causes_effects(feature_names,
                                    target_name,
                                    create_xgboost_f(feature_names, model))

    causal_graph = build_feature_graph(
        X_test,
        causal_links=causal_links,
        categorical_feature_names=categorical_feature_names,
        display_translator=display_translator,
        target_name=target_name,
        method='xgboost')
    causal_graph.draw()

    # --- edge credits, one task per background sample -----------------------
    # calculate multiple background result (same as in income.ipynb)
    start = time.time()

    model.set_param('n_jobs', -1)
    model.set_param('device', 'cpu')

    _args = [(causal_graph, bg[i:i+1], fg, nruns) for i in range(len(bg))]
    # The context manager terminates the workers if starmap raises, so no
    # processes are leaked; on success the pool is closed and joined as usual.
    with mp.Pool(num_processes) as pool:
        edge_credits = pool.starmap(calculate_edge_credit,
                                    tqdm.tqdm(_args, total=len(_args)))
        pool.close()
        pool.join()

    end = time.time()
    print(end - start)

    # need this for being able to draw shapley flow (need to call shap_values
    # for one bg sample redundantly; caused by using futures)
    explainer = GraphExplainer(causal_graph, bg[0:1], nruns, silent=False)
    cf = explainer.shap_values(fg)

    # Replace the single-background credits with the ones combined by
    # edge_credits2edge_credit over all background samples, then save to file.
    cf.edge_credit = edge_credits2edge_credit(edge_credits, cf.graph)
    with open('./credit_flow/{}/flow_{}.pkl'.format(version, model_name),
              'wb') as file:
        dill.dump(cf, file)