In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.utils import resample

# Single seed shared by NumPy's global RNG and the scikit-learn estimators below.
random_state = 42
np.random.seed(random_state)

In [8]:
# modelNames

# Reward configurations in evaluation order: r00..r10 with g10 fixed,
# then r10 fixed with g counting down from 09 to 00 (21 in total).
_rewardConfigs = (
    [f"r{r:02d}_g10" for r in range(11)]
    + [f"r10_g{g:02d}" for g in range(9, -1, -1)]
)

# Dataset name stems for the "full" and "y_full_y" families, one per configuration.
fullmodelNames = [f"full_{config}_3000k" for config in _rewardConfigs]

y_full_y_modelNames = [f"y_full_y_{config}_3000k" for config in _rewardConfigs]
                    


# All 21 "act" datasets, one CSV per reward configuration.
def _readActCsv(stem):
    """Read ``<stem>.csv`` from the datasets folder (header row, no index column, high-precision floats)."""
    return pd.read_csv("C:/Projects/public/XAI_Master/datasets/" + stem + ".csv", header=0, index_col=None, float_precision='high')

act_r00_g10_3000k = _readActCsv("act_r00_g10_3000k")
act_r01_g10_3000k = _readActCsv("act_r01_g10_3000k")
act_r02_g10_3000k = _readActCsv("act_r02_g10_3000k")
act_r03_g10_3000k = _readActCsv("act_r03_g10_3000k")
act_r04_g10_3000k = _readActCsv("act_r04_g10_3000k")
act_r05_g10_3000k = _readActCsv("act_r05_g10_3000k")
act_r06_g10_3000k = _readActCsv("act_r06_g10_3000k")
act_r07_g10_3000k = _readActCsv("act_r07_g10_3000k")
act_r08_g10_3000k = _readActCsv("act_r08_g10_3000k")
act_r09_g10_3000k = _readActCsv("act_r09_g10_3000k")
act_r10_g00_3000k = _readActCsv("act_r10_g00_3000k")
act_r10_g01_3000k = _readActCsv("act_r10_g01_3000k")
act_r10_g02_3000k = _readActCsv("act_r10_g02_3000k")
act_r10_g03_3000k = _readActCsv("act_r10_g03_3000k")
act_r10_g04_3000k = _readActCsv("act_r10_g04_3000k")
act_r10_g05_3000k = _readActCsv("act_r10_g05_3000k")
act_r10_g06_3000k = _readActCsv("act_r10_g06_3000k")
act_r10_g07_3000k = _readActCsv("act_r10_g07_3000k")
act_r10_g08_3000k = _readActCsv("act_r10_g08_3000k")
act_r10_g09_3000k = _readActCsv("act_r10_g09_3000k")
act_r10_g10_3000k = _readActCsv("act_r10_g10_3000k")

# All 21 "shap" datasets, one CSV per reward configuration.
def _readShapCsv(stem):
    """Read ``<stem>.csv`` from the datasets folder (header row, no index column, high-precision floats)."""
    return pd.read_csv("C:/Projects/public/XAI_Master/datasets/" + stem + ".csv", header=0, index_col=None, float_precision='high')

shap_r00_g10_3000k = _readShapCsv("shap_r00_g10_3000k")
shap_r01_g10_3000k = _readShapCsv("shap_r01_g10_3000k")
shap_r02_g10_3000k = _readShapCsv("shap_r02_g10_3000k")
shap_r03_g10_3000k = _readShapCsv("shap_r03_g10_3000k")
shap_r04_g10_3000k = _readShapCsv("shap_r04_g10_3000k")
shap_r05_g10_3000k = _readShapCsv("shap_r05_g10_3000k")
shap_r06_g10_3000k = _readShapCsv("shap_r06_g10_3000k")
shap_r07_g10_3000k = _readShapCsv("shap_r07_g10_3000k")
shap_r08_g10_3000k = _readShapCsv("shap_r08_g10_3000k")
shap_r09_g10_3000k = _readShapCsv("shap_r09_g10_3000k")
shap_r10_g00_3000k = _readShapCsv("shap_r10_g00_3000k")
shap_r10_g01_3000k = _readShapCsv("shap_r10_g01_3000k")
shap_r10_g02_3000k = _readShapCsv("shap_r10_g02_3000k")
shap_r10_g03_3000k = _readShapCsv("shap_r10_g03_3000k")
shap_r10_g04_3000k = _readShapCsv("shap_r10_g04_3000k")
shap_r10_g05_3000k = _readShapCsv("shap_r10_g05_3000k")
shap_r10_g06_3000k = _readShapCsv("shap_r10_g06_3000k")
shap_r10_g07_3000k = _readShapCsv("shap_r10_g07_3000k")
shap_r10_g08_3000k = _readShapCsv("shap_r10_g08_3000k")
shap_r10_g09_3000k = _readShapCsv("shap_r10_g09_3000k")
shap_r10_g10_3000k = _readShapCsv("shap_r10_g10_3000k")


# All 21 "full" datasets (used as the test split further down).
def _readFullCsv(stem):
    """Read ``<stem>.csv`` from the datasets folder (header row, no index column, high-precision floats)."""
    return pd.read_csv("C:/Projects/public/XAI_Master/datasets/" + stem + ".csv", header=0, index_col=None, float_precision='high')

full_r00_g10_3000k = _readFullCsv("full_r00_g10_3000k")
full_r01_g10_3000k = _readFullCsv("full_r01_g10_3000k")
full_r02_g10_3000k = _readFullCsv("full_r02_g10_3000k")
full_r03_g10_3000k = _readFullCsv("full_r03_g10_3000k")
full_r04_g10_3000k = _readFullCsv("full_r04_g10_3000k")
full_r05_g10_3000k = _readFullCsv("full_r05_g10_3000k")
full_r06_g10_3000k = _readFullCsv("full_r06_g10_3000k")
full_r07_g10_3000k = _readFullCsv("full_r07_g10_3000k")
full_r08_g10_3000k = _readFullCsv("full_r08_g10_3000k")
full_r09_g10_3000k = _readFullCsv("full_r09_g10_3000k")
# NOTE(review): for the r10 rows the variable's g-suffix is the MIRROR of the CSV's
# (variable ..._g10 holds file ..._g00, ..._g09 holds ..._g01, and so on). The dicts and
# concat lists in this notebook pair keys with these variables in the same mirrored way,
# so each key ends up with the CSV of the same name. Do not "fix" one side only.
full_r10_g10_3000k = _readFullCsv("full_r10_g00_3000k")
full_r10_g09_3000k = _readFullCsv("full_r10_g01_3000k")
full_r10_g08_3000k = _readFullCsv("full_r10_g02_3000k")
full_r10_g07_3000k = _readFullCsv("full_r10_g03_3000k")
full_r10_g06_3000k = _readFullCsv("full_r10_g04_3000k")
full_r10_g05_3000k = _readFullCsv("full_r10_g05_3000k")
full_r10_g04_3000k = _readFullCsv("full_r10_g06_3000k")
full_r10_g03_3000k = _readFullCsv("full_r10_g07_3000k")
full_r10_g02_3000k = _readFullCsv("full_r10_g08_3000k")
full_r10_g01_3000k = _readFullCsv("full_r10_g09_3000k")
full_r10_g00_3000k = _readFullCsv("full_r10_g10_3000k")

# Maps a "<config>_y" key to the corresponding "full" dataframe.
# NOTE(review): for the r10 rows the key's g-index and the variable's g-suffix are mirrored
# (key r10_g10 -> variable full_r10_g00_3000k, ...). The loading statements bind each
# full_r10_gXX variable to the CSV with the mirrored g-index, so every key here ends up
# paired with the CSV of the same configuration.
y_dict = {
    "r00_g10_y": full_r00_g10_3000k,
    "r01_g10_y": full_r01_g10_3000k,
    "r02_g10_y": full_r02_g10_3000k,
    "r03_g10_y": full_r03_g10_3000k,
    "r04_g10_y": full_r04_g10_3000k,
    "r05_g10_y": full_r05_g10_3000k,
    "r06_g10_y": full_r06_g10_3000k,
    "r07_g10_y": full_r07_g10_3000k,
    "r08_g10_y": full_r08_g10_3000k,
    "r09_g10_y": full_r09_g10_3000k,
    "r10_g10_y": full_r10_g00_3000k,
    "r10_g09_y": full_r10_g01_3000k,
    "r10_g08_y": full_r10_g02_3000k,
    "r10_g07_y": full_r10_g03_3000k,
    "r10_g06_y": full_r10_g04_3000k,
    "r10_g05_y": full_r10_g05_3000k,
    "r10_g04_y": full_r10_g06_3000k,
    "r10_g03_y": full_r10_g07_3000k,
    "r10_g02_y": full_r10_g08_3000k,
    "r10_g01_y": full_r10_g09_3000k,
    "r10_g00_y": full_r10_g10_3000k
}

# All 21 "y_full_y" datasets (used as the training split further down).
def _readYFullYCsv(stem):
    """Read ``<stem>.csv`` from the datasets folder (header row, no index column, high-precision floats)."""
    return pd.read_csv("C:/Projects/public/XAI_Master/datasets/" + stem + ".csv", header=0, index_col=None, float_precision='high')

y_full_y_r00_g10_3000k = _readYFullYCsv("y_full_y_r00_g10_3000k")
y_full_y_r01_g10_3000k = _readYFullYCsv("y_full_y_r01_g10_3000k")
y_full_y_r02_g10_3000k = _readYFullYCsv("y_full_y_r02_g10_3000k")
y_full_y_r03_g10_3000k = _readYFullYCsv("y_full_y_r03_g10_3000k")
y_full_y_r04_g10_3000k = _readYFullYCsv("y_full_y_r04_g10_3000k")
y_full_y_r05_g10_3000k = _readYFullYCsv("y_full_y_r05_g10_3000k")
y_full_y_r06_g10_3000k = _readYFullYCsv("y_full_y_r06_g10_3000k")
y_full_y_r07_g10_3000k = _readYFullYCsv("y_full_y_r07_g10_3000k")
y_full_y_r08_g10_3000k = _readYFullYCsv("y_full_y_r08_g10_3000k")
y_full_y_r09_g10_3000k = _readYFullYCsv("y_full_y_r09_g10_3000k")
# NOTE(review): for the r10 rows the variable's g-suffix is the MIRROR of the CSV's
# (variable ..._g10 holds file ..._g00, ..._g09 holds ..._g01, and so on). train_dict and
# the concat list pair keys with these variables in the same mirrored way, so each key ends
# up with the CSV of the same name. Do not "fix" one side only.
y_full_y_r10_g10_3000k = _readYFullYCsv("y_full_y_r10_g00_3000k")
y_full_y_r10_g09_3000k = _readYFullYCsv("y_full_y_r10_g01_3000k")
y_full_y_r10_g08_3000k = _readYFullYCsv("y_full_y_r10_g02_3000k")
y_full_y_r10_g07_3000k = _readYFullYCsv("y_full_y_r10_g03_3000k")
y_full_y_r10_g06_3000k = _readYFullYCsv("y_full_y_r10_g04_3000k")
y_full_y_r10_g05_3000k = _readYFullYCsv("y_full_y_r10_g05_3000k")
y_full_y_r10_g04_3000k = _readYFullYCsv("y_full_y_r10_g06_3000k")
y_full_y_r10_g03_3000k = _readYFullYCsv("y_full_y_r10_g07_3000k")
y_full_y_r10_g02_3000k = _readYFullYCsv("y_full_y_r10_g08_3000k")
y_full_y_r10_g01_3000k = _readYFullYCsv("y_full_y_r10_g09_3000k")
y_full_y_r10_g00_3000k = _readYFullYCsv("y_full_y_r10_g10_3000k")

# Training split: "<config>-train" key -> "y_full_y" dataframe, in evaluation order.
# NOTE(review): for the r10 rows the key's g-index and the variable's g-suffix are mirrored
# (key r10g10 -> variable y_full_y_r10_g00_3000k, ...). The loading statements bind each
# y_full_y_r10_gXX variable to the CSV with the mirrored g-index, so every key here ends up
# paired with the CSV of the same configuration.
train_dict = {
    "r00g10-train": y_full_y_r00_g10_3000k,
    "r01g10-train": y_full_y_r01_g10_3000k,
    "r02g10-train": y_full_y_r02_g10_3000k,
    "r03g10-train": y_full_y_r03_g10_3000k,
    "r04g10-train": y_full_y_r04_g10_3000k,
    "r05g10-train": y_full_y_r05_g10_3000k,
    "r06g10-train": y_full_y_r06_g10_3000k,
    "r07g10-train": y_full_y_r07_g10_3000k,
    "r08g10-train": y_full_y_r08_g10_3000k,
    "r09g10-train": y_full_y_r09_g10_3000k,
    "r10g10-train": y_full_y_r10_g00_3000k,
    "r10g09-train": y_full_y_r10_g01_3000k,
    "r10g08-train": y_full_y_r10_g02_3000k,
    "r10g07-train": y_full_y_r10_g03_3000k,
    "r10g06-train": y_full_y_r10_g04_3000k,
    "r10g05-train": y_full_y_r10_g05_3000k,
    "r10g04-train": y_full_y_r10_g06_3000k,
    "r10g03-train": y_full_y_r10_g07_3000k,
    "r10g02-train": y_full_y_r10_g08_3000k,
    "r10g01-train": y_full_y_r10_g09_3000k,
    "r10g00-train": y_full_y_r10_g10_3000k
}

# Test split: "<config>-test" key -> "full" dataframe, in evaluation order.
# NOTE(review): for the r10 rows the key's g-index and the variable's g-suffix are mirrored
# (key r10g10 -> variable full_r10_g00_3000k, ...). The loading statements bind each
# full_r10_gXX variable to the CSV with the mirrored g-index, so every key here ends up
# paired with the CSV of the same configuration.
test_dict = {
    "r00g10-test": full_r00_g10_3000k,
    "r01g10-test": full_r01_g10_3000k,
    "r02g10-test": full_r02_g10_3000k,
    "r03g10-test": full_r03_g10_3000k,
    "r04g10-test": full_r04_g10_3000k,
    "r05g10-test": full_r05_g10_3000k,
    "r06g10-test": full_r06_g10_3000k,
    "r07g10-test": full_r07_g10_3000k,
    "r08g10-test": full_r08_g10_3000k,
    "r09g10-test": full_r09_g10_3000k,
    "r10g10-test": full_r10_g00_3000k,
    "r10g09-test": full_r10_g01_3000k,
    "r10g08-test": full_r10_g02_3000k,
    "r10g07-test": full_r10_g03_3000k,
    "r10g06-test": full_r10_g04_3000k,
    "r10g05-test": full_r10_g05_3000k,
    "r10g04-test": full_r10_g06_3000k,
    "r10g03-test": full_r10_g07_3000k,
    "r10g02-test": full_r10_g08_3000k,
    "r10g01-test": full_r10_g09_3000k,
    "r10g00-test": full_r10_g10_3000k
}

In [9]:
def createTargetStatistics(datasetDict):
    """Summarise the class balance of the ``target`` column for every dataset.

    Parameters
    ----------
    datasetDict : mapping of str -> pandas.DataFrame
        Each frame must contain a ``target`` column; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        One row per dataset, in the mapping's iteration order, indexed 0..n-1, with columns
        ``Dataset`` (the key), ``Red`` / ``None`` / ``Green`` (fraction of rows whose target
        is -1 / 0 / 1) and ``Entries`` (row count). An empty mapping yields an empty frame
        with the same columns.
    """
    columns = ['Dataset', 'Red', 'None', 'Green', 'Entries']

    # Collect plain records and build the frame once: enlarging an empty frame row by row
    # with .loc is slow and leaves dtype inference to the first inserted row.
    records = []
    for dataset_name, data in datasetDict.items():
        shares = data['target'].value_counts(normalize=True)
        records.append({
            'Dataset': dataset_name,
            # A class that never occurs is absent from value_counts, hence the 0.0 default.
            'Red': float(shares.get(-1, 0.0)),
            'None': float(shares.get(0, 0.0)),
            'Green': float(shares.get(1, 0.0)),
            'Entries': len(data),
        })

    return pd.DataFrame(records, columns=columns)

# Class-balance table for the training datasets: print it and save it next to the data.
statistics_train = createTargetStatistics(train_dict)
statistics_train = statistics_train.reset_index(drop=True).round(4)
print(statistics_train)
statistics_train.to_csv(
    "C:/Projects/public/XAI_Master/datasets/targetStatistics_train.csv",
    index=False,
    float_format='%.4f',
)

# Same table for the test datasets.
statistics_test = createTargetStatistics(test_dict)
statistics_test = statistics_test.reset_index(drop=True).round(4)
print(statistics_test)
statistics_test.to_csv(
    "C:/Projects/public/XAI_Master/datasets/targetStatistics_test.csv",
    index=False,
    float_format='%.4f',
)

         Dataset     Red    None   Green  Entries
0   r00g10-train  0.0085  0.0000  0.9915    10000
1   r01g10-train  0.0085  0.0000  0.9915    10000
2   r02g10-train  0.0045  0.0020  0.9935    10000
3   r03g10-train  0.0065  0.0005  0.9930    10000
4   r04g10-train  0.0060  0.0005  0.9935    10000
5   r05g10-train  0.0070  0.0000  0.9930    10000
6   r06g10-train  0.0050  0.0010  0.9940    10000
7   r07g10-train  0.0045  0.0005  0.9950    10000
8   r08g10-train  0.0060  0.0000  0.9940    10000
9   r09g10-train  0.1185  0.0000  0.8815    10000
10  r10g10-train  0.4995  0.0005  0.5000    10000
11  r10g09-train  0.8790  0.0000  0.1210    10000
12  r10g08-train  0.9965  0.0000  0.0035    10000
13  r10g07-train  0.9930  0.0005  0.0065    10000
14  r10g06-train  0.9950  0.0000  0.0050    10000
15  r10g05-train  0.9945  0.0010  0.0045    10000
16  r10g04-train  0.9895  0.0010  0.0095    10000
17  r10g03-train  0.9950  0.0005  0.0045    10000
18  r10g02-train  0.9920  0.0005  0.0075    10000


In [12]:
# Stack all 21 datasets of each split into one frame with a fresh 0..n-1 index.
# train_dict / test_dict hold exactly these frames in exactly this order, so their values
# are reused instead of repeating the 21 variable names.
train = pd.concat(list(train_dict.values()), ignore_index=True)
test = pd.concat(list(test_dict.values()), ignore_index=True)

# Rows recorded at step 0 only, with the step column removed.
_trainIsStepZero = train['stepsTaken'] == 0
_testIsStepZero = test['stepsTaken'] == 0
train_0_steps = train[_trainIsStepZero].drop(columns=['stepsTaken']).copy()
test_0_steps = test[_testIsStepZero].drop(columns=['stepsTaken']).copy()


def dfByCols(train, test, steps=(0,), cols=('pix', 'act')):
    """Select rows by step and columns by feature group, split into features and target.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing ``stepsTaken``, ``target`` and the requested feature columns.
    steps : iterable of int, default ``(0,)``
        Rows are kept when their ``stepsTaken`` value is in this collection.
    cols : iterable of str, default ``('pix', 'act')``
        Feature groups to include, in order. Known groups: ``pix``, ``act``, ``shap``,
        ``ngc``, ``xgc``, ``gcpp``. Unknown names are reported with a printed warning
        and skipped.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)``; the ``*_x`` frames hold only the selected
        feature columns, the ``*_y`` series hold ``target``. All four are copies and keep
        the row labels of the input frames.

    Raises
    ------
    KeyError
        If ``stepsTaken``, ``target`` or a selected feature column is missing.
    """
    # 7x7 grid positions; pixel and SHAP groups additionally carry r/g/b channels.
    grid = [(i, j) for i in range(7) for j in range(7)]
    colsDict = {
        'pix': [f"pix{i}{j}{c}" for i, j in grid for c in ['r', 'g', 'b']],
        'act': [f'act{i}' for i in range(1, 129)],
        'shap': [f"shap{i}{j}{c}" for i, j in grid for c in ['r', 'g', 'b']],
        'ngc': [f'ngc{i}{j}' for i, j in grid],
        'xgc': [f'xgc{i}{j}' for i, j in grid],
        'gcpp': [f'gcpp{i}{j}' for i, j in grid]
    }

    colList = []
    for col in cols:
        if col in colsDict:
            colList.extend(colsDict[col])
        else:
            print(f"Warning: {col} not found in colsDict")

    wanted_steps = list(steps)

    def _select(frame):
        # Filter once per frame and reuse the result for both features and target.
        rows = frame[frame['stepsTaken'].isin(wanted_steps)]
        return rows[colList].copy(), rows['target'].copy()

    train_x, train_y = _select(train)
    test_x, test_y = _select(test)

    return train_x, train_y, test_x, test_y

# Values of `stepsTaken` included in every feature set below.
steps = range(0, 5)

# Feature-set name -> column groups passed to dfByCols; every set includes the raw pixels.
_featureGroups = {
    'pixels': ['pix'],
    'neuronActivations': ['pix', 'act'],
    'SHAP': ['pix', 'shap'],
    'GradCAM': ['pix', 'ngc'],
    'XGradCAM': ['pix', 'xgc'],
    # 'GradCAMPlusPlus': ['pix', 'gcpp'],  # disabled; the original entry used steps=[0]
}

# Each entry holds the (train_x, train_y, test_x, test_y) tuple under the 'data' key.
dataDict = {
    name: {'data': (dfByCols(train, test, steps=steps, cols=groups))}
    for name, groups in _featureGroups.items()
}

def train_and_test(model, x_train, y_train, x_test, y_test, n_iterations=1000):
    """Fit ``model`` and estimate its test accuracy with a bootstrap over the test set.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : array-like
        Training features and labels, passed to ``model.fit`` unchanged.
    x_test, y_test : array-like
        Test features and labels. ``y_test`` may be a pandas Series with any index.
    n_iterations : int, default 1000
        Number of bootstrap resamples of the test set.

    Returns
    -------
    (mean_accuracy, std_accuracy) : tuple of float
        Mean and standard deviation of the accuracy across the bootstrap resamples.

    Raises
    ------
    ValueError
        If the test set is empty or predictions and labels differ in shape.

    Notes
    -----
    Resampling draws from NumPy's global random state, so ``np.random.seed`` controls
    reproducibility.
    """
    model.fit(x_train, y_train)
    y_pred = np.asarray(model.predict(x_test))

    # Work on plain arrays: indexing a pandas Series with an integer array is label-based,
    # which fails (or selects the wrong rows) once the index is no longer 0..n-1, e.g. after
    # the rows were filtered by step.
    y_true = np.asarray(y_test)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_test has shape {y_true.shape} but predictions have shape {y_pred.shape}"
        )
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y_test is empty; cannot bootstrap accuracy")

    # Per-sample correctness is the same for every resample, so compute it once; the
    # accuracy of a resample is then the mean of the selected entries.
    hits = y_true == y_pred

    accuracies = np.empty(n_iterations)
    for k in range(n_iterations):
        # Bootstrap sample: n positions drawn with replacement.
        indices = np.random.randint(0, n_samples, size=n_samples)
        accuracies[k] = hits[indices].mean()

    return np.mean(accuracies), np.std(accuracies)

# Factories (not instances) so that every dataset gets a freshly initialised estimator.
# All estimators share `random_state` so a re-run reproduces the same numbers; the
# gradient-boosting model previously had none, and its documentation lists random_state as
# controlling binning subsampling and the validation split used by early stopping.
# NOTE(review): the 'linear_regression' key actually builds a LogisticRegression; the key is
# kept because it is the label printed in the results.
models = {
    'linear_regression': lambda: LogisticRegression(max_iter=1000, random_state=random_state),
    'decision_tree': lambda: DecisionTreeClassifier(random_state=random_state),
    'random_forest': lambda: RandomForestClassifier(random_state=random_state),
    'hist_gradient_boosting': lambda: HistGradientBoostingClassifier(random_state=random_state)}

# Evaluate every model on every feature set and report bootstrap accuracy.
print(f"Predictions with {steps} steps")
for dataset, _entry in dataDict.items():
    _splits = _entry['data']
    for model_name, model_fn in models.items():
        # A fresh estimator per run, so nothing learned on one dataset carries over.
        model = model_fn()
        accuracy, accStDev = train_and_test(model, *_splits)
        print(f"{dataset}: {model_name} - Accuracy: {accuracy:.2%}, St.dev.: {accStDev:.2%}")



Predictions with range(0, 5) steps
pixels: linear_regression - Accuracy: 51.17% StDev: 0.18%
pixels: decision_tree - Accuracy: 50.49% StDev: 0.17%
pixels: random_forest - Accuracy: 50.54% StDev: 0.17%
pixels: hist_gradient_boosting - Accuracy: 51.06% StDev: 0.17%
neuronActivations: linear_regression - Accuracy: 59.46% StDev: 0.17%


KeyboardInterrupt: 

In [None]:
# Merge the per-configuration datasets into train/test frames (no shuffling is done here).
def _prepareSplit(trainFrames, testFrames):
    """Concatenate frames, separate ``target``, and min-max scale the features.

    The scaler is fitted on the training features only and then applied to the test
    features. Returns ``(train, test, x_train, y_train, x_test, y_test, scaler)`` where
    ``train`` / ``test`` are the unscaled concatenated frames.
    """
    mergedTrain = pd.concat(trainFrames, ignore_index=True)
    mergedTest = pd.concat(testFrames, ignore_index=True)

    xTrain = mergedTrain.drop('target', axis=1)
    yTrain = mergedTrain['target']
    xTest = mergedTest.drop('target', axis=1)
    yTest = mergedTest['target']

    scaler = MinMaxScaler()
    xTrain = pd.DataFrame(scaler.fit_transform(xTrain), columns=xTrain.columns)
    xTest = pd.DataFrame(scaler.transform(xTest), columns=xTest.columns)
    return mergedTrain, mergedTest, xTrain, yTrain, xTest, yTest, scaler


# Activation datasets: train on the 18 intermediate configurations, test on the two
# extremes r00_g10 and r10_g00; act_r10_g10_3000k is left out of both.
(act_train, act_test,
 act_x_train, act_y_train,
 act_x_test, act_y_test,
 act_scaler) = _prepareSplit(
    [act_r01_g10_3000k, act_r02_g10_3000k, act_r03_g10_3000k,
     act_r04_g10_3000k, act_r05_g10_3000k, act_r06_g10_3000k,
     act_r07_g10_3000k, act_r08_g10_3000k, act_r09_g10_3000k,
     act_r10_g01_3000k, act_r10_g02_3000k, act_r10_g03_3000k,
     act_r10_g04_3000k, act_r10_g05_3000k, act_r10_g06_3000k,
     act_r10_g07_3000k, act_r10_g08_3000k, act_r10_g09_3000k],
    [act_r00_g10_3000k, act_r10_g00_3000k],
)

# SHAP datasets: same split; shap_r10_g10_3000k is left out of both.
(shap_train, shap_test,
 shap_x_train, shap_y_train,
 shap_x_test, shap_y_test,
 shap_scaler) = _prepareSplit(
    [shap_r01_g10_3000k, shap_r02_g10_3000k, shap_r03_g10_3000k,
     shap_r04_g10_3000k, shap_r05_g10_3000k, shap_r06_g10_3000k,
     shap_r07_g10_3000k, shap_r08_g10_3000k, shap_r09_g10_3000k,
     shap_r10_g01_3000k, shap_r10_g02_3000k, shap_r10_g03_3000k,
     shap_r10_g04_3000k, shap_r10_g05_3000k, shap_r10_g06_3000k,
     shap_r10_g07_3000k, shap_r10_g08_3000k, shap_r10_g09_3000k],
    [shap_r00_g10_3000k, shap_r10_g00_3000k],
)

# Column names per 7x7 grid cell and colour channel: SHAP columns and the matching
# unprefixed columns that remain once the SHAP columns are dropped.
_gridChannels = [(i, j, c) for i in range(7) for j in range(7) for c in ['r', 'g', 'b']]
shap_cols = [f"shap{i}{j}{c}" for i, j, c in _gridChannels]
non_shap_cols = [f"{i}{j}{c}" for i, j, c in _gridChannels]


def _addShapOntoPixels(scaledFrame):
    """Return the non-SHAP columns with each SHAP column added onto its counterpart."""
    summed = scaledFrame.drop(columns=shap_cols).copy()
    # .values makes the addition positional, pairing shap_cols[k] with non_shap_cols[k].
    summed[non_shap_cols] = summed[non_shap_cols] + scaledFrame[shap_cols].values
    return summed


shap_sum_x_train = _addShapOntoPixels(shap_x_train)
shap_sum_x_test = _addShapOntoPixels(shap_x_test)
# Targets are shared with the plain SHAP split.
shap_sum_y_train = shap_y_train
shap_sum_y_test = shap_y_test

# Pixel-only baselines: strip the explanation columns from the unscaled merged frames.
_activationCols = [f'a{i}' for i in range(1, 129)]
_shapValueCols = [f'shap{i}{j}{c}' for i in range(7) for j in range(7) for c in ['r','g','b']]

pixels_train_from_act = act_train.drop(_activationCols, axis=1).copy()
pixels_train_from_shap = shap_train.drop(_shapValueCols, axis=1).copy()
pixels_test_from_act = act_test.drop(_activationCols, axis=1).copy()
pixels_test_from_shap = shap_test.drop(_shapValueCols, axis=1).copy()
#pixels_train = pd.concat([pixels_train_from_act, pixels_train_from_shap], axis=1)
#pixels_test = pd.concat([pixels_test_from_act, pixels_test_from_shap], axis=1)


def _splitTarget(frame):
    """Return ``(features, target)`` for a frame that has a ``target`` column."""
    return frame.drop('target', axis=1), frame['target']


pixels_act_x_train, pixels_act_y_train = _splitTarget(pixels_train_from_act)
pixels_act_x_test, pixels_act_y_test = _splitTarget(pixels_test_from_act)
pixels_shap_x_train, pixels_shap_y_train = _splitTarget(pixels_train_from_shap)
pixels_shap_x_test, pixels_shap_y_test = _splitTarget(pixels_test_from_shap)

# Apply MinMaxScaler
#pixels_scaler = MinMaxScaler()
#pixels_x_train = pd.DataFrame(pixels_scaler.fit_transform(pixels_x_train), columns=pixels_x_train.columns)
#pixels_x_test = pd.DataFrame(pixels_scaler.transform(pixels_x_test), columns=pixels_x_test.columns)


# Quick visual check of the prepared frames.
# NOTE(review): a notebook cell renders only the value of its last expression, so the first
# head() below is evaluated but its result is not shown.
shap_sum_x_train.head()
pixels_train_from_act.head()

Unnamed: 0,target,00r,00g,00b,01r,01g,01b,02r,02g,02b,...,63b,64r,64g,64b,65r,65g,65b,66r,66g,66b
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
def train_and_test(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` and return its accuracy on the test set.

    NOTE: this redefines (and therefore replaces) the bootstrap version of
    ``train_and_test`` defined in an earlier cell; it returns a single number.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : array-like
        Training features and labels, passed to ``model.fit`` unchanged.
    x_test, y_test : array-like
        Test features and labels.

    Returns
    -------
    float
        Fraction of test samples whose prediction equals the label.

    Raises
    ------
    ValueError
        If the test set is empty or predictions and labels differ in shape.
    """
    model.fit(x_train, y_train)
    y_pred = np.asarray(model.predict(x_test))
    y_true = np.asarray(y_test)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_test has shape {y_true.shape} but predictions have shape {y_pred.shape}"
        )
    if y_pred.size == 0:
        raise ValueError("x_test is empty; accuracy is undefined")

    # Floating-point output of any precision is snapped to the nearest class in {-1, 0, 1}.
    # Checking the array dtype (instead of isinstance on the first element) also covers
    # float32 predictions, which are not instances of Python's float.
    if y_pred.dtype.kind == 'f':
        y_pred = np.clip(np.round(y_pred), -1, 1)

    return float(np.mean(y_true == y_pred))

# Dataset name -> {'data': (x_train, y_train, x_test, y_test)} for the splits prepared
# above. This rebinds the `dataDict` name used by the earlier evaluation cell.
_namedSplits = [
    ('pixels_act', (pixels_act_x_train, pixels_act_y_train, pixels_act_x_test, pixels_act_y_test)),
    ('pixels_shap', (pixels_shap_x_train, pixels_shap_y_train, pixels_shap_x_test, pixels_shap_y_test)),
    ('act', (act_x_train, act_y_train, act_x_test, act_y_test)),
    ('shap', (shap_x_train, shap_y_train, shap_x_test, shap_y_test)),
    ('shap_sum', (shap_sum_x_train, shap_sum_y_train, shap_sum_x_test, shap_sum_y_test)),
]
dataDict = {name: {'data': splits} for name, splits in _namedSplits}

# Factories (not instances) so that every dataset gets a freshly initialised estimator.
# Both estimators now receive the notebook-wide `random_state`, matching the earlier
# evaluation cell, so the reported accuracies can be reproduced on a re-run.
models = {
    #'linear_regression': lambda: LinearRegression(),
    'decision_tree': lambda: DecisionTreeClassifier(random_state=random_state),
    #'random_forest': lambda: RandomForestClassifier(),
    'hist_gradient_boosting': lambda: HistGradientBoostingClassifier(random_state=random_state)}

# Evaluate each model on each prepared split, in a fixed reporting order.
_reportOrder = ('pixels_act', 'pixels_shap', 'act', 'shap', 'shap_sum')
for dataset in _reportOrder:
    _splits = dataDict[dataset]['data']
    for model_name, model_fn in models.items():
        # A fresh estimator per run, so nothing learned on one dataset carries over.
        model = model_fn()
        accuracy = train_and_test(model, *_splits)
        print(f"{dataset}: {model_name} - Accuracy: {accuracy:.2%}")

pixels_act: decision_tree - Accuracy: 49.02%
pixels_act: hist_gradient_boosting - Accuracy: 50.42%
pixels_shap: decision_tree - Accuracy: 50.88%
pixels_shap: hist_gradient_boosting - Accuracy: 51.20%
act: decision_tree - Accuracy: 22.82%
act: hist_gradient_boosting - Accuracy: 25.75%
shap: decision_tree - Accuracy: 88.78%
shap: hist_gradient_boosting - Accuracy: 98.42%
shap_sum: decision_tree - Accuracy: 96.88%
shap_sum: hist_gradient_boosting - Accuracy: 98.47%


In [None]:
# Fit an ordinary linear regression on the scaled activation features and score it as a
# 3-class classifier by snapping its continuous output to the nearest class in {-1, 0, 1}.
actLinRegModel = LinearRegression()
actLinRegModel.fit(act_x_train, act_y_train)
act_y_pred = actLinRegModel.predict(act_x_test)
act_y_pred = np.clip(np.round(act_y_pred), -1, 1) # Round y_pred to int, max 1 and min -1
total_correct = (act_y_test == act_y_pred).sum()
total_samples = len(act_y_test)
print("Linear regression, Activation dataset")
print(f"Total correct predictions: {total_correct} out of {total_samples}")
print(f"Accuracy: {accuracy_score(act_y_test, act_y_pred):.2%}")

# Pin the class order explicitly. Without `labels`, confusion_matrix only includes classes
# that occur in the data, so a missing class would shrink the matrix and the three
# hard-coded names below would no longer line up with its rows and columns.
class_values = [-1, 0, 1]
labels=["red", "fail", "green"]
cm = confusion_matrix(act_y_test, act_y_pred, labels=class_values)
print(f"\nConfusion matrix:\n{cm}")

pd.DataFrame(cm, index=[f"Actual {label}" for label in labels], columns=[f"Predicted {label}" for label in labels])



Linear regression, Activation dataset
Total correct predictions: 2056 out of 4000
Accuracy: 51.40%

Confusion matrix:
[[2003    1    1]
 [   0    1    0]
 [1634  308   52]]


Unnamed: 0,Predicted red,Predicted fail,Predicted green
Actual red,2003,1,1
Actual fail,0,1,0
Actual green,1634,308,52


Earlier..

Done:
- Train 21 models
- Create all activation and shap datasets
- Predicted unseen network datasets

Answered questions:
- Should the models retain the 5% epsilon during dataset creation? Yes
- Does the SHAP background data seem sound? Compare
- Should the decision tree receive coordinates too? Format? Yes, no feature engineering

### Meeting 2025-02-24

Next up:
- Implement Gradcam

Questions:
- Should I switch to an environment where the agent can't see the goal? Yes
- What other XAI methods make sense to implement after gradcam? Wait
- Should I spend time visualising these results to be able to put them into the thesis? Yes

Todo:
- Visualise using shap
- Alter environment




- Create a new baseline for how the models perform if one only plots the pixels