# Supervised Bayesian model

This is a template notebook for supervised Bayesian outlier detection with Halerium Causal Structures.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}


## How to use the notebook

The following cells:
- specify objective, variables, and dependencies,
- set up the outlier detection models,
- read the dataset,
- present results from the models.

By default, the notebook is set up to run with an example dataset (labeled height/weight data). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to bayesian_supervised.board for detailed instructions.

In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

# General Imports and Setup

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import numpy as np
import pandas as pd
import halerium.core as hal
from halerium import CausalStructure

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

from joblib import dump, load

# Project

In [0]:
# Name of this outlier detection experiment. The value is a cookiecutter
# template placeholder (presumably substituted when the template is rendered).
experiment_name = '{{cookiecutter.use_case_name}}'  # please provide a name for the outlier detection experiment

# Dataset

In [0]:
# --- Dataset configuration -------------------------------------------------
time_series = False  # Specify if the data is time series
path = '{{cookiecutter.data_path}}'  # Specify the path of the data
test_size = 0.25  # fraction of the rows held out for testing

# The sentinel value selects the bundled example dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/labeled_height_weight.csv'

# Time series data is indexed by its parsed 'date' column.
if time_series:
    df = pd.read_csv(path, parse_dates=['date'], index_col='date')
else:
    df = pd.read_csv(path)

num_col = len(df.columns)

# --- Output directory ------------------------------------------------------
# Every run starts from an empty output directory (existing contents are
# deleted!). A dedicated name is used so that `path` keeps pointing at the
# dataset after this cell has run.
out_path = './../out'
os.makedirs(out_path, exist_ok=True)  # no-op when the directory already exists

# Only the top level needs to be visited: rmtree removes whole subtrees.
for entry_name in os.listdir(out_path):
    entry_path = os.path.join(out_path, entry_name)
    if os.path.isdir(entry_path) and not os.path.islink(entry_path):
        shutil.rmtree(entry_path)
    else:
        # Regular files and symlinks (rmtree refuses to operate on symlinks).
        os.unlink(entry_path)

In [0]:
# Display the loaded dataset for a quick visual check.
df

In [0]:
# Pairwise scatter matrix of all columns, coloured by the label column:
# label 0 is drawn in cycle colour "C0", label 1 in cycle colour "C3".
pairplot_hue = 'outlier'
palette = {
    0: "C0",
    1: "C3",
}
sns.pairplot(data=df, palette=palette, hue=pairplot_hue)

## Split the Data

In [0]:
# Fixed seed so the train/test partition (and every metric derived from it)
# is identical on each Restart & Run All. Change the value to draw a
# different split, or set it to None to resample on every run.
split_seed = 42
df_train, df_test = train_test_split(df, test_size=test_size, random_state=split_seed)

## Model the Causal Structure

In [0]:
# Map the raw dataset columns onto the node names used in the dependency list.
feature_columns = {"Age": "(age)", "Height": "(height|age)"}
label_columns = {"outlier": "(outlier|age,height)"}

# Training frame: the two features plus the outlier label.
train_columns = {**feature_columns, **label_columns}
data = df_train[list(train_columns)].rename(columns=train_columns)

# Test frame: features only, the label column is left out.
test_data = df_test[list(feature_columns)].rename(columns=feature_columns)

In [0]:
# Display the training frame with the renamed columns.
data

In [0]:
# Each entry reads [parent(s), child]: the child column depends on the
# parent column (or on every column in the parent list).
dependencies = [
    # the column '(height|age)' depends on '(age)'
    ["(age)", "(height|age)"],
    # the column '(outlier|age,height)' depends on '(age)' and '(height|age)'
    [
        ["(age)", "(height|age)"],
        "(outlier|age,height)",
    ],
]

In [0]:
# Predicted values at or above this threshold are labelled as outliers (1),
# values below it as non-outliers (0), in the outlier prediction cell.
outlier_threshold = 0.5
# Build the causal structure from the declared dependencies and train it on
# the training frame.
causal_structure = CausalStructure(dependencies)
causal_structure.train(data)
# Predict for the test rows. test_data has no '(outlier|age,height)' column;
# later cells read that column from the returned frame.
prediction = causal_structure.predict(data=test_data)
# Second prediction call that additionally requests standard deviations.
# NOTE(review): prediction_mean / prediction_std are not used by any cell
# shown in this notebook — confirm whether this extra call is still needed.
prediction_mean, prediction_std = causal_structure.predict(
    data=test_data, return_std=True)

## Outlier Prediction

In [0]:
# Binarise the predicted outlier score in place (the evaluation cell reads
# this column from `prediction`): 0 below the threshold, 1 at or above it.
#
# Both masks are computed from the raw scores *before* anything is
# overwritten. Evaluating the second condition after the first assignment
# would compare against values already replaced by 0, which mislabels
# negative scores as outliers whenever outlier_threshold <= 0.
# Missing scores (NaN) match neither mask and are left untouched.
outlier_column = '(outlier|age,height)'
outlier_scores = prediction[outlier_column]
below_threshold = outlier_scores < outlier_threshold
at_or_above_threshold = outlier_scores >= outlier_threshold

prediction.loc[below_threshold, outlier_column] = 0
prediction.loc[at_or_above_threshold, outlier_column] = 1
prediction

In [0]:
# Compare the thresholded predictions with the true labels of the test split.
y_test = df_test['outlier']
y_pred = prediction['(outlier|age,height)']

# Pin the label set (assumes labels are encoded as 0 = non-outlier,
# 1 = outlier, matching the target names below). Without it, a test split or
# prediction containing only one class produces a 1x1 confusion matrix, which
# breaks the 4-way unpack, and a report whose class count no longer matches
# the two target names.
class_labels = [0, 1]

precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, y_pred, labels=class_labels, average='macro')
report = classification_report(
    y_test, y_pred, labels=class_labels, target_names=['Non-outlier', 'Outlier'])

# For binary labels, ravel() flattens the 2x2 matrix in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=class_labels).ravel()
confusion_counts = pd.Series(
    [tn, fp, fn, tp],
    index=[
        'True Negatives (Non-outliers)',
        'False Positives (Non-outliers predicted as outliers)',
        'False Negatives (Outliers predicted as non-outliers)',
        'True Positives (Outliers)',
    ],
)
print(confusion_counts)
print(report)

In [0]:
# Persist the inputs needed to rebuild the model elsewhere: the training
# frame, the dependency list, the full dataset, the time-series flag and the
# outlier threshold. The trained causal structure object itself is not saved.
export_payload = [
    data,
    dependencies,
    df,
    time_series,
    outlier_threshold,
]
dump(export_payload, './../out/supervised_bayesian.joblib')