# Supervised Bayesian model

This is a template notebook for supervised Bayesian outlier detection with Halerium Causal Structures.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}


In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

## How to use the notebook

The following cells:
- specify objective, variables, and dependencies,
- set up the outlier detection models,
- read dataset,
- present results from the models.

By default, the notebook is set up to run with an example (labeled height weight). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to bayesian_supervised.board for detailed instructions.

In [0]:
# <halerium id="eb15af0d-ec25-40b4-b26c-e7e1133a12e1">
# Link to bayesian_supervised.board
# </halerium id="eb15af0d-ec25-40b4-b26c-e7e1133a12e1">


## Imports

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import numpy as np
import pandas as pd
import halerium.core as hal
from halerium import CausalStructure

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from joblib import dump, load

### 2. Import the Dataset

In [0]:
# <halerium id="1cf010bb-cad1-4cde-8d25-90c1bd5b2870">
# Set to True when the dataset is a time series; the loader then parses a
# 'date' column and uses it as the index.
time_series = False

# Location of the dataset; keep 'default example' to load the bundled example.
path = 'default example'

# Passed to train_test_split as the share of rows held out for testing.
test_size = 0.25
# </halerium id="1cf010bb-cad1-4cde-8d25-90c1bd5b2870">


Importing the dataset

In [0]:
# Swap the placeholder for the URL of the example dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/labeled_height_weight.csv'

# Time series data is read with its 'date' column parsed and used as the index;
# any other data is read with the pandas defaults.
read_options = {'parse_dates': ['date'], 'index_col': 'date'} if time_series else {}
df = pd.read_csv(path, **read_options)

# Number of columns in the loaded dataset.
num_col = len(df.columns)

Visualising the dataset

In [0]:
# Show the loaded dataset as the cell output.
df

Creating the ./out folder for artifacts (if the folder already exists, its contents are deleted)

In [0]:
# Folder that receives the exported artifacts. A dedicated name is used so the
# dataset location stored in `path` is not overwritten; re-running the dataset
# import cell after this one then still reads the dataset.
out_dir = './out'

# Create the folder if it is missing (no-op when it already exists).
os.makedirs(out_dir, exist_ok=True)

# Empty the folder while keeping the folder itself. The entries are collected
# first so nothing is deleted while the directory is still being iterated.
with os.scandir(out_dir) as entries:
    stale_entries = list(entries)

for entry in stale_entries:
    if entry.is_dir(follow_symlinks=False):
        shutil.rmtree(entry.path)
    else:
        # Regular files and symlinks (including symlinks to directories).
        os.unlink(entry.path)

In [0]:
# Pair plot of every column, coloured by the value of the 'outlier' column
# (value 0 is drawn with colour "C0", value 1 with colour "C3").
palette = {0: "C0", 1: "C3"}
pairplot_hue = 'outlier'

sns.pairplot(
    data=df,
    hue=pairplot_hue,
    palette=palette,
)
plt.show()

Split the Data

In [0]:
# Fixed seed so the train/test split (and every result derived from it) is
# identical on each Restart & Run All.
random_state = 42
df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state)

### 3. Model the Domain Knowledge

In [0]:
# <halerium id="cf94ca51-8797-46f0-90ef-c1f1ff1eab57">
# Training data: model column name -> column of the training split.
# The training frame includes the 'outlier' label column.
train_columns = {
    "(age)": df_train["Age"],
    "(height|age)": df_train["Height"],
    "(outlier|age,height)": df_train["outlier"],
}
data = pd.DataFrame(data=train_columns)
# </halerium id="cf94ca51-8797-46f0-90ef-c1f1ff1eab57">

# <halerium id="cf94ca51-8797-46f0-90ef-c1f1ff1eab57">
# Test data: same feature columns taken from the test split, without the
# 'outlier' label column.
test_columns = {
    "(age)": df_test["Age"],
    "(height|age)": df_test["Height"],
}
test_data = pd.DataFrame(data=test_columns)
# </halerium id="cf94ca51-8797-46f0-90ef-c1f1ff1eab57">


In [0]:
# Show the training frame built for the model as the cell output.
data

In [0]:
# <halerium id="cf94ca51-8797-46f0-90ef-c1f1ff1eab57">
# Each entry has the form [parent(s), child]: the child column depends on the
# parent column, or on every column in the parent list.
dependencies = [
    # '(height|age)' depends on '(age)'
    ["(age)", "(height|age)"],
    # '(outlier|age,height)' depends on both '(age)' and '(height|age)'
    [
        ["(age)", "(height|age)"],
        "(outlier|age,height)",
    ],
]
# </halerium id="cf94ca51-8797-46f0-90ef-c1f1ff1eab57">


### 4. Run the Model

In [0]:
# run_model is a project helper imported from functions.bayesian_supervised.
from functions.bayesian_supervised import run_model

# Threshold handed to run_model.
# NOTE(review): presumably the probability above which a row is flagged as an
# outlier - confirm against run_model.
outlier_threshold = 0.5
# Run the model with the dependency list, the labelled training frame, the
# threshold and the unlabelled test frame; the return value is kept as
# `prediction` and displayed below.
prediction = run_model(dependencies, data, outlier_threshold, test_data)
# <halerium id="bbac5424-c901-4981-bc2b-890c38bded3d">
prediction
# </halerium id="bbac5424-c901-4981-bc2b-890c38bded3d">


### 5. Get the results

In [0]:
# show_results is a project helper imported from functions.bayesian_supervised.
from functions.bayesian_supervised import show_results

# <halerium id="6a35ae62-a925-4e36-8e0a-7586227a67a1">
# Pass the test split (which still contains the 'outlier' label column) together
# with the model's prediction to the results helper.
show_results(df_test, prediction)
# </halerium id="6a35ae62-a925-4e36-8e0a-7586227a67a1">


### 7. Export and use Streamlit

In [0]:
# Persist the model inputs and settings to ./out/supervised_bayesian.joblib as
# one list: [training frame, dependencies, full dataset, time-series flag,
# outlier threshold]. No fitted model object is stored here.
# NOTE(review): presumably the Streamlit app loads this file and relies on the
# order of the list - confirm before changing it.
dump([data, dependencies, df, time_series, outlier_threshold], './out/supervised_bayesian.joblib')