# Unsupervised Bayesian model
This is a template notebook for unsupervised Bayesian outlier detection with Halerium Graphs.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}

In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

## How to use the notebook

The following cells:
- specify objective, variables, and dependencies,
- set up the outlier detection models,
- read dataset,
- present results from the models.

By default, the notebook is set up to run with an example dataset (the height/weight example). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to bayesian_unsupervised.board for detailed instructions.

In [0]:
# <halerium id="ac2233b1-cc3d-497b-9b67-39c6f0365a86">
# Link to bayesian_unsupervised.board
# </halerium id="ac2233b1-cc3d-497b-9b67-39c6f0365a86">


### Imports

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import numpy as np
import pandas as pd
import halerium.core as hal

import matplotlib.pyplot as plt

from joblib import dump, load

### 2. Import the Dataset

In [0]:
# <halerium id="35aaa8c6-7a71-468d-94fc-5a659a6e5098">
# When True, the loading cell parses the 'date' column as dates and uses it as the index.
time_series = False # Specify if the data is time series
# The placeholder 'default example' is replaced by the hosted example CSV in the loading cell.
path = 'default example' # Specify the path of the data
# </halerium id="35aaa8c6-7a71-468d-94fc-5a659a6e5098">


Importing the dataset

In [0]:
# Swap the placeholder for the hosted example dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/height_weight.csv'

# Time series data is indexed by its parsed 'date' column; other data is read as-is.
read_options = {'parse_dates': ['date'], 'index_col': 'date'} if time_series else {}
df = pd.read_csv(path, **read_options)

# Number of columns in the loaded dataset.
num_col = df.shape[1]

Visualising the dataset

In [0]:
# Display the loaded DataFrame as the cell output for a quick visual check.
df

Creating the ./out folder for artifacts (any existing contents are removed)

In [0]:
# Start every run with an empty ./out folder so stale artifacts never mix with new ones.
out_path = './out'

# Create the folder if needed; a no-op when it already exists (no exists-then-create race).
os.makedirs(out_path, exist_ok=True)

# Snapshot the top-level entries first so nothing is deleted while the directory is being iterated.
with os.scandir(out_path) as out_entries:
    stale_entries = list(out_entries)

for entry in stale_entries:
    if entry.is_dir(follow_symlinks=False):
        # Real directories are removed recursively in one go.
        shutil.rmtree(entry.path)
    else:
        # Files and symlinks (including links to directories, which
        # shutil.rmtree refuses to handle) are simply unlinked.
        os.unlink(entry.path)

In [0]:
# Scatter the raw data: age on the x-axis, height on the y-axis.
fig, ax = plt.subplots()
ax.scatter(df["Age"], df["Height"])
ax.set_title("Plot of age against height")
ax.set_xlabel('Age')
ax.set_ylabel('Height')
plt.show()

### 3. Model the Domain Knowledge

In [0]:
# Build the Halerium graph that encodes the assumed relationship between age and height.
# Summary of the structure defined below:
#   - age: log-normal, with static (dataset-wide) variables for its mean_log and variance_log
#   - height: mean follows a square-root curve in age, with a static variable for its variance
graph = hal.Graph("graph")
# <halerium id="ed8466e2-4290-4ea2-96c3-a56e6986fda5">
with graph:
    # Defining a variable you would like to model (usually a feature in the data)
    hal.Variable("age",
                 distribution="LogNormalDistribution") # we pick the log-normal distribution since age is a positive number
    hal.StaticVariable("age_mean_log", # Define the mean log of the variable
                       distribution="NormalDistribution", # the mean log can be negative or positive
                       mean=0, variance=4) # Define the known/assumed statistical properties
    hal.StaticVariable("age_variance_log",
                       distribution="LogNormalDistribution", # a variance has to be positive
                       mean_log=0, variance_log=1,)
    
    # Set the attributes of the variable
    # NOTE(review): `age`, `age_mean_log`, etc. are used as bare names here, so this relies on
    # Halerium making variables created inside the `with graph:` scope available by name — confirm.
    age.mean_log = age_mean_log
    age.variance_log = age_variance_log

    # No distribution is passed for height, so whatever Halerium uses by default applies.
    hal.Variable("height")
    # Two curve parameters: index 0 scales the sqrt(age) term, index 1 is the constant offset.
    hal.StaticVariable("height_curve_parameters", shape=(2,), mean=0, variance=1)
    # You may set the mathematical formulation of properties based on other properties
    height.mean = height_curve_parameters[0] * hal.math.sqrt(age) / 2 + height_curve_parameters[1]
    hal.StaticVariable("height_variance",
                       distribution="LogNormalDistribution", # a variance has to be positive
                       mean_log=-3, variance_log=1,)
    # Set the attributes of the variable
    height.variance = height_variance
# </halerium id="ed8466e2-4290-4ea2-96c3-a56e6986fda5">


In [0]:
# Map each graph variable to the DataFrame column that holds its observations.
data = {
    graph.age: df["Age"],
    graph.height: df["Height"],
}

### 4. Run the Model

#### Training the model

In [0]:
from functions.bayesian_unsupervised import get_posterior_samples

# <halerium id="20f4eed3-a061-42a2-8e75-36be4bcece8b">
# Project helper called with the graph and the variable-to-data mapping.
# NOTE(review): presumably returns posterior samples of the graph's parameters — confirm in functions/bayesian_unsupervised.
post_samples = get_posterior_samples(graph, data)
# </halerium id="20f4eed3-a061-42a2-8e75-36be4bcece8b">


In [0]:
# Convert the samples to an array once, then plot column 0 against column 1.
posterior_array = np.array(post_samples)
first_column = posterior_array[:, 0]
second_column = posterior_array[:, 1]

plt.scatter(first_column, second_column)
plt.title("Plot of model parameters")
plt.show()

In [0]:
# Train the graph on the observed data and keep the resulting trained graph.
# The shared `data` mapping is reused (rather than rebuilding the dict inline) so that
# training, posterior sampling and outlier detection always see the same inputs.
trainer = hal.Trainer(graph, data=data)
trained_graph = trainer()

In [0]:
# Mean and standard deviation of height_curve_parameters, predicted from the trained graph
hal.Predictor(trained_graph, measure=('mean', 'standard_deviation'))([trained_graph.height_curve_parameters])

In [0]:
# Mean and standard deviation of height_variance, predicted from the trained graph
hal.Predictor(trained_graph, measure=('mean', 'standard_deviation'))([trained_graph.height_variance])

In [0]:
# Draw 1000 samples of age and height from the generative model of the trained graph.
generative_model = hal.get_generative_model(trained_graph)
sample_targets = {"age": trained_graph.age, "height": trained_graph.height}
samples = generative_model.get_samples(sample_targets, n_samples=1000)

In [0]:
# Overlay samples drawn from the trained graph on the observed data for a visual fit check.
plt.title('Plot of points from original data and trained graph')
# The sample arrays are flattened to 1-D before plotting.
plt.scatter(np.reshape(samples["age"], -1), np.reshape(samples["height"], -1), label='From graph')
plt.scatter(df["Age"], df["Height"], label='From data')
# Axis labels match the first age/height scatter plot so the figure stands on its own.
plt.xlabel('Age')
plt.ylabel('Height')
plt.legend()
plt.show()

In [0]:
# Evaluate the mean predicted height on an evenly spaced grid of 100 ages from 100 to 250.
age_input = np.linspace(start=100, stop=250, num=100)
height_predictor = hal.Predictor(trained_graph, measure='mean', data={trained_graph.age: age_input})
height_pred = height_predictor(trained_graph.height)

In [0]:
# Compare the predicted mean-height trend with the observed data.
# A title and axis labels are set, as in the other figures, so the plot stands on its own.
plt.title('Plot of predicted height trend against original data')
plt.plot(age_input, height_pred, color='r', label='Predicted trend')
plt.scatter(df["Age"], df["Height"], label='Original Data')
plt.xlabel('Age')
plt.ylabel('Height')
plt.legend()
plt.show()

#### Detecting the outliers

In [0]:
# Detect outliers in the data
# The detector is built from the trained graph and the same variable-to-data mapping used earlier.
out_detector = hal.objectives.OutlierDetector(trained_graph, data=data)

In [0]:
# Univariate outliers and outlier for entire data point
# Calling the detector returns a result indexable by name; its 'graph' entry is used in the results section.
out_detector()

### 5. Get the Results

In [0]:
# <halerium id="2c00c626-48a2-4521-9ecf-74afdcfc9fcb">
# NOTE(review): outlier_flags is negated with `~` and used as an index, so it is presumably a
# boolean mask with one entry per data row — confirm.
outlier_flags = out_detector()['graph'] # outliers in context of whole graph
plt.title('Plot of outlier points')
# Rows not flagged are drawn in green, flagged rows in red.
plt.scatter(df["Age"][~outlier_flags], df["Height"][~outlier_flags], color="green", label='Non-outlier')
plt.scatter(df["Age"][outlier_flags], df["Height"][outlier_flags], color="red", label='Outlier')
plt.legend()
plt.show()
# </halerium id="2c00c626-48a2-4521-9ecf-74afdcfc9fcb">


In [0]:
from functions.bayesian_unsupervised import get_outlier_df

# Project helper called with the dataset and the outlier flags.
# NOTE(review): presumably returns the dataset annotated with the flags — confirm in functions/bayesian_unsupervised.
df_final = get_outlier_df(df, outlier_flags)
# Display the result as the cell output.
df_final

### 7. Export and use Streamlit

In [0]:
from functions.bayesian_unsupervised import export_model

# Project helper called with the result frame, both graphs, the original data and the time-series flag.
# NOTE(review): presumably writes the artifacts used by the Streamlit app — confirm the output location.
export_model(df_final, graph, trained_graph, df, time_series)

In [0]:
# Show the trained graph using Halerium's viewer
hal.show(trained_graph)