# Unsupervised Bayesian model
This is a template notebook for unsupervised Bayesian outlier detection with Halerium Graphs.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}

## How to use the notebook

The following cells:
- specify objective, variables, and dependencies,
- set up the outlier detection models,
- read dataset,
- present results from the models.

By default, the notebook is set up to run with an example dataset (`height_weight.csv`). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells with your objectives, variables, dataset etc. and then execute all cells in order.

Please refer to bayesian_unsupervised.board for detailed instructions.

In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

# General Imports and Setup

In [0]:
import os
import shutil
from distutils.dir_util import copy_tree

import numpy as np
import pandas as pd
import halerium.core as hal

import matplotlib.pyplot as plt

from joblib import dump, load

# Project

In [0]:
# Name of this outlier detection experiment. The `{{...}}` placeholder is a
# cookiecutter template field, like the author and timestamp in the header.
experiment_name = '{{cookiecutter.use_case_name}}'  # please provide a name for the outlier detection experiment

# Dataset

In [0]:
# Dataset configuration
time_series = False  # set to True when the data is a time series with a 'date' column
path = '{{cookiecutter.data_path}}'  # location of the CSV file to analyse

# The value 'default example' selects the hosted demo dataset.
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/outlier_detection/height_weight.csv'

# A time series is read with its 'date' column parsed and used as the index;
# any other dataset is read with the pandas defaults.
read_csv_kwargs = {'parse_dates': ['date'], 'index_col': 'date'} if time_series else {}
df = pd.read_csv(path, **read_csv_kwargs)

num_col = df.shape[1]  # number of columns in the loaded data

# Start each run from an empty output directory so that artifacts of an
# earlier run cannot be mistaken for results of this one.
out_path = './../out'
isExist = os.path.exists(out_path)
if isExist:
    # Only the direct children need handling: rmtree already removes whole
    # subtrees. Snapshot the listing first so nothing is deleted mid-iteration.
    with os.scandir(out_path) as scan:
        entries = list(scan)
    for entry in entries:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
        else:
            # Regular files and symlinks (including links that point at
            # directories, which shutil.rmtree refuses to remove).
            os.unlink(entry.path)
else:
    os.makedirs(out_path)

Visualising the dataset

In [0]:
# Display the loaded DataFrame as the cell output.
df

In [0]:
# Scatter plot of the raw data: Age on the x-axis, Height on the y-axis.
fig, ax = plt.subplots()
ax.scatter(df["Age"], df["Height"])
ax.set_title("Plot of age against height")
ax.set_xlabel('Age')
ax.set_ylabel('Height')
plt.show()

# Defining the variables

In [0]:
# Model definition. Two variables are modelled:
#   * age    - log-normal, with its mean_log and variance_log given priors;
#   * height - mean is a square-root curve in age with two free parameters,
#              variance has its own prior.
# NOTE(review): names such as `age` and `age_mean_log` are used below without a
# Python assignment in this cell; presumably Halerium makes entities created
# inside `with graph:` available under their names - confirm before refactoring.
graph = hal.Graph("graph")
with graph:
    # Defining a variable you would like to model (usually a feature in the data)
    hal.Variable("age",
                 distribution="LogNormalDistribution") # we pick the log-normal distribution since age is a positive number
    hal.StaticVariable("age_mean_log", # Define the mean log of the variable
                       distribution="NormalDistribution", # the mean log can be negative or positive
                       mean=0, variance=4) # Define the known/assumed statistical properties
    hal.StaticVariable("age_variance_log",
                       distribution="LogNormalDistribution", # a variance has to be positive
                       mean_log=0, variance_log=1,)
    
    # Set the attributes of the variable
    age.mean_log = age_mean_log
    age.variance_log = age_variance_log

    # No distribution is passed for height, so the library default applies
    # (presumably normal, since `mean` and `variance` are set below - confirm).
    hal.Variable("height")
    # Two curve parameters: index 0 scales the sqrt(age) term, index 1 is the offset.
    hal.StaticVariable("height_curve_parameters", shape=(2,), mean=0, variance=1)
    # You may set the mathematical formulation of properties based on other properties
    height.mean = height_curve_parameters[0] * hal.math.sqrt(age) / 2 + height_curve_parameters[1]
    hal.StaticVariable("height_variance",
                       distribution="LogNormalDistribution", # a variance has to be positive
                       mean_log=-3, variance_log=1,)
    # Set the attributes of the variable
    height.variance = height_variance

## Training the model

In [0]:
# Build the posterior model of the graph given the observed columns, then solve it.
observed_data = {graph.age: df["Age"], graph.height: df["Height"]}
posterior_model = hal.get_posterior_model(graph, data=observed_data)
posterior_model.solve()

# Draw 1000 samples of the two curve parameters from the solved model.
post_samples = posterior_model.get_samples(graph.height_curve_parameters, n_samples=1000)

In [0]:
# Scatter of the sampled curve parameters: entry 0 on the x-axis, entry 1 on
# the y-axis. The samples are converted to an array once and reused.
parameter_samples = np.array(post_samples)
plt.title("Plot of model parameters")
plt.scatter(parameter_samples[:, 0], parameter_samples[:, 1])
plt.xlabel('height_curve_parameters[0]')
plt.ylabel('height_curve_parameters[1]')
plt.show()

In [0]:
# Set up a trainer for the graph on the observed columns, then call it to
# obtain the trained graph.
trainer = hal.Trainer(graph, data={graph.age: df["Age"], graph.height: df["Height"]})
trained_graph = trainer()

In [0]:
# Mean and standard deviation of the curve parameters in the trained graph
curve_parameter_predictor = hal.Predictor(trained_graph, measure=('mean', 'standard_deviation'))
curve_parameter_predictor([trained_graph.height_curve_parameters])

In [0]:
# Mean and standard deviation of the height variance in the trained graph
height_variance_predictor = hal.Predictor(trained_graph, measure=('mean', 'standard_deviation'))
height_variance_predictor([trained_graph.height_variance])

In [0]:
# Draw 1000 samples of age and height from the generative model of the trained graph
sample_targets = {"age": trained_graph.age, "height": trained_graph.height}
generative_model = hal.get_generative_model(trained_graph)
samples = generative_model.get_samples(sample_targets, n_samples=1000)

In [0]:
# Overlay the samples drawn from the trained graph with the observed data.
# Axis labels match the raw-data plot so the figure can be read on its own.
plt.title('Plot of points from original data and trained graph')
plt.scatter(np.reshape(samples["age"], -1), np.reshape(samples["height"], -1), label='From graph')
plt.scatter(df["Age"], df["Height"], label='From data')
plt.xlabel('Age')
plt.ylabel('Height')
plt.legend()
plt.show()

In [0]:
# Evenly spaced ages (100 points from 100 to 250) at which to predict the mean height.
age_input = np.linspace(100, 250, 100)
height_predictor = hal.Predictor(trained_graph, measure='mean', data={trained_graph.age: age_input})
height_pred = height_predictor(trained_graph.height)

In [0]:
# Predicted mean height over the age grid, drawn on top of the observed data.
# Title and axis labels are set so the figure can be read on its own.
plt.title('Plot of predicted trend and original data')
plt.plot(age_input, height_pred, color='r', label='Predicted trend')
plt.scatter(df["Age"], df["Height"], label='Original Data')
plt.xlabel('Age')
plt.ylabel('Height')
plt.legend()
plt.show()

## Detecting the outliers

In [0]:
# Detect outliers in the data
# NOTE(review): the data is keyed by variables of `graph` while the detector is
# built on `trained_graph` (the trend prediction cell keys its data by
# `trained_graph.age`); presumably Halerium resolves both to the same
# variables - confirm.
out_detector = hal.objectives.OutlierDetector(trained_graph, data={graph.age: df["Age"], graph.height: df["Height"]})

In [0]:
# Univariate outliers and outlier for entire data point
# The result is indexable by name; the plotting cell reads its 'graph' entry.
out_detector()

In [0]:
# Outlier flags judged in the context of the whole graph.
outlier_flags = out_detector()['graph']
inlier_flags = ~outlier_flags

# Non-outliers are drawn in green, outliers in red.
fig, ax = plt.subplots()
ax.set_title('Plot of outlier points')
ax.scatter(df["Age"][inlier_flags], df["Height"][inlier_flags], color="green", label='Non-outlier')
ax.scatter(df["Age"][outlier_flags], df["Height"][outlier_flags], color="red", label='Outlier')
ax.legend()
plt.show()

## Outlier prediction and export

In [0]:
# Label every row of the input data: 1 = flagged as outlier, 0 = not flagged.
# The flags are applied by row position, not by index label, so rows that
# share an index label (e.g. repeated dates in a time series) are only marked
# when they were flagged themselves.
outlier_mask = np.asarray(outlier_flags, dtype=bool)
outliers = np.flatnonzero(outlier_mask).tolist()  # positional indices of the flagged rows

df_final = df.copy()
df_final['outlier'] = outlier_mask.astype('int64')
df_final

In [0]:
# Export the labeled data (input columns plus the 'outlier' column) as CSV
# NOTE(review): the target file name has no `.csv` extension - confirm what
# downstream consumers expect before renaming it.
df_final.to_csv('./../out/labeled_data')

In [0]:
# Export the model artifacts to the output directory:
#   - the untrained graph definition,
#   - the trained graph,
#   - a joblib bundle holding the input DataFrame and the `time_series` flag.
graph.dump_file('./../out/graph.json')
trained_graph.dump_file("./../out/trained_graph.json")
dump([df, time_series], './../out/unsupervised_bayesian.joblib')

In [0]:
# Show the trained graph
hal.show(trained_graph)