# Equation fit

Using Halerium graphs.

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}

## How to use the notebook

The following cells:
- specify objective, variables, and variable types,
- read the dataset,
- set up the equations,
- present results from the tests.

By default, the notebook is set up to run with an example (wine quality). To see how it works, run the notebook without changing the code.

For your project, adjust the code in the linked cells to match your objectives, variables, dataset, etc., and then execute all cells in order.

Please refer to equation.board for detailed instructions.

In [0]:
# Link to project experiments folder hypothesis_experiment_learnings.board (refresh and hit enter on this line to see the link)

### Project

In [0]:
# Name of this experiment. The value is a template placeholder; replace it with your own name.
experiment_name = '{{cookiecutter.use_case_name}}'  # please provide a name for the experiment

### Imports

In [0]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import halerium.core as hal

from halerium.core import Graph, Entity, Variable, StaticVariable, link, DataLinker
from halerium.core import get_posterior_model, get_generative_model
from halerium.core.model import Trainer
from halerium.objectives import Predictor

from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

In [0]:
# --- Data configuration and loading ---------------------------------------
# time_series: set to True when the file has a 'date' column that should be
#              parsed as datetimes while reading.
# test_size:   share of the rows held out for testing in the split cell below.
# path:        location of the CSV file; the value 'default example' selects
#              the wine-quality example dataset.
time_series = False
test_size = 0.25
path = '{{cookiecutter.data_path}}' # Specify the path of the data

if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/hypothesis_testing/WineQT.csv'

# sep=None asks pandas to detect the delimiter, which only the Python parser
# engine supports. Requesting that engine explicitly avoids the ParserWarning
# pandas raises when it has to fall back from the C engine. Using the same
# options in both modes gives time-series files delimiter detection as well.
read_options = {'sep': None, 'engine': 'python'}
if time_series:
    read_options['parse_dates'] = ['date']

df = pd.read_csv(path, **read_options)
# Show a short preview instead of dumping the whole frame.
df.head()

In [0]:
# Split the data into a training set and a hold-out test set.
# - A fixed seed makes the random split, and every metric computed from it,
#   reproducible across "Restart Kernel -> Run All".
# - Time-series data is split without shuffling so that the test rows are the
#   last rows of the file and no later rows end up in the training set.
split_seed = 42
train, test = train_test_split(
    df,
    test_size=test_size,
    shuffle=not time_series,
    random_state=split_seed,
)

# Renumber the rows from 0 without mutating in place. The previous row labels
# are kept in an 'index' column, so rows can still be traced back to `df`.
train = train.reset_index()
test = test.reset_index()
test.head()

In [0]:
# Build the Halerium graph that encodes the equation to be fitted:
#     pH = a0 + a1 * fixed_acidity + volatile_acidity ** a2
# The variables created here are addressed in the cells below as
# graph.inputs.<name> and graph.outputs.<name>.
graph = Graph("graph")
with graph:
    # NOTE(review): `inputs` and `outputs` are not assigned anywhere in this
    # notebook; presumably Halerium makes the graph's input/output scopes
    # available under these names inside `with graph:` — confirm.
    with inputs:
        # Inputs for the equation (the x); you may specify the mean and variance if they are known
        fixed_acidity = Variable('fixed_acidity')
        volatile_acidity = Variable('volatile_acidity')
        # Example of an input declared with a known mean and variance:
        # volatile_acidity = Variable('volatile_acidity', mean = 0.2, variance = 0.01)

    with outputs:
        # Target of the equation (the y); its mean is set by the equation below
        pH = Variable('pH')

    # Free parameters of the equation, grouped in their own entity
    model_parameters = Entity('model_parameters')
    with model_parameters:
        # Each parameter is declared with mean 0 and variance 10**2 (= 100)
        a0 = StaticVariable('a0', mean=0, variance=10**2)
        a1 = StaticVariable('a1', mean=0, variance=10**2)
        a2 = StaticVariable('a2', mean=0, variance=10**2)

    # The equation: offset a0, linear term in fixed_acidity with slope a1,
    # and volatile_acidity raised to the power a2
    pH.mean = a0 + a1 * fixed_acidity + volatile_acidity ** a2
    # You may specify the variance if you have the domain knowledge
    # pH.variance = a0 + 1

In [0]:
# 'Training' the model
model = get_posterior_model(
    graph=graph,
    data={
        graph.inputs.fixed_acidity: train["fixed acidity"],
        graph.inputs.volatile_acidity: train["volatile acidity"],
        graph.outputs.pH: train["pH"],
    })
posterior_graph = model.get_posterior_graph()

In [0]:
# Predict pH for the held-out test rows: build a generative model from the
# posterior graph, feeding in only the test inputs (the pH column is withheld).
model = get_generative_model(
    graph=posterior_graph,
    data={
        graph.inputs.fixed_acidity: test["fixed acidity"],
        graph.inputs.volatile_acidity: test["volatile acidity"],
    }
)

predicted_pH = model.get_means(graph.outputs.pH)
true_pH = list(test['pH'])

# Convert both sides to arrays before subtracting. A Python list has no `-`
# operator, so the previous `true_pH - predicted_pH` only worked when the
# predictions happened to be a NumPy array.
# NOTE(review): assumes get_means returns one value per test row — confirm.
residuals = np.asarray(true_pH) - np.asarray(predicted_pH)

# Residual plot with labelled axes so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(residuals)
ax.set_title('Difference between true and predicted values')
ax.set_xlabel('Test sample')
ax.set_ylabel('True pH - predicted pH')
plt.show()

# Summary metrics on the test set.
mse = mean_squared_error(true_pH, predicted_pH)
r2 = r2_score(true_pH, predicted_pH)
print('mse:', mse, 'r2', r2)