# Causal Structures
Using Halerium Causal Structures

Author: {{ cookiecutter.author_name }}
Created: {{ cookiecutter.timestamp }}

In [0]:
# Link to experiments card (refresh and hit enter on this line to see the link)

### Imports

In [0]:
import numpy as np
import pandas as pd
import halerium.core as hal

import itertools
from itertools import chain, combinations

from sklearn.model_selection import train_test_split

import networkx as nx
import matplotlib.pyplot as plt

### Project

In [0]:
# Label for this experiment; the value is a cookiecutter placeholder until the template is rendered.
experiment_name = '{{cookiecutter.use_case_name}}'  # please provide a name for the hypothesis testing experiment

### Dataset

In [0]:
# Dataset configuration
time_series = False  # True: parse the 'date' column as datetimes while reading
test_size = 0.25  # handed to train_test_split as `test_size` in the modelling cells
path = '{{cookiecutter.data_path}}' # Specify the path of the data

# The 'default example' sentinel is swapped for the hosted WineQT sample file
if path == 'default example':
    path = 'https://raw.githubusercontent.com/erium/halerium-example-data/main/hypothesis_testing/WineQT.csv'

# Both modes read the same file; only the keyword arguments differ
read_options = {'parse_dates': ['date']} if time_series else {'delimiter': ';'}
df = pd.read_csv(path, **read_options)
df

## Manual Modelling
Manually specify the dependencies in the causal structure

In [0]:
# Directed dependencies
# Columns supplied as inputs at prediction time, and columns whose fit is reported
features_input = ['fixed acidity', 'volatile acidity']
features_output = ['pH']

# Directed dependencies: each pair is drawn as an edge from its first to its second entry
dependencies = [
    ['fixed acidity', 'pH'],
    ['volatile acidity', 'pH'],
]

In [0]:
# Every feature named in `dependencies`, in order of first appearance.
# A set was used here before, but string hashing is randomised per Python
# process, so the feature/column order changed from one kernel session to the next.
features = list(dict.fromkeys(item for sublist in dependencies for item in sublist))
data = df[features]
train, test = train_test_split(data, test_size = test_size)

causal_structure = hal.CausalStructure(dependencies)
causal_structure.train(train)

# reset_index() (without drop) renumbers the rows and keeps the old labels in an
# 'index' column, exactly as the previous in-place call did, but without
# mutating a frame derived from `test`.
test_input = test[features_input].reset_index()
test_output = test[features_output].reset_index()

# One [feature, influence] entry per feature of the structure
influences = [
    [feature, causal_structure.evaluate_objective(hal.InfluenceEstimator, target=feature)]
    for feature in features
]
# Score on the held-out rows, using `features_input` as the given inputs
evaluation = causal_structure.evaluate_objective(hal.Evaluator, data=test,
                                    inputs=features_input, metric="r2")
prediction_mean, prediction_std = causal_structure.predict(data=test_input, return_std=True)

### Results

In [0]:
# Labels for the combined frame: the means first, then one ' std' column per feature
std_columns = [feature + ' std' for feature in features]

for feature_out in features_output:
    print("Output Feature:", feature_out)
    prediction = pd.concat([prediction_mean, prediction_std], axis=1)
    prediction.columns = features + std_columns
    print("r2:", evaluation[feature_out])

    for feature_in in features_input:
        # Sorted copy so the line and the band are drawn left to right
        ordered = prediction.sort_values(by=[feature_in])
        # Separate loop variable: the outer `feature_out` must not be shadowed
        for plotted_output in features_output:
            mean = ordered[plotted_output]
            # Pick the std column by name. Subtracting the whole std frame from the
            # mean frame aligns on column labels ('pH' vs 'pH std'), which yields
            # only NaN and left the uncertainty band invisible.
            std = ordered[plotted_output + ' std']
            plt.plot(ordered[feature_in], mean, color="red")
            plt.fill_between(ordered[feature_in], mean - std, mean + std,
                             color="red", alpha=0.5)
            plt.scatter(test[feature_in], test[plotted_output])
            plt.xlabel(feature_in)
            plt.ylabel(plotted_output)
            plt.show()

    # Building and displaying the Directed Graph
    G = nx.MultiDiGraph()
    G.add_nodes_from(features)
    G.add_edges_from(dependencies)

    # Output features in red, all other nodes in green
    color_map = ['red' if node in features_output else 'green' for node in G]

    nx.draw(G, node_color=color_map, with_labels = True)
    plt.show()

## Automatic Modelling
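Generate all possible DAGs over the selected features.

Note: the number of candidate dependency sets grows exponentially with the number of features, so this becomes computationally slow with more than 3 features.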

In [0]:
# Columns supplied as inputs at prediction time, and columns whose fit is reported
features_input = ['fixed acidity', 'volatile acidity']
features_output = ['pH', 'residual sugar']

# All columns the automatic search may connect with dependencies
features = [
    'fixed acidity',
    'volatile acidity',
    'pH',
    'residual sugar',
]

In [0]:
# Generate all possible dependencies
# Every ordered pair of distinct features is one candidate directed dependency
dependencies = [list(pair) for pair in itertools.permutations(features, 2)]
print("Number of dependencies:", len(dependencies))

In [0]:
# All non-empty combinations of the candidate dependencies
def powerset(iterable):
    """Return an iterator over every non-empty combination of the items.

    Combinations are produced as tuples, grouped by size from 1 up to and
    including the full collection. The empty combination is left out on
    purpose. An empty ``iterable`` produces nothing.
    """
    s = list(iterable)
    min_set_size = 1
    max_set_size = len(s)
    # range() excludes its stop value, so + 1 is needed to reach the full-size combination
    return chain.from_iterable(combinations(s, r) for r in range(min_set_size, max_set_size + 1))
# Materialise the candidates once so they can be counted here and iterated again later
dependency_powerset = [*powerset(dependencies)]
print("Length of dependency powerset:", len(dependency_powerset))

In [0]:
# Keep the dependency sets that Halerium accepts and that mention every feature
required_features = set(features)
dag = []
for dependency_set in dependency_powerset:
    try:
        hal.causal_structure.Dependencies(dependency_set)
    except Exception:
        # Rejected by Halerium (presumably because the set is cyclic; confirm
        # against the Halerium docs). `Exception` rather than a bare except so
        # that an interrupt can still stop this long loop.
        continue

    # Local name only: the global `dependencies` list must stay intact for re-runs
    covered_features = {item for pair in dependency_set for item in pair}

    # If it does not include all features specified
    if covered_features != required_features:
        continue
    dag.append(dependency_set)
print("Number of DAGs that include all features:", len(dag))

In [0]:
# Split once, before the loop: every candidate DAG is then trained and scored on
# the same rows, so the r2 values are comparable, and the `test` frame left behind
# matches the predictions stored in `results`.
data = df[features]
train, test = train_test_split(data, test_size = test_size)
# reset_index() (without drop) renumbers the rows and keeps the old labels in an 'index' column
test_input = test[features_input].reset_index()
test_output = test[features_output].reset_index()

results = []
for dag_dependencies in dag:
    dag_dependencies = list(dag_dependencies)

    causal_structure = hal.CausalStructure(dag_dependencies)
    causal_structure.train(train)

    # One [feature, influence] entry per feature of the structure
    influences = [
        [feature, causal_structure.evaluate_objective(hal.InfluenceEstimator, target=feature)]
        for feature in features
    ]
    evaluation = causal_structure.evaluate_objective(hal.Evaluator, data=test,
                                     inputs=features_input, metric="r2")
    prediction_mean, prediction_std = causal_structure.predict(data=test_input, return_std=True)
    print(evaluation)

    # Entry layout: dependencies, model, influences, evaluation, prediction mean, prediction std
    results.append([dag_dependencies, causal_structure, influences, evaluation, prediction_mean, prediction_std])

### Results

In [0]:
# Labels for the combined frame: the means first, then one ' std' column per feature
std_columns = [feature + ' std' for feature in features]

for feature_out in features_output:
    print("Output Feature:", feature_out)
    # Entry of `results` with the highest r2 for this output feature
    best_r2 = max(results, key= lambda x: x[3][feature_out])
    dependencies, causal_structure, influences, evaluation, prediction_mean, prediction_std = best_r2
    prediction = pd.concat([prediction_mean, prediction_std], axis=1)
    prediction.columns = features + std_columns
    print("r2:", evaluation[feature_out])

    for feature_in in features_input:
        # Sorted copy so the line and the band are drawn left to right
        ordered = prediction.sort_values(by=[feature_in])
        # Separate loop variable: the outer `feature_out` must not be shadowed
        for plotted_output in features_output:
            mean = ordered[plotted_output]
            # Pick the std column by name. Subtracting the whole std frame from the
            # mean frame aligns on column labels ('pH' vs 'pH std'), which yields
            # only NaN and left the uncertainty band invisible.
            std = ordered[plotted_output + ' std']
            plt.plot(ordered[feature_in], mean, color="red")
            plt.fill_between(ordered[feature_in], mean - std, mean + std,
                             color="red", alpha=0.5)
            plt.scatter(test[feature_in], test[plotted_output])
            plt.xlabel(feature_in)
            plt.ylabel(plotted_output)
            plt.show()

    # Building and displaying the Directed Graph
    G = nx.MultiDiGraph()
    G.add_nodes_from(features)
    G.add_edges_from(dependencies)

    # Output features in red, all other nodes in green
    color_map = ['red' if node in features_output else 'green' for node in G]

    nx.draw(G, node_color=color_map, with_labels = True)
    plt.show()

### Copy another template

In [0]:
from cookiecutter.main import cookiecutter
from datetime import datetime

In [ ]:
# Copy a prediction template
# Source repository and the template folder inside it
repo = 'https://github.com/erium/welcome'
directory = 'use_case_templates/prediction'
# Output location, passed as output_dir in the call below
path = './..'
# The call itself is left commented out:
#cookiecutter(repo, directory=directory, output_dir=path, extra_context={'timestamp': str(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))})