# Baseline submission with the PC algorithm

In [1]:
%env API_BASE_URL=http://api.hub.crunchdao.io
%env WEB_BASE_URL=http://hub.crunchdao.io

env: API_BASE_URL=http://api.hub.crunchdao.io
env: WEB_BASE_URL=http://hub.crunchdao.io


In [None]:
# update the token via https://hub.crunchdao.io/competitions/causality-discovery/submit/via/notebook

!crunch setup causality-discovery . --token aaaabbbbccccddddeeeeffff --force

In [None]:
!pip install gcastle

In [1]:
"""
This is a basic example of what you need to do to participate to the tournament.
The code will not have access to the internet (or any socket related operation).
"""

import os
import typing

import joblib
import crunch
import pandas as pd
import numpy as np
import networkx as nx
import castle.algorithms
import torch
from tqdm import tqdm

2024-03-20 12:11:54,052 - c:\Users\cacer\AppData\Local\Programs\Python\Python310\lib\site-packages\castle\backend\__init__.py[line:36] - INFO: You can use `os.environ['CASTLE_BACKEND'] = backend` to set the backend(`pytorch` or `mindspore`).
2024-03-20 12:11:54,264 - c:\Users\cacer\AppData\Local\Programs\Python\Python310\lib\site-packages\castle\algorithms\__init__.py[line:36] - INFO: You are using ``pytorch`` as the backend.


In [2]:
crunch = crunch.load_notebook()

loaded inline runner with module: <module '__main__'>


The following function is provided to help you obtaining a DAG from your predicted graph, in case it is not a DAG, also ensuring that there is an edge from X to Y, as designed. This is just one way to obtain such result and not necessarily optimal for the competition. An improved algorithm to obtain a DAG from your predicted graph could lead to better scores.

In [3]:
def fix_DAG(g):
    """
    Ensure that the graph is a DAG and has an edge X→Y

    We look for cycles, and remove an edge in each cycle, until there are no cycles left.

    Inputs: g: nx.DiGraph
    Output: g: nx.DiGraph

    This function provides just a possible solution to the problem
    of DAG-ifying a graph. Other solutions can be conceived that could
    be better for the competition.
    """

    assert 'X' in g.nodes
    assert 'Y' in g.nodes

    gg = g.copy()

    # Add X→Y if it is missing
    if ('X', 'Y') not in gg.edges:
        gg.add_edge( 'X', 'Y' )

    # Look for cycles and remove them
    while not nx.is_directed_acyclic_graph(gg):

        h = gg.copy()

        # Remove all the sources and sinks
        while True:
            finished = True
            for i,v in nx.in_degree_centrality(h).items():
                if v == 0:
                    h.remove_node(i)
                    finished = False
            for i,v in nx.out_degree_centrality(h).items():
                if v == 0:
                    h.remove_node(i)
                    finished = False
            if finished:
                break

        # Find a cycle, with a random walk starting at a random node
        node = list( h.nodes )[0]
        cycle = [node]
        while True:
            edges = list( h.out_edges(node) )
            _, node = edges[ np.random.choice( len(edges) ) ]
            if node in cycle:
                break
            cycle.append( node )

        # We have a path that ends with a cycle: remove the begining, if it is not part of the cycle
        cycle = np.array(cycle)
        i = np.argwhere( cycle == node )[0][0]
        cycle = cycle[i:]
        cycle = cycle.tolist() + [node]

        # Edges in that cycle
        edges = list( zip( cycle[:-1], cycle[1:] ) )

        # Pick an edge at random, but make sure it is not X→Y -- we want to keep that one
        edges = [ e for e in edges if e != ('X', 'Y') ]
        edge = edges[ np.random.choice( len(edges) ) ]

        gg.remove_edge( *edge )

    return gg

This is the core of the solution's code, that reads one dataset at the time, applies the PC algorithm, ensures that the result is a DAG, and then put the result in a single dataframe with the required format, ready for being submitted.

In [4]:
# Uncomment what you need!
def train(
    X_train: typing.Dict[str, pd.DataFrame],
    y_train: typing.Dict[str, pd.DataFrame],
    # number_of_features: int,
    model_directory_path: str,
    # id_column_name: str,
    # prediction_column_name: str,
    # has_gpu: bool,
    # has_trained: bool,
) -> None:
    model = ...
    joblib.dump(
        model,
        os.path.join(model_directory_path, "model.joblib")
    )

In [5]:
# Uncomment what you need!
def infer(
    X_test: typing.Dict[str, pd.DataFrame],
    # number_of_features: int,
    model_directory_path: str,
    id_column_name: str,
    prediction_column_name: str,
    # has_gpu: bool,
    # has_trained: bool,
) -> pd.DataFrame:
    # model = joblib.load(os.path.join(model_directory_path, "model.joblib"))

    submission_file = {}
    for dataset_id in tqdm(X_test):
        print(dataset_id)
        X = X_test[dataset_id]

        nodes = X.columns
        model = castle.algorithms.PC()
        model.learn(X)

        A_hat = pd.DataFrame(model.causal_matrix, columns=nodes, index=nodes)
        g_hat = nx.from_pandas_adjacency(A_hat, create_using=nx.DiGraph)
        g_hat = fix_DAG(g_hat)

        G = pd.DataFrame(nx.to_numpy_array(g_hat).astype(int), columns=nodes, index=nodes)
        for i in nodes:
            for j in nodes:
                submission_file[f'{dataset_id}_{i}_{j}'] = int(G.loc[i, j])

    submission_file = pd.Series(submission_file)
    submission_file = submission_file.reset_index()
    submission_file.columns = [id_column_name, prediction_column_name]

    return submission_file

In [7]:
crunch.test()

[32m12:13:12[0m [33mno forbidden library found[0m
[32m12:13:12[0m [33m[0m
[32m12:13:12[0m started
[32m12:13:12[0m running local test
[32m12:13:12[0m [33minternet access isn't restricted, no check will be done[0m
[32m12:13:12[0m 
[32m12:13:13[0m starting dag process...
[32m12:13:13[0m [33mcall: train[0m
[32m12:13:13[0m [33mcall: infer[0m


download data\X_train.pickle from https://datacrunch-com.s3.eu-west-1.amazonaws.com/development/adia-tournament/data-releases/32/X_train.pickle (7591133 bytes)
already exists: file length match
download data\y_train.pickle from https://datacrunch-com.s3.eu-west-1.amazonaws.com/development/adia-tournament/data-releases/32/y_train.pickle (98523 bytes)
already exists: file length match
download data\X_test.pickle from https://datacrunch-com.s3.eu-west-1.amazonaws.com/development/adia-tournament/data-releases/32/X_test_reduced.pickle (329528 bytes)
already exists: file length match
download data\y_test.pickle from https://datacrunch-com.s3.eu-west-1.amazonaws.com/development/adia-tournament/data-releases/32/y_test_reduced.pickle (4935 bytes)
already exists: file length match
download data\example_prediction.parquet from https://datacrunch-com.s3.eu-west-1.amazonaws.com/development/adia-tournament/data-releases/32/example_prediction_reduced.parquet (3939 bytes)
already exists: file length m

  0%|          | 0/5 [00:00<?, ?it/s]

0397
0399


100%|██████████| 5/5 [00:00<00:00, 12.67it/s]
[32m12:13:13[0m [33msave prediction - path=data\prediction.csv[0m
[32m12:13:13[0m ended
[32m12:13:13[0m [33mduration - time=00:00:01[0m
[32m12:13:13[0m [33mmemory - before="334.20 MB" after="339.89 MB" consumed="5.69 MB"[0m


0401
0405
0408
