In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import pickle as pkl

from itertools import product
from pgmpy.models import LinearGaussianBayesianNetwork

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _concordance_correlation_coefficient(x, y):
    # https://nirpyresearch.com/concordance-correlation-coefficient/
    sxy = np.sum((x - x.mean())*(y - y.mean()))/x.shape[0]
    rhoc = 2*sxy / (np.var(x) + np.var(y) + (x.mean() - y.mean())**2)
    return rhoc

def _compute_metrics(y_true, y_pred):
    pearson_r_arousal = np.corrcoef(y_true[:, 0], y_pred[:, 0])[0, 1]
    pearson_r_valence = np.corrcoef(y_true[:, 1], y_pred[:, 1])[0, 1]
    ccc_arousal = _concordance_correlation_coefficient(y_true[:, 0], y_pred[:, 0])
    ccc_valence = _concordance_correlation_coefficient(y_true[:, 1], y_pred[:, 1])
    return pearson_r_arousal, pearson_r_valence, ccc_arousal, ccc_valence

In [3]:
def bayesian_network_from_dag(dag_file_path:str, method:str, train_set:pd.DataFrame):
    # open the DAG in binary read mode
    with open(dag_file_path, "rb") as file:
        dag_data = pkl.load(file)

    graph_matrix = (
        dag_data["G"].graph if method == "ges" 
        else dag_data[0].graph if method in ["fci", "pruned_fci"]
        else dag_data.G.graph if method in ["pc", "pruned_pc"]
        else dag_data if method == "avg"
        else None
    )

    # extract edges from the learned DAG
    num_nodes = graph_matrix.shape[0]
    edges = []

    # iterate through all node pairs to determine edge types
    for i, j in product(range(num_nodes), range(num_nodes)):
        if graph_matrix[i, j] == -1 and graph_matrix[j, i] == 1:  # fully directed edge i --> j
            edges.append((str(i), str(j)))
        # elif graph_matrix[i, j] == graph_matrix[j, i] == 1:  # bidirected edge i <-> j
        #     edges.append((str(i), str(j)))
        #     edges.append((str(j), str(i)))
        # elif graph_matrix[i, j] == 2 and graph_matrix[j, i] == 1:  # i o-> j partially directed
        #     edges.append((str(i), str(j)))
            # partially_directed_edges_count += 1

    assert nx.is_directed_acyclic_graph(nx.DiGraph(edges)), "Graph is not a DAG."

    # train a LGBN on the corresponding participant's data
    lgbn_model = LinearGaussianBayesianNetwork(edges)
    lgbn_model.fit(train_set)

    return lgbn_model

In [4]:
# make sure to pass into `models` all but one model
def make_predictions(models: dict, test_set: pd.DataFrame) -> tuple[pd.DataFrame, dict]:
    clean_test_set = test_set.drop("Participant", axis=1).copy()
    rename_dict = {
        "median_arousal": "0",
        "median_valence": "1",
    }
    rename_dict.update({f"PC{i}": str(i + 1) for i in range(1, test_set.shape[1] - 2)})
    clean_test_set = clean_test_set.rename(columns=rename_dict).reset_index(drop=True)

    # splitting into labels and features
    y_test = clean_test_set[["0", "1"]].values
    X_test = clean_test_set.drop(["0", "1"], axis=1)

    all_results = []
    y_pred_arousal_ensemble = []
    y_pred_valence_ensemble = []
    
    for model_id, model in models.items():
        try:
            y_pred = model.predict(X_test)

            # NOTE:
            # it may be the case that the model (graph) does not contain
            # all the nodes contained in the training/test data, making it
            # impossible to make predictions. Here I am catering for such
            # cases by filling up missing predictions with `np.nan`.
            if len(y_pred[0]) == 1:
                y_pred[0].append('1' if '0' in y_pred[0] else '0')
                y_pred[1] = np.hstack((y_pred[1], np.full((len(y_pred[1]), 1), np.nan)))
        except Exception as e:
            # in case of failure, initialize fallback values
            y_pred = [None, None]
            y_pred[0] = ["1", "0"]
            y_pred[1] = np.full((len(X_test), 2), np.nan)

        predicted_means = y_pred[1]

        arousal_idx = y_pred[0].index("0")
        valence_idx = y_pred[0].index("1")

        y_pred_arousal = predicted_means[:, arousal_idx]
        y_pred_valence = predicted_means[:, valence_idx]

        # storing all predictions
        temp_df = test_set[["Participant", "median_arousal", "median_valence"]].copy()
        temp_df[f"prd_lgbn_{model_id}_arousal"] = y_pred_arousal
        temp_df[f"prd_lgbn_{model_id}_valence"] = y_pred_valence
        all_results.append(temp_df)

        # append predictions for ensemble calculation
        y_pred_arousal_ensemble.append(y_pred_arousal)
        y_pred_valence_ensemble.append(y_pred_valence)
    
    # NOTE: use nanmean? this ignores nan from computation
    y_pred_arousal_ensemble = np.nanmean(y_pred_arousal_ensemble, axis=0)
    y_pred_valence_ensemble = np.nanmean(y_pred_valence_ensemble, axis=0)

    all_predictions_df = pd.concat(all_results, axis = 1)
    all_predictions_df = all_predictions_df.loc[:, ~all_predictions_df.columns.duplicated()]

    # storing ensemble predictions
    temp_df = test_set[["Participant", "median_arousal", "median_valence"]].copy()
    temp_df[f"prd_lgbn_ensemble_arousal"] = y_pred_arousal_ensemble
    temp_df[f"prd_lgbn_ensemble_valence"] = y_pred_valence_ensemble

    participant_predictions_df = pd.concat([all_predictions_df, temp_df], axis=1)
    participant_predictions_df = participant_predictions_df.loc[:, ~participant_predictions_df.columns.duplicated()]

    # compute evaluation metrics
    (
        lgbn_pearson_r_arousal,
        lgbn_pearson_r_valence,
        lgbn_ccc_arousal,
        lgbn_ccc_valence
    ) = _compute_metrics(y_test, np.column_stack((y_pred_arousal_ensemble, y_pred_valence_ensemble)))

    evaluations = {
        "pearson_r_arousal": lgbn_pearson_r_arousal,
        "pearson_r_valence": lgbn_pearson_r_valence,
        "ccc_arousal": lgbn_ccc_arousal,
        "ccc_valence": lgbn_ccc_valence
    }

    return participant_predictions_df, evaluations

In [5]:
data = {
    "visual": {
        "ges": [16,19,23,25,26,28,30,34,37,39,42,45,46,56,64,65], # missing 21, 41
        "pruned_fci": [16,19,21,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65],
        "pruned_pc": [16,19,21,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65],
    },
    "audio": {
        "ges": [16,19,21,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65],
        "pruned_fci": [16,19,21,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65],
        "pruned_pc": [16,19,21,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65],
    },
    "physio": {
        "ges": [16,19,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65], # missing 21
        "pruned_fci": [16,19,21,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65],
        "pruned_pc": [16,19,21,23,25,26,28,30,34,37,39,41,42,45,46,56,64,65],
    },
}

all_evals = []
all_modalities_results = []
for modality, v in data.items():
    all_methods_results = []
    for method, dag_participant_ids in v.items():
        print(f"Processing {modality}, {method}...")
        modality_df = pd.read_csv(f"../data/subset_{modality}_pca.csv")
        participants = modality_df["Participant"].unique()
        participants.sort()

        rename_dict = {
                "median_arousal": "0",
                "median_valence": "1",
            }
        rename_dict.update({f"PC{i}": str(i + 1) for i in range(1, modality_df.shape[1] - 2)})

        # getting all LGBN models for each modality-method combination
        model_bank = {}
        for dag_id in dag_participant_ids:
            # filtering data for relevant participant
            train_set = modality_df[modality_df["Participant"] == dag_id]
            train_set = train_set.drop("Participant", axis=1)
            train_set = train_set.rename(columns=rename_dict).reset_index(drop=True)

            # getting path to relevant DAG
            dag_file_path = f"../results_dag/{modality}/{method}_dag_participant_{dag_id}.pkl"

            model_bank[dag_id] = bayesian_network_from_dag(
                dag_file_path,
                method,
                train_set
            )

        # generate predictions per participant
        all_participants_results = []
        for participant in participants:
            # generating model from average graph
            train_set_avg_graph = modality_df[modality_df["Participant"] != participant]
            train_set_avg_graph = train_set_avg_graph.drop("Participant", axis=1)
            train_set_avg_graph = train_set_avg_graph.rename(columns=rename_dict).reset_index(drop=True)

            avg_dag_file_path = f"../results_dag/{modality}/avg_{method}_dag.pkl"
            average_graph_model = bayesian_network_from_dag(
                avg_dag_file_path,
                "avg",
                train_set_avg_graph
            )

            # considering all models except for current participant's
            models_to_consider = {k: v for k, v in model_bank.items() if k != participant}
            test_set = modality_df[modality_df["Participant"] == participant]

            participant_predictions_df, evaluations = make_predictions(models_to_consider, test_set)

            # generating predictions using average graph
            avg_dag_participant_predictions_df, avg_dag_evaluations = make_predictions(
                {"avg": average_graph_model},
                test_set
            )
            
            # ensemble results are redundant here
            avg_dag_participant_predictions_df = avg_dag_participant_predictions_df.drop(
                ["Participant", "median_arousal", "median_valence", "prd_lgbn_ensemble_arousal", "prd_lgbn_ensemble_valence"], 
                axis=1
            )
            
            evaluations = {
                f"{modality}_{method}_" + key: value 
                for key, value 
                in evaluations.items()
            }
            evaluations["participant"] = participant
            all_evals.append(evaluations)

            avg_dag_evaluations = {
                f"avg_{modality}_{method}_" + key: value 
                for key, value 
                in avg_dag_evaluations.items()
            }
            avg_dag_evaluations["participant"] = participant
            all_evals.append(avg_dag_evaluations)
            
            participant_predictions_df.columns = participant_predictions_df.columns.str.replace(
                "prd",
                f"{modality}_{method}"
            )
            avg_dag_participant_predictions_df.columns = avg_dag_participant_predictions_df.columns.str.replace(
                "prd",
                f"avg_{modality}_{method}"
            )

            all_participants_results.append(pd.concat([participant_predictions_df, avg_dag_participant_predictions_df], axis=1))
        all_methods_results.append(pd.concat(all_participants_results, axis=0))
    all_methods_results_df = pd.concat(all_methods_results, axis=1)
    all_methods_results_df = all_methods_results_df.loc[:, ~all_methods_results_df.columns.duplicated()]
    all_modalities_results.append(all_methods_results_df)

all_predicitons_df = pd.concat(all_modalities_results, axis=1)
all_predicitons_df = all_predicitons_df.loc[:, ~all_predicitons_df.columns.duplicated()]
all_predicitons_df.to_csv("../results/data_with_predictions.csv", index=False)

all_evals_df = pd.DataFrame(all_evals)
all_evals_df = all_evals_df.groupby("participant").first()
all_evals_df.to_csv("../results/lgbn_ensemble_evaluations.csv")

Processing visual, ges...
Processing visual, pruned_fci...
Processing visual, pruned_pc...
Processing audio, ges...
Processing audio, pruned_fci...
Processing audio, pruned_pc...
Processing physio, ges...
Processing physio, pruned_fci...


  y_pred_arousal_ensemble = np.nanmean(y_pred_arousal_ensemble, axis=0)
  y_pred_valence_ensemble = np.nanmean(y_pred_valence_ensemble, axis=0)


Processing physio, pruned_pc...
