In [21]:
from utils import math_utils, sys_utils
import numpy as np
import os
import dci
import yaml
import pandas as pd

# additional imports (just for plotting the difference graph in this notebook, not required for running DCI)
import networkx as nx
import graphviz

# folder where results will be saved; keep the trailing slash because output paths
# are built by plain string concatenation further down in this notebook
SAVE_FOLDER = 'data/real_data/'

# the inputs below must be present in the correct folder
# data corresponding to condition 1 (in this case gene expression of activated CD8+ T-cells)
filename1 = 'data/real_data/input/gene_exp_sarkar_act.csv'
# data corresponding to condition 2 (in this case gene expression of naive CD8+ T-cells)
filename2 = 'data/real_data/input/gene_exp_sarkar_naive.csv'
# undirected Markov difference network (estimated via KLIEP https://arxiv.org/pdf/1304.6803.pdf)
difference_undirected_filename = 'data/real_data/input/diff_graph_sarkar_center_filt0.005_full.csv'

# this notebook will estimate the difference DAG based on the two tuning parameters below for demonstration purposes
# in practice the DCI algorithm should be run across a combination of tuning parameters (which was done in the paper)
# therefore results based on running this notebook are different from what is shown in the paper
# DDAG_SKEL_ALPHA is passed to dci.estimate_ddag_skeleton (skeleton estimation step)
DDAG_SKEL_ALPHA = 0.1
# DDAG_ALPHA is passed to dci.estimate_ddag (edge orientation step)
DDAG_ALPHA = 0.05

In [22]:
def determine_changed_nodes(mn_diff):
    """Determine which nodes have any change from state 1 to state 2.

    The Markov (undirected) difference graph describes how the precision matrix
    changed across the two states: a zero in entry (i, j) means precision(i, j)
    is the same in both states. A node i belongs to the changed node set if
    there exists a node j such that entry (j, i) of ``mn_diff`` is nonzero.

    Parameters
    ----------
    mn_diff : array-like, 2-dimensional
        Difference of the precision matrices of the two states.

    Returns
    -------
    set
        Column indices of ``mn_diff`` that contain at least one nonzero entry.
    """
    mn_diff = np.asarray(mn_diff)
    # Check every entry individually instead of summing each column: a column
    # sum can be zero even though the column has nonzero entries (changes of
    # opposite sign cancel out), which would wrongly drop a changed node.
    has_change = np.any(mn_diff != 0, axis=0)
    return set(np.where(has_change)[0])
    
def read_data(center=True):
    """Load the two expression data sets and the undirected difference network.

    Both CSV files (``filename1`` and ``filename2``) are read with their first
    column as index and then transposed, so the original row labels become the
    columns; the column labels of the first data set are returned as gene names.

    Parameters
    ----------
    center : bool, default True
        If True, subtract the per-column mean from each data set.

    Returns
    -------
    tuple
        ``(X1, X2, est_dug, changed_nodes, gene_names)``: the two data matrices
        as numpy arrays, the set of index pairs returned by
        ``math_utils.upper_tri_ixs_nonzero`` for the difference network, the
        set of changed nodes, and the array of gene names.
    """
    expression_frames = [
        pd.read_csv(path, delimiter=',', index_col=0).T
        for path in (filename1, filename2)
    ]
    # column labels are unaffected by centering, so take them from the raw frame
    gene_names = expression_frames[0].columns.values

    if center:
        expression_frames = [frame - frame.mean(axis=0) for frame in expression_frames]

    # Markov difference network between the two conditions
    markov_diff = np.loadtxt(difference_undirected_filename, delimiter=',')
    # nodes involved in at least one change
    nodes_with_change = determine_changed_nodes(markov_diff)
    # all edges with nonzero precision difference
    undirected_edges = set(math_utils.upper_tri_ixs_nonzero(markov_diff))

    first_condition, second_condition = expression_frames
    return (first_condition.values, second_condition.values,
            undirected_edges, nodes_with_change, gene_names)

def make_graph(skeleton, oriented_edges, gene_names, known_edges=set()):
    """Build a networkx DiGraph of the estimated difference graph for plotting.

    Edges are pairs ``(i, j)`` of indices into ``gene_names``; nodes of the
    returned graph are the corresponding gene names.

    Parameters
    ----------
    skeleton : set of (i, j) pairs
        Edges of the estimated skeleton.
    oriented_edges : set of (i, j) pairs
        Edges with an assigned direction; drawn black with penwidth 3.
    gene_names : sequence
        Node labels, indexed by the integers used in the edge pairs.
    known_edges : set of (i, j) pairs, optional
        Extra edges; those not present in the skeleton in either direction
        are drawn in gray without arrowhead.

    Returns
    -------
    networkx.DiGraph
    """
    # skeleton edges that were not oriented in either direction
    flipped_oriented = {(dst, src) for src, dst in oriented_edges}
    unoriented_edges = skeleton - oriented_edges - flipped_oriented

    graph = nx.DiGraph()
    bold_black = {'color': 'black', 'penwidth': 3}

    for src, dst in oriented_edges:
        graph.add_edge(gene_names[src], gene_names[dst], **bold_black)

    # unoriented edges get the same look but without an arrowhead
    for src, dst in unoriented_edges:
        graph.add_edge(gene_names[src], gene_names[dst], arrowhead='none', **bold_black)

    for src, dst in known_edges - oriented_edges - unoriented_edges:
        # skip known edges that the skeleton already contains in some direction
        if (src, dst) in skeleton or (dst, src) in skeleton:
            continue
        graph.add_edge(gene_names[src], gene_names[dst], arrowhead='none', color='gray')
    return graph

In [23]:
# read in data for DCI algorithm: the two data matrices (centered, since read_data
# centers by default), the edges of the undirected difference graph, the set of
# changed nodes and the gene names
X1, X2, est_dug, changed_nodes, gene_names = read_data()

In [24]:
# estimate the skeleton of the difference DAG, i.e. Algorithm 2 of DCI
# (to limit the number of hypotheses tested limit max_set_size to 3)
print('Estimating skeleton of the difference DAG (running Algorithm 2 of DCI)...')
print('This might take a bit of time...')
# only the first returned value (the retained edges) is used in this notebook
retained_edges, _, _, _ = dci.estimate_ddag_skeleton(X1, X2, est_dug, changed_nodes, DDAG_SKEL_ALPHA, max_set_size = 3, verbose=False)
# save results
RES_FOLDER = SAVE_FOLDER + 'dci_results/'
sys_utils.ensure_dirs([RES_FOLDER])
# write through a context manager so the file is flushed and closed right away
# instead of leaving an open handle behind in the kernel
with open(RES_FOLDER + 'estimated_ddag_skeleton.yaml', 'w') as skeleton_file:
    yaml.dump(retained_edges, skeleton_file, indent=2)

Estimating skeleton of the difference DAG (running Algorithm 2 of DCI)...
This might take a bit of time...


In [25]:
# assign directions to the edges of the estimated skeleton, i.e. Algorithm 3 of DCI
print('Assigning edge directions (running Algorithm 3 of DCI)...')
print('This might take a bit of time...')
est_ddag = dci.estimate_ddag(X1, X2, retained_edges, changed_nodes, DDAG_ALPHA, max_set_size = 3, verbose=False)
# save results; write through a context manager so the file is flushed and closed
# right away instead of leaving an open handle behind in the kernel
with open(RES_FOLDER + 'estimated_ddag.yaml', 'w') as ddag_file:
    yaml.dump(est_ddag, ddag_file, indent=2)

Assigning edge directions (running Algorithm 3 of DCI)...
This might take a bit of time...


In [26]:
print('Plot the graph if desired (requires graphviz, pydot and networkx packages in python)')
# build the plotting graph from the estimated skeleton and the oriented edges
g = make_graph(retained_edges, est_ddag, gene_names)
# write the graph to a DOT file in the results folder ...
fn = RES_FOLDER + 'graph_thres.gv'
nx.nx_pydot.write_dot(g, fn)
# ... and render it to PNG with the 'dot' engine; as the last expression of the
# cell, the value returned by graphviz.render is shown as the cell output
graphviz.render('dot', 'png', fn)

Plot the graph if desired (requires graphviz, pydot and networkx packages in python)


'data/real_data/dci_results/graph_thres.gv.png'