In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)

from collections import OrderedDict

import sys
sys.path.append("..")
import utilities, ontology

# Load brain and text data

In [2]:
# Brain activation coordinates, cast to float. The summary line treats rows
# as documents and columns as brain structures.
act_bin = utilities.load_coordinates().astype(float)
print("Document N={}, Structure N={}".format(*act_bin.shape))

Document N=18155, Structure N=118


In [3]:
# Binarized document-term matrix for the given lexicon version. The summary
# line treats rows as documents and columns as terms.
version = 190325
dtm_bin = utilities.load_doc_term_matrix(version=version, binarize=True)
print("Document N={}, Term N={}".format(*dtm_bin.shape))

Document N=18155, Term N=4107


In [4]:
# Loader arguments are kept as parallel lists, one entry per framework.
frameworks = ["data-driven", "rdoc", "dsm"]
list_suffixes = ["", "_opsim", "_opsim"]
circuit_suffixes = ["_nn", "", ""]

# For each framework, load_framework returns a pair: the term list and the circuits.
lists, circuits = {}, {}
for fw, list_suffix, circuit_suffix in zip(frameworks, list_suffixes, circuit_suffixes):
    loaded = utilities.load_framework(fw, suffix=list_suffix, clf=circuit_suffix)
    lists[fw], circuits[fw] = loaded

In [5]:
# Score every framework's term lists against the document-term matrix,
# labelling the result by the DOMAIN column.
scores = {}
for fw in frameworks:
    scores[fw] = utilities.score_lists(lists[fw], dtm_bin, label_var="DOMAIN")

In [6]:
# Vocabulary: sorted union of the tokens listed by any framework.
words = sorted({token for fw in frameworks for token in lists[fw]["TOKEN"]})
# Structures: the columns of the activation matrix, in their existing order.
structures = list(act_bin.columns)
print("Term N={}, Structure N={}".format(len(words), len(structures)))

Term N=262, Structure N=118


In [7]:
# Unique domain names per framework, kept in order of first appearance.
domains = {}
for fw in frameworks:
    domains[fw] = list(OrderedDict.fromkeys(lists[fw]["DOMAIN"]))

In [8]:
# Keep only documents present in the activation data AND in the scores of
# every framework. The original intersection skipped "data-driven", although
# the scores of all frameworks are subsequently indexed with these ids.
pmids = act_bin.index
for fw in frameworks:
    pmids = pmids.intersection(scores[fw].index)
len(pmids)

18155

In [9]:
# Restrict each framework's scores to the shared documents, in `pmids` order.
scores.update({fw: scores[fw].loc[pmids] for fw in frameworks})

In [10]:
# Align both matrices on the shared documents; keep only the framework
# vocabulary (terms) and the known structures (activations) as columns.
dtm_bin, act_bin = dtm_bin.loc[pmids, words], act_bin.loc[pmids, structures]

# Load frameworks

In [11]:
# Build one membership matrix per framework: rows are all terms followed by
# all structures, columns are the framework's domains.
systems = {}
for fw in frameworks:
    fw_df = pd.DataFrame(0.0, index=words+structures, columns=domains[fw])
    for dom in domains[fw]:
        # Terms listed under this domain are marked as members.
        in_domain = lists[fw]["DOMAIN"] == dom
        fw_df.loc[list(lists[fw].loc[in_domain, "TOKEN"]), dom] = 1.0
        # Structure rows take the circuit values for this domain.
        fw_df.loc[structures, dom] = circuits[fw].loc[structures, dom].values
    # Binarize: any strictly positive entry becomes 1.0, others are untouched.
    fw_df = fw_df.mask(fw_df > 0.0, 1.0)
    systems[fw] = fw_df

# Similarity of RDoC and data-driven systems

## Observed values

In [12]:
from scipy.spatial.distance import dice, cdist

In [13]:
def compute_sim_obs(fw):
    """Compute observed Dice similarity between data-driven and `fw` systems.

    Each domain is represented by its column in the global `systems` frames.
    The similarity of two domains is 1 minus the SciPy Dice dissimilarity of
    their columns.

    Parameters
    ----------
    fw : str
        Key into the global `domains` and `systems` dicts.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by the data-driven domains, with the domains of
        `fw` as columns.
    """
    # Allocate as float: without an explicit dtype the empty frame is
    # object-dtype, and the stored similarities would stay Python objects.
    sims = pd.DataFrame(index=domains["data-driven"], columns=domains[fw], dtype=float)
    for k in domains["data-driven"]:
        for r in domains[fw]:
            sims.loc[k, r] = 1.0 - dice(systems["data-driven"][k], systems[fw][r])
    return sims

In [14]:
# Observed similarity of every data-driven domain to every RDoC domain.
sims = compute_sim_obs(fw="rdoc")
sims

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
MEMORY,0.305882,0.05,0.0,0.295455,0.268293,0.0
REWARD,0.365854,0.545455,0.0,0.0705882,0.202532,0.0
COGNITION,0.311111,0.282353,0.108696,0.0430108,0.16092,0.0222222
MANIPULATION,0.0,0.0,0.480769,0.0,0.0,0.666667
VISION,0.0449438,0.0,0.21978,0.23913,0.0232558,0.0
LANGUAGE,0.0808081,0.0,0.0,0.156863,0.145833,0.121212


## Null distribution

In [15]:
def compute_sim_null(fw, n_iter=1000):
    """Build a permutation null distribution of Dice similarities.

    On each iteration the rows of the data-driven system matrix are reordered
    by a random permutation of the term and structure labels. The reordered
    matrix is then compared against the unpermuted `fw` system matrix.

    Parameters
    ----------
    fw : str
        Key into the global `domains` and `systems` dicts.
    n_iter : int
        Number of permutations to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape (data-driven domains, `fw` domains, n_iter).
    """
    labels = words + structures
    dd_system = systems["data-driven"]
    fw_matrix = systems[fw].values.T
    sims_null = np.empty((len(domains["data-driven"]), len(domains[fw]), n_iter))
    # Progress is reported whenever the iteration index is a multiple of this step.
    report_step = float(n_iter) / 10.0
    for n in range(n_iter):
        shuffled = np.random.choice(labels, size=len(labels), replace=False)
        dists = cdist(dd_system.loc[shuffled].values.T, fw_matrix, metric="dice")
        sims_null[:, :, n] = 1.0 - dists
        if n % report_step == 0:
            print("Iteration {}".format(n))
    return sims_null

In [16]:
# Permutation null for the RDoC comparison (default number of iterations).
sims_null = compute_sim_null(fw="rdoc")

Iteration 0
Iteration 100
Iteration 200
Iteration 300
Iteration 400
Iteration 500
Iteration 600
Iteration 700
Iteration 800
Iteration 900


## False discovery rate

In [17]:
from statsmodels.stats.multitest import multipletests

In [18]:
def compute_sim_fdr(fw, sims, sims_null):
    """Convert observed similarities into Benjamini-Hochberg adjusted p-values.

    The empirical p-value of each domain pair is the fraction of null
    similarities strictly greater than the observed similarity. All p-values
    are then adjusted together with the "fdr_bh" method.

    Parameters
    ----------
    fw : str
        Key into the global `domains` dict; selects the column domains.
    sims : pandas.DataFrame
        Observed similarities, indexed by data-driven domains with `fw`
        domains as columns.
    sims_null : numpy.ndarray
        Null similarities of shape (data-driven domains, `fw` domains, n_iter).

    Returns
    -------
    pandas.DataFrame
        Float frame of adjusted p-values with the same labels as `sims`.
    """
    rows, cols = domains["data-driven"], domains[fw]
    n_iter = sims_null.shape[2]
    # Work on a plain float array: an untyped DataFrame would hold the p-values
    # as Python objects and hand an object array to multipletests.
    observed = sims.loc[rows, cols].values.astype(float)
    # NOTE(review): with the strict ">" and no +1 correction an empirical
    # p-value can be exactly 0; kept as in the original analysis.
    exceed = sims_null > observed[:, :, np.newaxis]
    pvals = np.sum(exceed, axis=2) / float(n_iter)
    fdrs = multipletests(pvals.ravel(), method="fdr_bh")[1]
    return pd.DataFrame(np.reshape(fdrs, pvals.shape), index=rows, columns=cols)

In [19]:
# FDR-adjusted p-values for the RDoC comparison.
fdrs = compute_sim_fdr(fw="rdoc", sims=sims, sims_null=sims_null)
fdrs

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
MEMORY,0,1,1.0,0.0,0.0,1.0
REWARD,0,0,1.0,1.0,0.0163636,1.0
COGNITION,0,0,0.864,1.0,0.221538,1.0
MANIPULATION,1,1,0.0,1.0,1.0,0.0
VISION,1,1,0.018,0.0036,1.0,1.0
LANGUAGE,1,1,1.0,0.491143,0.5352,0.946588


In [20]:
def compute_sim_star(fw, fdrs):
    """Map adjusted p-values to significance stars.

    Parameters
    ----------
    fw : str
        Key into the global `domains` dict; selects the column domains.
    fdrs : pandas.DataFrame
        Adjusted p-values indexed by data-driven domains with `fw` domains
        as columns.

    Returns
    -------
    pandas.DataFrame
        Frame of strings: "***" below 0.001, "**" below 0.01, "*" below 0.05,
        and "" otherwise.
    """
    stars = pd.DataFrame("", index=domains["data-driven"], columns=domains[fw])
    for k in domains["data-driven"]:
        for r in domains[fw]:
            fdr = fdrs.loc[k, r]
            # Test the strictest threshold first so each cell is written once.
            if fdr < 0.001:
                label = "***"
            elif fdr < 0.01:
                label = "**"
            elif fdr < 0.05:
                label = "*"
            else:
                continue
            stars.loc[k, r] = label
    return stars

In [21]:
# Significance stars for the RDoC comparison.
stars = compute_sim_star(fw="rdoc", fdrs=fdrs)
stars

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
MEMORY,***,,,***,***,
REWARD,***,***,,,*,
COGNITION,***,***,,,,
MANIPULATION,,,***,,,***
VISION,,,*,**,,
LANGUAGE,,,,,,


## Weights for figure

In [22]:
# Keep similarities with FDR < 0.05 (others become NaN) and scale by 40.
sims.where(fdrs < 0.05) * 40

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
MEMORY,12.2353,,,11.8182,10.7317,
REWARD,14.6341,21.8182,,,8.10127,
COGNITION,12.4444,11.2941,,,,
MANIPULATION,,,19.2308,,,26.6667
VISION,,,8.79121,9.56522,,
LANGUAGE,,,,,,


# Similarity of DSM and data-driven systems

## Observed values

In [23]:
# Observed similarity of every data-driven domain to every DSM domain.
sims = compute_sim_obs(fw="dsm")
sims

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,DISRUPTIVE,SUBSTANCE
MEMORY,0.0,0.0,0.097561,0.0851064,0.291667,0.0,0.380952,0.0,0.0
REWARD,0.0,0.0,0.105263,0.0454545,0.0,0.162162,0.0512821,0.35,0.157895
COGNITION,0.0246914,0.04,0.0869565,0.0769231,0.150943,0.0888889,0.0425532,0.0416667,0.0
MANIPULATION,0.236559,0.193548,0.0344828,0.0,0.0,0.0,0.0,0.0,0.0344828
VISION,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LANGUAGE,0.2,0.101695,0.0,0.0,0.0,0.037037,0.0,0.0,0.0


## Null distribution

In [24]:
# Permutation null for the DSM comparison (default number of iterations).
sims_null = compute_sim_null(fw="dsm")

Iteration 0
Iteration 100
Iteration 200
Iteration 300
Iteration 400
Iteration 500
Iteration 600
Iteration 700
Iteration 800
Iteration 900


## False discovery rate

In [25]:
# FDR-adjusted p-values for the DSM comparison.
fdrs = compute_sim_fdr(fw="dsm", sims=sims, sims_null=sims_null)
fdrs

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,DISRUPTIVE,SUBSTANCE
MEMORY,0.983,0.977192,0.27,0.529875,0.0,0.977192,0.0,0.977192,0.977192
REWARD,0.978113,0.977192,0.27,0.977192,0.977192,0.0231429,0.693,0.0,0.10125
COGNITION,0.977192,0.977192,0.4464,0.693,0.144,0.290769,0.977192,0.977192,0.977192
MANIPULATION,0.0231429,0.0108,0.977192,0.977192,0.977192,0.977192,0.977192,0.977192,0.977192
VISION,0.0,0.977192,0.977192,0.977192,0.977192,0.977192,0.977192,0.977192,0.977192
LANGUAGE,0.1566,0.4464,0.977192,0.977192,0.977192,0.977192,0.977192,0.977192,0.977192


In [26]:
# Significance stars for the DSM comparison.
stars = compute_sim_star(fw="dsm", fdrs=fdrs)
stars

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,DISRUPTIVE,SUBSTANCE
MEMORY,,,,,***,,***,,
REWARD,,,,,,*,,***,
COGNITION,,,,,,,,,
MANIPULATION,*,*,,,,,,,
VISION,***,,,,,,,,
LANGUAGE,,,,,,,,,


## Weights for figure

In [27]:
# Keep similarities with FDR < 0.05 (others become NaN) and scale by 40.
sims.where(fdrs < 0.05) * 40

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,DISRUPTIVE,SUBSTANCE
MEMORY,,,,,11.6667,,15.2381,,
REWARD,,,,,,6.48649,,14.0,
COGNITION,,,,,,,,,
MANIPULATION,9.46237,7.74194,,,,,,,
VISION,12.0,,,,,,,,
LANGUAGE,,,,,,,,,
