In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
from collections import OrderedDict
from utilities import *

# Load brain and text data

In [2]:
# Activation matrix cast to float; per the printed labels, rows are documents
# and columns are brain structures.
act_bin = load_coordinates().astype(float)
print("Document N={}, Structure N={}".format(*act_bin.shape))

Document N=18155, Structure N=114


In [3]:
# Version tag passed to the document-term matrix loader
version = 190325
# Binarized document-term matrix; per the printed labels, rows are documents
# and columns are terms.
dtm_bin = load_doc_term_matrix(binarize=True, version=version)
print("Document N={}, Term N={}".format(*dtm_bin.shape))

Document N=18155, Term N=4107


In [4]:
# Framework keys used throughout, and the matching file-name suffixes of their word lists
frameworks = ["data-driven", "rdoc", "dsm"]
files = ["data-driven", "rdoc_opsim", "dsm_opsim"]
# One word-list table per framework, read from lists/lists_<suffix>.csv
lists = dict(
    (fw, pd.read_csv("lists/lists_{}.csv".format(suffix)))
    for fw, suffix in zip(frameworks, files)
)

In [5]:
# One circuit table per framework, read from circuits/circuits_<framework>.csv
# with the first CSV column as the row index.
circuits = dict(
    (fw, pd.read_csv("circuits/circuits_{}.csv".format(fw), index_col=0))
    for fw in frameworks
)

In [6]:
# Score every document against each framework's word lists, grouped by the DOMAIN column
scores = dict(
    (fw, score_lists(lists[fw], dtm_bin, label_var="DOMAIN"))
    for fw in frameworks
)

In [7]:
# Union of the TOKEN columns of all frameworks, de-duplicated and sorted
words = set()
for fw in frameworks:
    words.update(lists[fw]["TOKEN"])
words = sorted(words)
# Structure names are the columns of the activation matrix
structures = act_bin.columns.tolist()
print("Term N={}, Structure N={}".format(len(words), len(structures)))

Term N=297, Structure N=114


In [8]:
# Domain names per framework, de-duplicated while keeping first-appearance order
domains = {}
for fw in frameworks:
    domains[fw] = list(OrderedDict.fromkeys(lists[fw]["DOMAIN"]))

In [9]:
# Documents shared by the activation matrix and the score table of EVERY framework.
# The data-driven scores are included as well, because the next step indexes
# scores[fw] with pmids for all frameworks, not only RDoC and DSM.
pmids = act_bin.index
for fw in frameworks:
    pmids = pmids.intersection(scores[fw].index)
len(pmids)

18155

In [10]:
# Keep only the shared documents (pmids) in every framework's score table
scores.update({fw: scores[fw].loc[pmids] for fw in frameworks})

In [11]:
# Restrict the term matrix and the activation matrix to the shared documents (pmids),
# keeping only the framework terms (words) and the structure columns (structures).
dtm_bin = dtm_bin.loc[pmids, words]
act_bin = act_bin.loc[pmids, structures]

# Load frameworks

In [12]:
# Build one membership matrix per framework. Rows are all terms followed by all
# brain structures; columns are the framework's domains. Every positive entry is
# set to 1.0 at the end, so term memberships and positive circuit weights are
# treated alike.
systems = {}
for fw in frameworks:
    fw_df = pd.DataFrame(0.0, index=words+structures, columns=domains[fw])
    # Terms: flag all tokens listed under a domain in a single assignment
    for dom in domains[fw]:
        tokens = lists[fw].loc[lists[fw]["DOMAIN"] == dom, "TOKEN"]
        fw_df.loc[list(tokens), dom] = 1.0
    # Structures: copy the circuit values as one block; .values bypasses label
    # alignment because rows and columns are already selected in matching order
    fw_df.loc[structures, domains[fw]] = circuits[fw].loc[structures, domains[fw]].values
    # Binarize: any positive value becomes 1.0
    fw_df[fw_df > 0.0] = 1.0
    systems[fw] = fw_df

# Similarity of RDoC and data-driven systems

## Observed values

In [13]:
from scipy.spatial.distance import dice, cdist

In [14]:
def compute_sim_obs(fw):
    """Observed Dice similarity between data-driven domains and the domains of `fw`.

    Reads the module-level `domains` and `systems` dictionaries.

    Parameters
    ----------
    fw : str
        Key of the framework to compare against (a key of `domains` and `systems`).

    Returns
    -------
    pandas.DataFrame
        Float frame indexed by the data-driven domains with one column per
        domain of `fw`; each entry is 1 minus the Dice dissimilarity of the two
        system columns.
    """
    rows, cols = domains["data-driven"], domains[fw]
    # A single cdist call (the same routine the permutation null uses) keeps the
    # observed and null statistics consistent and yields a float array instead
    # of an object-dtype frame filled cell by cell.
    dists = cdist(systems["data-driven"][rows].values.T,
                  systems[fw][cols].values.T, metric="dice")
    return pd.DataFrame(1.0 - dists, index=rows, columns=cols)

In [15]:
# Observed similarity of each data-driven domain (rows) to each RDoC domain (columns)
sims = compute_sim_obs("rdoc")
sims

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.225,0.111111,0.0,0.206897,0.222222,0.0
ANTICIPATION,0.27027,0.424242,0.0,0.0246914,0.0,0.0322581
COGNITION,0.361905,0.412371,0.0576923,0.107143,0.113636,0.0215054
VISION,0.0,0.0,0.259259,0.172414,0.0217391,0.0412371
MANIPULATION,0.0,0.0,0.457143,0.0,0.0,0.574468
MEANING,0.129032,0.0,0.0217391,0.22,0.0789474,0.0
LANGUAGE,0.08,0.0217391,0.0,0.11215,0.0481928,0.204545


## Null distribution

In [16]:
def compute_sim_null(fw, n_iter=10000):
    """Permutation null of the Dice similarity between data-driven and `fw` domains.

    On every iteration the rows (terms and structures) of the data-driven
    systems are shuffled and the similarity to the unshuffled `fw` systems is
    recomputed. Draws come from the global numpy random state; module-level
    `words`, `structures`, `domains` and `systems` are read.

    Parameters
    ----------
    fw : str
        Key of the framework to compare against.
    n_iter : int
        Number of permutations.

    Returns
    -------
    numpy.ndarray
        Array of shape (data-driven domains, `fw` domains, n_iter).
    """
    labels = words + structures
    n_rows, n_cols = len(domains["data-driven"]), len(domains[fw])
    sims_null = np.empty((n_rows, n_cols, n_iter))
    # Report progress roughly every tenth of the run
    report_every = float(n_iter) / 10.0
    for n in range(n_iter):
        # Sampling every label without replacement gives a random row order
        shuffled = np.random.choice(labels, size=len(labels), replace=False)
        permuted = systems["data-driven"].loc[shuffled].values.T
        distances = cdist(permuted, systems[fw].values.T, metric="dice")
        sims_null[:, :, n] = 1.0 - distances
        if n % report_every == 0:
            print("Iteration {}".format(n))
    return sims_null

In [17]:
# Permutation null for the RDoC comparison, using the function's default of 10000 iterations
sims_null = compute_sim_null("rdoc")

Iteration 0
Iteration 1000
Iteration 2000
Iteration 3000
Iteration 4000
Iteration 5000
Iteration 6000
Iteration 7000
Iteration 8000
Iteration 9000


## False discovery rate

In [18]:
from statsmodels.stats.multitest import multipletests

In [19]:
def compute_sim_fdr(fw, sims, sims_null):
    """Benjamini-Hochberg FDRs for observed similarities against a permutation null.

    The one-sided p-value of each (data-driven domain, `fw` domain) pair is the
    fraction of null similarities strictly greater than the observed one.
    Reads the module-level `domains` dictionary.

    Parameters
    ----------
    fw : str
        Key of the framework the similarities were computed against.
    sims : pandas.DataFrame
        Observed similarities, indexed by data-driven domain with one column
        per domain of `fw`.
    sims_null : numpy.ndarray
        Null similarities of shape (data-driven domains, `fw` domains, n_iter).

    Returns
    -------
    pandas.DataFrame
        Float frame of FDR-corrected p-values with the same labels as `sims`.

    Raises
    ------
    ValueError
        If `sims_null` holds no iterations.
    """
    rows, cols = domains["data-driven"], domains[fw]
    n_iter = sims_null.shape[2]
    if n_iter == 0:
        raise ValueError("sims_null contains no iterations")
    # Force float: the observed frame may be object-dtype
    obs = sims.loc[rows, cols].values.astype(float)
    # Vectorized count of null draws strictly above the observed value
    with np.errstate(invalid="ignore"):
        exceed = sims_null > obs[:, :, np.newaxis]
    pvals = exceed.sum(axis=2) / float(n_iter)
    # An undefined (NaN) observed similarity never compares as exceeded, which
    # would yield p = 0; treat it as non-significant instead.
    pvals[np.isnan(obs)] = 1.0
    fdrs = multipletests(pvals.ravel(), method="fdr_bh")[1]
    fdrs = np.asarray(fdrs, dtype=float).reshape(pvals.shape)
    return pd.DataFrame(fdrs, index=rows, columns=cols)

In [20]:
# FDR-corrected p-values of the RDoC similarities against their permutation null
fdrs = compute_sim_fdr("rdoc", sims, sims_null)
fdrs

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.0,0.3276,0.9998,0.0028,0.001575,0.9998
ANTICIPATION,0.0,0.0,0.9998,0.9998,0.9998,0.9998
COGNITION,0.0,0.0,0.9998,0.9998,0.861525,0.9998
VISION,0.9998,0.9998,0.00588,0.4791,0.9998,0.9998
MANIPULATION,0.9998,0.9998,0.0,0.9998,0.9998,0.0
MEANING,0.55804,0.9998,0.9998,0.0114545,0.9998,0.9998
LANGUAGE,0.9998,0.9998,0.9998,0.9998,0.9998,0.03745


In [21]:
def compute_sim_star(fw, fdrs):
    """Turn FDR values into significance markers.

    Each cell gets "*" for FDR < 0.05, "**" for FDR < 0.01, "***" for
    FDR < 0.001 and an empty string otherwise. Reads the module-level
    `domains` dictionary for the row and column labels.

    Parameters
    ----------
    fw : str
        Key of the framework whose domains form the columns.
    fdrs : pandas.DataFrame
        FDR values indexed by data-driven domain with one column per `fw` domain.

    Returns
    -------
    pandas.DataFrame
        Frame of marker strings with the same labels.
    """
    row_labels, col_labels = domains["data-driven"], domains[fw]
    stars = pd.DataFrame("", index=row_labels, columns=col_labels)
    # Nested cutoffs: the number of cutoffs passed equals the number of stars
    cutoffs = (0.05, 0.01, 0.001)
    for row in row_labels:
        for col in col_labels:
            value = fdrs.loc[row, col]
            n_passed = sum(1 for cut in cutoffs if value < cut)
            if n_passed:
                stars.loc[row, col] = "*" * n_passed
    return stars

In [22]:
# Significance markers for the RDoC comparison (* < 0.05, ** < 0.01, *** < 0.001)
stars = compute_sim_star("rdoc", fdrs)
stars

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,***,,,**,**,
ANTICIPATION,***,***,,,,
COGNITION,***,***,,,,
VISION,,,**,,,
MANIPULATION,,,***,,,***
MEANING,,,,*,,
LANGUAGE,,,,,,*


## Weights for figure

In [23]:
# Keep similarities with FDR < 0.05 (all other cells become NaN) and multiply by 40.
# NOTE(review): 40 is an unexplained constant, presumably a visual scaling for the figure - confirm.
sims[fdrs < 0.05] * 40

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,9.0,,,8.27586,8.88889,
ANTICIPATION,10.8108,16.9697,,,,
COGNITION,14.4762,16.4948,,,,
VISION,,,10.3704,,,
MANIPULATION,,,18.2857,,,22.9787
MEANING,,,,8.8,,
LANGUAGE,,,,,,8.18182


# Similarity of DSM and data-driven systems

## Observed values

In [24]:
# Observed similarity of each data-driven domain (rows) to each DSM domain (columns).
# This rebinds `sims`, replacing the RDoC result held under the same name.
sims = compute_sim_obs("dsm")
sims

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,0.0,0.0,0.0,0.171429,0.272727,0.0,0.318182,0.0
ANTICIPATION,0.0,0.0,0.0869565,0.15625,0.0526316,0.333333,0.0,0.0
COGNITION,0.0,0.0,0.0740741,0.252632,0.173913,0.0298507,0.0869565,0.0
VISION,0.242991,0.0,0.0,0.0,0.0,0.0,0.0273973,0.133333
MANIPULATION,0.211538,0.0350877,0.0,0.0,0.0,0.0,0.0,0.416667
MEANING,0.0659341,0.0,0.0,0.0,0.0,0.0,0.0701754,0.0
LANGUAGE,0.122449,0.0,0.0,0.0222222,0.0,0.0322581,0.0,0.0


## Null distribution

In [25]:
# Permutation null for the DSM comparison (default 10000 iterations); rebinds `sims_null`
sims_null = compute_sim_null("dsm")

Iteration 0
Iteration 1000
Iteration 2000
Iteration 3000
Iteration 4000
Iteration 5000
Iteration 6000
Iteration 7000
Iteration 8000
Iteration 9000


## False discovery rate

In [26]:
# FDR-corrected p-values of the DSM similarities against their permutation null; rebinds `fdrs`
fdrs = compute_sim_fdr("dsm", sims, sims_null)
fdrs

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,0.9993,0.9993,0.98028,0.0392,0.0028,0.9993,0.0,0.9993
ANTICIPATION,0.9993,0.98028,0.157309,0.0392,0.8328,0.0,0.9993,0.9993
COGNITION,0.9993,0.9993,0.2394,0.01344,0.0603556,0.9993,0.931467,0.9993
VISION,0.0392,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993,0.293354
MANIPULATION,0.1036,0.98028,0.9993,0.9993,0.9993,0.9993,0.9993,0.0
MEANING,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993,0.98028,0.9993
LANGUAGE,0.98028,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993


In [27]:
# Significance markers for the DSM comparison (* < 0.05, ** < 0.01, *** < 0.001); rebinds `stars`
stars = compute_sim_star("dsm", fdrs)
stars

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,,,,*,**,,***,
ANTICIPATION,,,,*,,***,,
COGNITION,,,,*,,,,
VISION,*,,,,,,,
MANIPULATION,,,,,,,,***
MEANING,,,,,,,,
LANGUAGE,,,,,,,,


## Weights for figure

In [28]:
# Keep similarities with FDR < 0.05 (all other cells become NaN) and multiply by 40.
# NOTE(review): 40 is an unexplained constant, presumably a visual scaling for the figure - confirm.
sims[fdrs < 0.05] * 40

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,,,,6.85714,10.9091,,12.7273,
ANTICIPATION,,,,6.25,,13.3333,,
COGNITION,,,,10.1053,,,,
VISION,9.71963,,,,,,,
MANIPULATION,,,,,,,,16.6667
MEANING,,,,,,,,
LANGUAGE,,,,,,,,
