In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import binarize
from collections import OrderedDict
from scipy.spatial.distance import dice
from scipy.stats import pointbiserialr
from listify import *

# Load the brain and text data

In [2]:
# Atlas label table; its PREPROCESSED column supplies the column labels that are
# kept from the activation table.
atlas_labels = pd.read_csv("../data/atlases/harvard-oxford.csv")

# Activation table (previously binarized), restricted to the atlas columns.
act_bin = pd.read_csv(
    "../data/dcm_0mm_thres0.csv",
    index_col=0,
)[atlas_labels["PREPROCESSED"]]

# Grand mean of the per-column means.
act_bin.mean().mean()

0.2984103746007817

In [3]:
def mean_thres(df):
    """Binarize every column of a table by thresholding at that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns support `mean()` and numeric comparison.

    Returns
    -------
    pandas.DataFrame
        Float table with the same index and columns as `df`. An entry is 1.0
        where the original value is strictly greater than its column mean and
        0.0 otherwise (values equal to the mean and missing values map to 0.0).
    """
    col_mean = df.mean()
    # Compare each column against its own mean in one aligned operation.
    # This replaces a per-column loop over Series.iteritems(), which no longer
    # exists in pandas >= 2.0.
    above_mean = df.gt(col_mean, axis="columns")
    return above_mean.astype(float)

In [4]:
# Document-term matrix, read from a gzip-compressed CSV with the first column as index.
dtm = pd.read_csv(
    "../data/dtm_190325.csv.gz",
    compression="gzip",
    index_col=0,
)

# Binarize each term column at its own mean.
dtm_bin = mean_thres(dtm)

# Grand mean of the per-column means of the binarized matrix.
dtm_bin.mean().mean()

0.018405182706581442

In [5]:
# Term lists for the RDoC, DSM and k-means frameworks, plus the k-means circuits.
# Each file is read with a default integer index.
rdoc_lists, dsm_lists, kmeans_lists, kmeans_circuits = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in (
        "lists/lists_rdoc_domain_opsim.csv",
        "lists/lists_dsm_class_opsim.csv",
        "lists/lists_k07_oplen.csv",
        "circuits/circuits_k07.csv",
    )
)

# Compute the brain systems

In [6]:
def score_lists(lists, dtm, label_var="LABEL"):
    """Score documents against labelled term lists and binarize the scores.

    Parameters
    ----------
    lists : pandas.DataFrame
        Table with a "TOKEN" column and a label column named by `label_var`.
    dtm : pandas.DataFrame
        Document-by-term table; it is binarized at a threshold of 0 before use.
    label_var : str
        Name of the column in `lists` that holds the list labels.

    Returns
    -------
    pandas.DataFrame
        One row per row of `dtm` and one column per distinct label (in order
        of first appearance). Per-row sums over each label's tokens are
        binarized column-wise by `mean_thres`.
    """
    occurrences = pd.DataFrame(binarize(dtm, threshold=0), index=dtm.index, columns=dtm.columns)
    vocabulary = occurrences.columns
    unique_labels = OrderedDict.fromkeys(lists[label_var])
    list_counts = pd.DataFrame(index=occurrences.index, columns=unique_labels)
    for label in list_counts.columns:
        members = lists.loc[lists[label_var] == label, "TOKEN"]
        # Tokens that have no column in the term table are skipped.
        present = [tok for tok in members if tok in vocabulary]
        list_counts[label] = occurrences[present].sum(axis=1)
    return mean_thres(list_counts)

In [7]:
# Binarized list scores per document, keyed by RDoC domain and by DSM class.
rdoc_scores, dsm_scores = (
    score_lists(term_lists, dtm_bin, label_var=label_column)
    for term_lists, label_column in ((rdoc_lists, "DOMAIN"), (dsm_lists, "CLASS"))
)

In [8]:
# Sorted union of the tokens used by any of the three frameworks.
# Set union is used instead of Series.append, which was removed in pandas 2.0.
words = sorted(
    set(rdoc_lists["TOKEN"]) | set(dsm_lists["TOKEN"]) | set(kmeans_lists["TOKEN"])
)

# Sorted unique structure names appearing in the k-means circuits.
structures = sorted(set(kmeans_circuits["STRUCTURE"]))

In [9]:
# Display order of the k-means clusters, identified by cluster number.
ordered_k = [6, 3, 7, 5, 2, 1, 4]

# Cluster number -> domain name, listed in the same order as `ordered_k`.
k2name = dict(zip(
    ordered_k,
    ["EMOTION", "ANTICIPATION", "COGNITION", "VISION", "MANIPULATION", "MEANING", "LANGUAGE"],
))

# Reverse lookup: domain name -> cluster number.
name2k = dict((name, cluster) for cluster, name in k2name.items())

In [10]:
# Distinct RDoC domains and DSM classes, in order of first appearance in their lists.
rdoc_domains = [domain for domain in OrderedDict.fromkeys(rdoc_lists["DOMAIN"])]
dsm_domains = [dsm_class for dsm_class in OrderedDict.fromkeys(dsm_lists["CLASS"])]

# k-means domain names in the display order given by `ordered_k`.
kmeans_domains = [k2name[cluster] for cluster in ordered_k]

In [11]:
# Row labels present in the activation table and in both score tables.
pmids = act_bin.index
for score_table in (rdoc_scores, dsm_scores):
    pmids = pmids.intersection(score_table.index)
len(pmids)

18155

In [12]:
# Restrict every table to the shared rows (`pmids`). The term table is also
# cut down to `words` and the activation table to `structures`.
rdoc_scores, dsm_scores = rdoc_scores.loc[pmids], dsm_scores.loc[pmids]
dtm_bin, act_bin = dtm_bin.loc[pmids, words], act_bin.loc[pmids, structures]

In [13]:
# Structure-by-domain tables, initialised to 0.0. An entry is overwritten with
# the point-biserial correlation between a structure's activation column and a
# domain's score column only when that correlation is positive and its p-value
# is below the framework's cutoff (1e-8 for RDoC, 1e-2 for DSM).
rdoc_circuits = pd.DataFrame(0.0, index=structures, columns=rdoc_domains)
dsm_circuits = pd.DataFrame(0.0, index=structures, columns=dsm_domains)
for scores, circuits, cutoff in (
    (rdoc_scores, rdoc_circuits, 1e-8),
    (dsm_scores, dsm_circuits, 1e-2),
):
    for dom in circuits.columns:
        for struct in circuits.index:
            r, p = pointbiserialr(act_bin[struct], scores[dom])
            if r > 0 and p < cutoff:
                circuits.loc[struct, dom] = r

In [14]:
# Membership table over all words and structures, one column per RDoC domain.
rdoc_systems = pd.DataFrame(0.0, index=words+structures, columns=rdoc_domains)
for dom in rdoc_domains:
    # Words: every token listed under this domain is marked as a member.
    domain_rows = rdoc_lists["DOMAIN"] == dom
    for token in rdoc_lists.loc[domain_rows, "TOKEN"]:
        rdoc_systems.loc[token, dom] = 1.0
    # Structures: copy the circuit value (0.0 unless a correlation was retained).
    for struct in structures:
        rdoc_systems.loc[struct, dom] = rdoc_circuits.loc[struct, dom]

# Any positive entry becomes 1.0, so the table holds only 0.0 and 1.0.
rdoc_systems[rdoc_systems > 0.0] = 1.0

In [15]:
# Membership table over all words and structures, one column per DSM class.
dsm_systems = pd.DataFrame(0.0, index=words+structures, columns=dsm_domains)
for dom in dsm_domains:
    # Words: every token listed under this class is marked as a member.
    class_rows = dsm_lists["CLASS"] == dom
    for token in dsm_lists.loc[class_rows, "TOKEN"]:
        dsm_systems.loc[token, dom] = 1.0
    # Structures: copy the circuit value (0.0 unless a correlation was retained).
    for struct in structures:
        dsm_systems.loc[struct, dom] = dsm_circuits.loc[struct, dom]

# Any positive entry becomes 1.0, so the table holds only 0.0 and 1.0.
dsm_systems[dsm_systems > 0.0] = 1.0

In [16]:
# Membership table over all words and structures, one column per k-means domain.
kmeans_systems = pd.DataFrame(0.0, index=words+structures, columns=kmeans_domains)
for dom in kmeans_domains:
    k = name2k[dom]
    # Words and structures assigned to cluster k are both marked as members.
    cluster_tokens = kmeans_lists.loc[kmeans_lists["CLUSTER"] == k, "TOKEN"]
    cluster_structs = kmeans_circuits.loc[kmeans_circuits["CLUSTER"] == k, "STRUCTURE"]
    for token in cluster_tokens:
        kmeans_systems.loc[token, dom] = 1.0
    for struct in cluster_structs:
        kmeans_systems.loc[struct, dom] = 1.0

# Similarity of RDoC and data-driven systems

## Observed values

In [17]:
# Observed similarity (1 - Dice dissimilarity) between the membership vector of
# each k-means domain (rows) and each RDoC domain (columns).
sims = pd.DataFrame(index=kmeans_domains, columns=rdoc_domains)
for r in rdoc_domains:
    rdoc_membership = rdoc_systems[r]
    for k in kmeans_domains:
        sims.loc[k,r] = 1.0 - dice(kmeans_systems[k], rdoc_membership)
sims

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.229508,0.0810811,0.0,0.168675,0.125,0.0
ANTICIPATION,0.290909,0.411765,0.0,0.0519481,0.0,0.0909091
COGNITION,0.302326,0.444444,0.24,0.222222,0.0,0.0824742
VISION,0.0,0.0,0.325581,0.107143,0.0,0.0594059
MANIPULATION,0.0,0.02,0.428571,0.0,0.0,0.571429
MEANING,0.0,0.0,0.0,0.1875,0.0,0.0
LANGUAGE,0.0246914,0.0212766,0.0,0.0582524,0.0,0.130435


Multiplier for line weight: similarity * 30

## Null distribution

In [18]:
# Null distribution of k-means/RDoC similarities: for every (k-means, RDoC) pair
# and every iteration, the k-means membership vector is randomly permuted across
# all words and structures and its Dice similarity to the unpermuted RDoC vector
# is recorded in sims_null[i, j, n].
n_iter = 10000

# Local seeded generator so the null distribution (and the p-values derived
# from it) is reproducible across kernel restarts without touching NumPy's
# global random state.
rng = np.random.RandomState(42)

# Loop-invariant work hoisted out of the permutation loop: the membership
# vectors are extracted once, in the shared words+structures order, as arrays.
items = words + structures
kmeans_vectors = [kmeans_systems.loc[items, k].values for k in kmeans_domains]
rdoc_vectors = [rdoc_systems.loc[items, r].values for r in rdoc_domains]

sims_null = np.empty((len(kmeans_domains), len(rdoc_domains), n_iter))
for n in range(n_iter):
    for i, kmeans_vector in enumerate(kmeans_vectors):
        for j, rdoc_vector in enumerate(rdoc_vectors):
            # A fresh permutation is drawn for every pair, as before.
            shuffled = rng.permutation(kmeans_vector)
            sims_null[i,j,n] = 1.0 - dice(shuffled, rdoc_vector)

In [19]:
# Permutation p-value per (k-means, RDoC) pair: the fraction of null similarities
# strictly greater than the observed one. When the observed similarity is not
# positive, every null draw is counted.
# NOTE(review): the comparison is strict (>) and no +1 correction is applied, so
# ties are not counted and p-values of exactly 0.0 can occur — confirm intended.
pvals = pd.DataFrame(index=kmeans_domains, columns=rdoc_domains)
for i, k in enumerate(kmeans_domains):
    for j, r in enumerate(rdoc_domains):
        observed = sims.loc[k,r]
        null_vals = sims_null[i,j,:]
        if observed > 0:
            n_counted = sum(1.0 for val in null_vals if val > observed)
        else:
            n_counted = sum(1.0 for val in null_vals)
        pval = n_counted / float(n_iter)
        pvals.loc[k,r] = pval
pvals

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.0003,0.2867,1.0,0.0084,0.0389,1.0
ANTICIPATION,0.0,0.0,1.0,0.3933,1.0,0.1019
COGNITION,0.0,0.0,0.0073,0.0127,1.0,0.7041
VISION,1.0,1.0,0.0,0.6595,1.0,0.895
MANIPULATION,1.0,0.9912,0.0,1.0,1.0,0.0
MEANING,1.0,1.0,1.0,0.0216,1.0,1.0
LANGUAGE,0.9325,0.9816,1.0,0.9018,1.0,0.238


In [20]:
# Apply fdrcorrection to all p-values at once (flattened), keep element [1] of
# its result, and restore the k-means-by-RDoC table layout.
# NOTE(review): fdrcorrection is not imported by name in this notebook;
# presumably it is provided by `from listify import *` — confirm.
pvals_cor = pd.DataFrame(
    np.reshape(fdrcorrection(pvals.values.ravel())[1], pvals.shape),
    index=kmeans_domains,
    columns=rdoc_domains,
)
pvals_cor

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.001575,0.752588,1.0,0.03528,0.125677,1.0
ANTICIPATION,0.0,0.0,1.0,0.971682,1.0,0.3057
COGNITION,0.0,0.0,0.0340667,0.0484909,1.0,1.0
VISION,1.0,1.0,0.0,1.0,1.0,1.0
MANIPULATION,1.0,1.0,0.0,1.0,1.0,0.0
MEANING,1.0,1.0,1.0,0.0756,1.0,1.0
LANGUAGE,1.0,1.0,1.0,1.0,1.0,0.6664


In [21]:
# Significance markers for the corrected p-values:
# "***" below 0.001, "**" below 0.01, "*" below 0.05, otherwise an empty string.
stars = pd.DataFrame("", index=kmeans_domains, columns=rdoc_domains)
for k in kmeans_domains:
    for r in rdoc_domains:
        pval = pvals_cor.loc[k,r]
        # Test the strictest cutoff first so each cell is written at most once.
        if pval < 0.001:
            stars.loc[k,r] = "***"
        elif pval < 0.01:
            stars.loc[k,r] = "**"
        elif pval < 0.05:
            stars.loc[k,r] = "*"
stars

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,**,,,*,,
ANTICIPATION,***,***,,,,
COGNITION,***,***,*,*,,
VISION,,,***,,,
MANIPULATION,,,***,,,***
MEANING,,,,,,
LANGUAGE,,,,,,


# Similarity of DSM and data-driven systems

## Observed values

In [25]:
# Observed similarity (1 - Dice dissimilarity) between the membership vector of
# each k-means domain (rows) and each DSM class (columns).
sims = pd.DataFrame(index=kmeans_domains, columns=dsm_domains)
for r in dsm_domains:
    dsm_membership = dsm_systems[r]
    for k in kmeans_domains:
        sims.loc[k,r] = 1.0 - dice(kmeans_systems[k], dsm_membership)
sims

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,0.0,0.0,0.235294,0.196721,0.26087,0.130435,0.26087,0.0
ANTICIPATION,0.0,0.0,0.142857,0.181818,0.15,0.35,0.1,0.0
COGNITION,0.0769231,0.0357143,0.0677966,0.0930233,0.140845,0.056338,0.140845,0.0377358
VISION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MANIPULATION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MEANING,0.121212,0.0,0.0,0.0,0.0,0.0,0.0338983,0.0487805
LANGUAGE,0.0273973,0.0,0.0,0.0,0.030303,0.0,0.030303,0.0


## Null distribution

In [26]:
# Null distribution of k-means/DSM similarities: for every (k-means, DSM) pair
# and every iteration, the k-means membership vector is randomly permuted across
# all words and structures and its Dice similarity to the unpermuted DSM vector
# is recorded in sims_null[i, j, n].
n_iter = 10000

# Local seeded generator so the null distribution (and the p-values derived
# from it) is reproducible across kernel restarts without touching NumPy's
# global random state.
rng = np.random.RandomState(42)

# Loop-invariant work hoisted out of the permutation loop: the membership
# vectors are extracted once, in the shared words+structures order, as arrays.
items = words + structures
kmeans_vectors = [kmeans_systems.loc[items, k].values for k in kmeans_domains]
dsm_vectors = [dsm_systems.loc[items, r].values for r in dsm_domains]

sims_null = np.empty((len(kmeans_domains), len(dsm_domains), n_iter))
for n in range(n_iter):
    for i, kmeans_vector in enumerate(kmeans_vectors):
        for j, dsm_vector in enumerate(dsm_vectors):
            # A fresh permutation is drawn for every pair, as before.
            shuffled = rng.permutation(kmeans_vector)
            sims_null[i,j,n] = 1.0 - dice(shuffled, dsm_vector)

In [27]:
# Permutation p-value per (k-means, DSM) pair: the fraction of null similarities
# strictly greater than the observed one. When the observed similarity is not
# positive, every null draw is counted.
# NOTE(review): the comparison is strict (>) and no +1 correction is applied, so
# ties are not counted and p-values of exactly 0.0 can occur — confirm intended.
pvals = pd.DataFrame(index=kmeans_domains, columns=dsm_domains)
for i, k in enumerate(kmeans_domains):
    for j, r in enumerate(dsm_domains):
        observed = sims.loc[k,r]
        null_vals = sims_null[i,j,:]
        if observed > 0:
            n_counted = sum(1.0 for val in null_vals if val > observed)
        else:
            n_counted = sum(1.0 for val in null_vals)
        pval = n_counted / float(n_iter)
        pvals.loc[k,r] = pval
pvals

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,1.0,1.0,0.0001,0.0013,0.0,0.0357,0.0001,1.0
ANTICIPATION,1.0,1.0,0.0099,0.0017,0.0104,0.0,0.0566,1.0
COGNITION,0.4952,0.308,0.1675,0.4715,0.0492,0.5516,0.0471,0.1746
VISION,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
MANIPULATION,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
MEANING,0.1103,1.0,1.0,1.0,1.0,1.0,0.6347,0.1047
LANGUAGE,0.855,1.0,1.0,1.0,0.7378,1.0,0.7325,1.0


In [28]:
# Apply fdrcorrection to all p-values at once (flattened), keep element [1] of
# its result, and restore the k-means-by-DSM table layout.
# NOTE(review): fdrcorrection is not imported by name in this notebook;
# presumably it is provided by `from listify import *` — confirm.
pvals_cor = pd.DataFrame(
    np.reshape(fdrcorrection(pvals.values.ravel())[1], pvals.shape),
    index=kmeans_domains,
    columns=dsm_domains,
)
pvals_cor

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,1.0,1,0.0014,0.01456,0.0,0.222133,0.0014,1.0
ANTICIPATION,1.0,1,0.0728,0.0158667,0.0728,0.0,0.264133,1.0
COGNITION,1.0,1,0.6111,1.0,0.250473,1.0,0.250473,0.6111
VISION,1.0,1,1.0,1.0,1.0,1.0,1.0,1.0
MANIPULATION,1.0,1,1.0,1.0,1.0,1.0,1.0,1.0
MEANING,0.4412,1,1.0,1.0,1.0,1.0,1.0,0.4412
LANGUAGE,1.0,1,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Significance markers for the corrected p-values:
# "***" below 0.001, "**" below 0.01, "*" below 0.05, otherwise an empty string.
stars = pd.DataFrame("", index=kmeans_domains, columns=dsm_domains)
for k in kmeans_domains:
    for r in dsm_domains:
        pval = pvals_cor.loc[k,r]
        # Test the strictest cutoff first so each cell is written at most once.
        if pval < 0.001:
            stars.loc[k,r] = "***"
        elif pval < 0.01:
            stars.loc[k,r] = "**"
        elif pval < 0.05:
            stars.loc[k,r] = "*"
stars

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,,,**,*,***,,**,
ANTICIPATION,,,,*,,***,,
COGNITION,,,,,,,,
VISION,,,,,,,,
MANIPULATION,,,,,,,,
MEANING,,,,,,,,
LANGUAGE,,,,,,,,
