In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
from sklearn.preprocessing import binarize
from collections import OrderedDict
from scipy.spatial.distance import dice
from listify import *

# Load brain and text data

In [2]:
# Atlas label table; its "PREPROCESSED" column supplies the column names selected below
atlas_labels = pd.read_csv("../data/atlases/harvard-oxford.csv")
act_bin = pd.read_csv("../data/dcm_0mm_thres0.csv", index_col=0) # Previously binarized
# Keep only the columns named in the atlas table, in the atlas table's order
act_bin = act_bin[atlas_labels["PREPROCESSED"]]
# Grand mean of the column means, shown as the cell output
act_bin.mean().mean()

0.2984103746007817

In [3]:
def mean_thres(df):
    """Binarize each column of a DataFrame at that column's own mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to threshold.

    Returns
    -------
    pandas.DataFrame
        Float frame with the same index and columns as ``df``. An entry is
        1.0 where the input value is strictly greater than the mean of its
        column and 0.0 otherwise (a comparison involving NaN yields 0.0).
    """
    col_mean = df.mean()
    # Column-aligned comparison replaces the per-column loop over
    # ``Series.iteritems()``, which was removed in pandas 2.0.
    df_bin = df.gt(col_mean, axis="columns").astype(float)
    return df_bin

In [4]:
# Term table (presumably a document-term matrix, given the name) read from a gzip-compressed CSV
dtm = pd.read_csv("../data/dtm_190325.csv.gz", compression="gzip", index_col=0)
# Binarize every column at its own mean
dtm_bin = mean_thres(dtm)
# Grand mean of the column means, shown as the cell output
dtm_bin.mean().mean()

0.018405182706581442

In [5]:
# Framework keys used throughout the notebook; `files` is parallel to `frameworks`
frameworks = ["kmeans", "rdoc", "dsm"]
files = ["k07_oplen", "rdoc_domain_opsim", "dsm_class_opsim"]
# One term-list table per framework, read from lists/lists_<file>.csv
lists = {fw: pd.read_csv("lists/lists_{}.csv".format(file)) for fw, file in zip(frameworks, files)}

In [6]:
# One circuit table per framework, read from circuits/circuits_domain_<fw>.csv with the first column as the index
circuits = {fw: pd.read_csv("circuits/circuits_domain_{}.csv".format(fw), index_col=0) for fw in frameworks}

In [7]:
def score_lists(lists, dtm, label_var="LABEL"):
    """Score each row of ``dtm`` against every labelled token list.

    Parameters
    ----------
    lists : pandas.DataFrame
        Table with a ``"TOKEN"`` column and a label column named by
        ``label_var``.
    dtm : pandas.DataFrame
        Table whose columns are tokens; it is binarized at threshold 0
        before counting.
    label_var : str
        Name of the column in ``lists`` that holds the list labels.

    Returns
    -------
    pandas.DataFrame
        One column per distinct label (first-seen order), indexed like
        ``dtm``; per-label token counts thresholded by ``mean_thres``.
    """
    presence = pd.DataFrame(binarize(dtm, threshold=0), index=dtm.index, columns=dtm.columns)
    # OrderedDict keeps each label once, in order of first appearance
    distinct_labels = OrderedDict.fromkeys(lists[label_var])
    counts = pd.DataFrame(index=presence.index, columns=distinct_labels)
    for label in counts.columns:
        in_list = lists[label_var] == label
        # Tokens absent from the term table are skipped
        usable = [tok for tok in lists.loc[in_list, "TOKEN"] if tok in presence.columns]
        counts[label] = presence[usable].sum(axis=1)
    return mean_thres(counts)

In [8]:
# Label column of each framework's list table, parallel to `frameworks`
labels = ["CLUSTER", "DOMAIN", "CLASS"]
# Per-framework list scores computed from the mean-thresholded term table
scores = {fw: score_lists(lists[fw], dtm_bin, label_var=lab) for fw, lab in zip(frameworks, labels)}

In [9]:
# Sorted union of the tokens from all three frameworks' lists.
# pd.concat replaces the chained Series.append calls, which were removed in pandas 2.0.
words = sorted(set(pd.concat([lists["rdoc"]["TOKEN"], lists["dsm"]["TOKEN"], lists["kmeans"]["TOKEN"]])))
# Sorted unique row labels of the k-means circuit table
structures = sorted(set(circuits["kmeans"].index))

In [10]:
# Order in which the k-means cluster IDs are arranged
ordered_k = [6, 3, 7, 5, 2, 1, 4]
# Cluster ID -> domain name
k2name = {6: "EMOTION", 3: "ANTICIPATION", 7: "COGNITION",
          5: "VISION", 2: "MANIPULATION", 1: "MEANING", 4: "LANGUAGE"}
# Inverse lookup: domain name -> cluster ID
name2k = {v: k for k, v in k2name.items()}

In [11]:
# Distinct labels of each framework, in order of first appearance in its list table
domains = {fw: list(OrderedDict.fromkeys(lists[fw][lab])) for fw, lab in zip(frameworks, labels)}
# For k-means, replace the label list with the domain names arranged by ordered_k
domains["kmeans"] = [k2name[k] for k in ordered_k]

In [12]:
# Row labels present in the activation table and in both the RDoC and DSM score tables.
# The k-means scores are not intersected here; score_lists indexes every score table by the
# index of the term table it was given, so all three score tables share the same index.
pmids = act_bin.index.intersection(scores["rdoc"].index).intersection(scores["dsm"].index)
len(pmids)

18155

In [13]:
# Restrict every framework's score table to the shared row labels
for fw in frameworks:
    scores[fw] = scores[fw].loc[pmids]

In [14]:
# Restrict the term table to the shared rows and to the tokens used by any framework
# NOTE(review): assumes every entry of `words` is a column of dtm_bin — confirm
dtm_bin = dtm_bin.loc[pmids, words]
# Restrict the activation table to the shared rows and to the circuit structures
act_bin = act_bin.loc[pmids, structures]

In [15]:
# Relabel numeric cluster IDs with their domain names. Values that are already
# names (keys of name2k) pass through untouched, so re-running this cell is a
# no-op instead of raising KeyError; an unknown ID still raises KeyError.
lists["kmeans"]["CLUSTER"] = [k if k in name2k else k2name[k] for k in lists["kmeans"]["CLUSTER"]]

# Load frameworks

In [16]:
# For each framework, build a table with one row per word followed by one row per structure
# and one column per domain, holding that domain's word membership and circuit values.
# The k-means pass relies on the CLUSTER column already holding domain names, because
# domains["kmeans"] holds names rather than cluster IDs.
systems = {}
for fw, lab in zip(frameworks, labels):
    fw_df = pd.DataFrame(0.0, index=words+structures, columns=domains[fw])
    for dom in domains[fw]:
        # Mark every token listed under this domain
        for word in lists[fw].loc[lists[fw][lab] == dom, "TOKEN"]:
            fw_df.loc[word, dom] = 1.0
        # Copy this domain's circuit value for every structure
        for struct in structures:
            fw_df.loc[struct, dom] = circuits[fw].loc[struct, dom]
    # Set every positive entry to 1.0; non-positive entries are left as they are
    fw_df[fw_df > 0.0] = 1.0
    systems[fw] = fw_df

# Similarity of RDoC and data-driven systems

## Observed values

In [17]:
# Observed Dice similarity (1 - Dice distance) between every k-means system and every RDoC system
sims = pd.DataFrame(index=domains["kmeans"], columns=domains["rdoc"])
for k in sims.index:
    for r in sims.columns:
        sims.loc[k, r] = 1.0 - dice(systems["kmeans"][k], systems["rdoc"][r])
sims

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.225,0.111111,0.0,0.206897,0.222222,0.0
ANTICIPATION,0.27027,0.424242,0.0,0.0246914,0.0,0.0322581
COGNITION,0.361905,0.412371,0.0576923,0.107143,0.113636,0.0215054
VISION,0.0,0.0,0.259259,0.172414,0.0217391,0.0412371
MANIPULATION,0.0,0.0,0.457143,0.0,0.0,0.574468
MEANING,0.129032,0.0,0.0217391,0.22,0.0789474,0.0
LANGUAGE,0.08,0.0217391,0.0,0.11215,0.0481928,0.204545


## Null distribution

In [18]:
# Permutation null: on each iteration the rows of the k-means systems are reordered at random
# and the Dice similarity of every (k-means, RDoC) column pair is recomputed against the
# unshuffled RDoC systems.
# NOTE(review): cdist is not imported in this notebook's import cell; presumably it is
# provided by `from listify import *` — confirm.
n_iter = 10000
sims_null = np.empty((len(domains["kmeans"]), len(domains["rdoc"]), n_iter))
for n in range(n_iter):
    # Drawing every row label once without replacement gives a random reordering
    null = np.random.choice(words+structures, 
                            size=len(words+structures), replace=False)
    sims_null[:,:,n] = 1.0 - cdist(systems["kmeans"].loc[null].values.T, 
                                   systems["rdoc"].values.T, metric="dice")
    # Print progress at every tenth of the iterations
    if n % (float(n_iter) / 10.0) == 0:
        print("Iteration {}".format(n))

Iteration 0
Iteration 1000
Iteration 2000
Iteration 3000
Iteration 4000
Iteration 5000
Iteration 6000
Iteration 7000
Iteration 8000
Iteration 9000


In [19]:
# p-value per pair: share of null similarities strictly greater than the observed similarity
pvals = pd.DataFrame(index=domains["kmeans"], columns=domains["rdoc"])
for i, k in enumerate(domains["kmeans"]):
    for j, r in enumerate(domains["rdoc"]):
        # Vectorized count of null draws exceeding the observed value
        pval = float(np.count_nonzero(sims_null[i,j,:] > sims.loc[k,r])) / float(n_iter)
        pvals.loc[k,r] = pval
pvals

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.0002,0.106,0.9643,0.0005,0.0003,0.93
ANTICIPATION,0.0,0.0,0.9022,0.7262,0.8071,0.5265
COGNITION,0.0,0.0,0.916,0.6411,0.3266,0.9781
VISION,0.9998,0.9993,0.0013,0.1615,0.9749,0.9455
MANIPULATION,0.9998,0.9983,0.0,0.9998,0.9949,0.0
MEANING,0.1996,0.9904,0.9697,0.0039,0.4576,0.9869
LANGUAGE,0.7372,0.9744,0.9981,0.5007,0.8198,0.0121


In [20]:
# Correct the flattened p-values with fdrcorrection (second return value), then restore the
# original table shape and labels.
# NOTE(review): fdrcorrection is not imported in this notebook's import cell; presumably it
# is provided by `from listify import *` — confirm.
pvals_cor = pd.DataFrame(
    np.reshape(fdrcorrection(pvals.values.ravel())[1], pvals.shape),
    index=domains["kmeans"],
    columns=domains["rdoc"],
)
pvals_cor

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,0.0012,0.342462,0.9998,0.00233333,0.001575,0.9998
ANTICIPATION,0.0,0.0,0.9998,0.9998,0.9998,0.9998
COGNITION,0.0,0.0,0.9998,0.9998,0.857325,0.9998
VISION,0.9998,0.9998,0.00546,0.4845,0.9998,0.9998
MANIPULATION,0.9998,0.9998,0.0,0.9998,0.9998,0.0
MEANING,0.55888,0.9998,0.9998,0.0148909,0.9998,0.9998
LANGUAGE,0.9998,0.9998,0.9998,0.9998,0.9998,0.04235


In [21]:
# Significance markers for the corrected p-values: * < 0.05, ** < 0.01, *** < 0.001
stars = pd.DataFrame("", index=domains["kmeans"], columns=domains["rdoc"])
for k in domains["kmeans"]:
    for r in domains["rdoc"]:
        pval = pvals_cor.loc[k,r]
        # Strictest cutoff first, so each cell is written at most once
        if pval < 0.001:
            stars.loc[k,r] = "***"
        elif pval < 0.01:
            stars.loc[k,r] = "**"
        elif pval < 0.05:
            stars.loc[k,r] = "*"
stars

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,**,,,**,**,
ANTICIPATION,***,***,,,,
COGNITION,***,***,,,,
VISION,,,**,,,
MANIPULATION,,,***,,,***
MEANING,,,,*,,
LANGUAGE,,,,,,*


## Weights for figure

In [22]:
# Similarities whose corrected p-value is below 0.05, multiplied by 40; all other cells are NaN
sims[pvals_cor < 0.05] * 40

Unnamed: 0,NEGATIVE_VALENCE,POSITIVE_VALENCE,COGNITIVE_SYSTEMS,SOCIAL_PROCESSES,AROUSAL_REGULATION,SENSORIMOTOR_SYSTEMS
EMOTION,9.0,,,8.27586,8.88889,
ANTICIPATION,10.8108,16.9697,,,,
COGNITION,14.4762,16.4948,,,,
VISION,,,10.3704,,,
MANIPULATION,,,18.2857,,,22.9787
MEANING,,,,8.8,,
LANGUAGE,,,,,,8.18182


# Similarity of DSM and data-driven systems

## Observed values

In [23]:
# Observed Dice similarity (1 - Dice distance) between every k-means system and every DSM system
sims = pd.DataFrame(index=domains["kmeans"], columns=domains["dsm"])
for k in sims.index:
    for r in sims.columns:
        sims.loc[k, r] = 1.0 - dice(systems["kmeans"][k], systems["dsm"][r])
sims

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,0.0,0.0,0.0,0.171429,0.272727,0.0,0.318182,0.0
ANTICIPATION,0.0,0.0,0.0869565,0.15625,0.0526316,0.333333,0.0,0.0
COGNITION,0.0,0.0,0.0740741,0.252632,0.173913,0.0298507,0.0869565,0.0
VISION,0.242991,0.0,0.0,0.0,0.0,0.0,0.0273973,0.133333
MANIPULATION,0.211538,0.0350877,0.0,0.0,0.0,0.0,0.0,0.416667
MEANING,0.0659341,0.0,0.0,0.0,0.0,0.0,0.0701754,0.0
LANGUAGE,0.122449,0.0,0.0,0.0222222,0.0,0.0322581,0.0,0.0


## Null distribution

In [24]:
# Permutation null: on each iteration the rows of the k-means systems are reordered at random
# and the Dice similarity of every (k-means, DSM) column pair is recomputed against the
# unshuffled DSM systems.
# NOTE(review): cdist is not imported in this notebook's import cell; presumably it is
# provided by `from listify import *` — confirm.
n_iter = 10000
sims_null = np.empty((len(domains["kmeans"]), len(domains["dsm"]), n_iter))
for n in range(n_iter):
    # Drawing every row label once without replacement gives a random reordering
    null = np.random.choice(words+structures, 
                            size=len(words+structures), replace=False)
    sims_null[:,:,n] = 1.0 - cdist(systems["kmeans"].loc[null].values.T, 
                                   systems["dsm"].values.T, metric="dice")
    # Print progress at every tenth of the iterations
    if n % (float(n_iter) / 10.0) == 0:
        print("Iteration {}".format(n))

Iteration 0
Iteration 1000
Iteration 2000
Iteration 3000
Iteration 4000
Iteration 5000
Iteration 6000
Iteration 7000
Iteration 8000
Iteration 9000


In [25]:
# p-value per pair: share of null similarities strictly greater than the observed similarity
pvals = pd.DataFrame(index=domains["kmeans"], columns=domains["dsm"])
for i, k in enumerate(domains["kmeans"]):
    for j, r in enumerate(domains["dsm"]):
        # Vectorized count of null draws exceeding the observed value
        pval = float(np.count_nonzero(sims_null[i,j,:] > sims.loc[k,r])) / float(n_iter)
        pvals.loc[k,r] = pval
pvals

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,0.9638,0.4118,0.3399,0.0078,0.0001,0.6755,0.0,0.7502
ANTICIPATION,0.8992,0.3123,0.0303,0.0046,0.2079,0.0,0.5862,0.6098
COGNITION,0.9993,0.6946,0.0469,0.0016,0.0089,0.7117,0.2493,0.9492
VISION,0.0035,0.7257,0.6525,0.9991,0.9542,0.9421,0.7898,0.0659
MANIPULATION,0.0166,0.3184,0.6225,0.999,0.9423,0.9312,0.9423,0.0
MEANING,0.7289,0.5731,0.5043,0.9903,0.8686,0.8491,0.2977,0.8939
LANGUAGE,0.3326,0.6528,0.5785,0.9718,0.9173,0.6439,0.9103,0.9288


In [26]:
# Correct the flattened p-values with fdrcorrection (second return value), then restore the
# original table shape and labels.
# NOTE(review): fdrcorrection is not imported in this notebook's import cell; presumably it
# is provided by `from listify import *` — confirm.
pvals_cor = pd.DataFrame(
    np.reshape(fdrcorrection(pvals.values.ravel())[1], pvals.shape),
    index=domains["kmeans"],
    columns=domains["dsm"],
)
pvals_cor

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,0.9993,0.9993,0.95172,0.0546,0.0014,0.9993,0.0,0.9993
ANTICIPATION,0.9993,0.95172,0.154255,0.0368,0.8316,0.0,0.9993,0.9993
COGNITION,0.9993,0.9993,0.218867,0.01792,0.0553778,0.9993,0.93072,0.9993
VISION,0.0326667,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993,0.283877
MANIPULATION,0.09296,0.95172,0.9993,0.9993,0.9993,0.9993,0.9993,0.0
MEANING,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993,0.95172,0.9993
LANGUAGE,0.95172,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993,0.9993


In [27]:
# Significance markers for the corrected p-values: * < 0.05, ** < 0.01, *** < 0.001
stars = pd.DataFrame("", index=domains["kmeans"], columns=domains["dsm"])
for k in domains["kmeans"]:
    for r in domains["dsm"]:
        pval = pvals_cor.loc[k,r]
        # Strictest cutoff first, so each cell is written at most once
        if pval < 0.001:
            stars.loc[k,r] = "***"
        elif pval < 0.01:
            stars.loc[k,r] = "**"
        elif pval < 0.05:
            stars.loc[k,r] = "*"
stars

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,,,,,**,,***,
ANTICIPATION,,,,*,,***,,
COGNITION,,,,*,,,,
VISION,*,,,,,,,
MANIPULATION,,,,,,,,***
MEANING,,,,,,,,
LANGUAGE,,,,,,,,


## Weights for figure

In [28]:
# Similarities whose corrected p-value is below 0.05, multiplied by 40; all other cells are NaN
sims[pvals_cor < 0.05] * 40

Unnamed: 0,DEVELOPMENTAL,PSYCHOTIC,BIPOLAR,DEPRESSIVE,ANXIETY,OBSESSIVE_COMPULSIVE,TRAUMA_STRESSOR,COGNITIVE
EMOTION,,,,,10.9091,,12.7273,
ANTICIPATION,,,,6.25,,13.3333,,
COGNITION,,,,10.1053,,,,
VISION,9.71963,,,,,,,
MANIPULATION,,,,,,,,16.6667
MEANING,,,,,,,,
LANGUAGE,,,,,,,,
