# Introduction

This notebook translates the disorders in the [DSM-5](https://dsm.psychiatryonline.org/doi/book/10.1176/appi.books.9780890425596) into the language of the psychiatric neuroimaging literature. The objective is to maximize semantic similarity between the centroid of seed terms from the DSM-5 (i.e., disorder names) and the centroid of a new list of terms for mental functions and dysfunctions. Seed terms are grouped by the headings in Section II of the DSM-5. The work flow is as follows:

1. Identify the most semantically similar terms to each domain centroid across list lengths.
2. Select the word list length that maximizes semantic similarity to the domain centroid.
3. Filter domains by document frequency of their assigned terms.

Our vector space model for semantic content is [GloVe](https://github.com/stanfordnlp/GloVe), which was trained on 26,070 psychiatric neuroimaging articles. The parameters for the GloVe model were a minimum word count of 5, window size of 15, and embedding dimension of 100 over 500 iterations.

# Load the data

In [1]:
import pandas as pd
from collections import OrderedDict
import sys
sys.path.append("..")
import utilities, ontology

In [2]:
vsm_version = 190428 # Version of GloVe embeddings
dtm_version = 190325 # Version of document-term matrix

## Vector space model

In [3]:
vsm = pd.read_csv("../data/text/glove_psy_n100_win15_min5_iter500_{}.txt".format(vsm_version), 
                  index_col=0, header=None, sep=" ")
print("Vocab N={}, Embedding N={}".format(vsm.shape[0], vsm.shape[1]))

Vocab N=320502, Embedding N=100


## Document-term matrix

In [4]:
dtm = utilities.load_doc_term_matrix(version=dtm_version, binarize=False)
dtm = dtm.loc[:, (dtm != 0).any(axis=0)]
print("Document N={}, Term N={}".format(dtm.shape[0], dtm.shape[1]))

Document N=18155, Term N=4107


In [5]:
dtm_bin = utilities.doc_mean_thres(dtm)

## DSM-5 seed terms

In [6]:
seed_df = pd.read_csv("../data/text/seeds_dsm5.csv", index_col=None, header=0)
seed_df.head()

Unnamed: 0,ORDER,NAME,DOMAIN,CONSTRUCT,TOKEN
0,0,Neurodevelopmental Disorders,DEVELOPMENTAL,INTELLECTUAL_DISABILITY,borderline_intellectual_functioning
1,0,Neurodevelopmental Disorders,DEVELOPMENTAL,INTELLECTUAL_DISABILITY,global_developmental_delay
2,0,Neurodevelopmental Disorders,DEVELOPMENTAL,INTELLECTUAL_DISABILITY,intellectual_developmental_disorder
3,0,Neurodevelopmental Disorders,DEVELOPMENTAL,INTELLECTUAL_DISABILITY,intellectual_disability
4,0,Neurodevelopmental Disorders,DEVELOPMENTAL,INTELLECTUAL_DISABILITY,unspecified_intellectual_developmental_disorder


In [7]:
doms = list(OrderedDict.fromkeys(seed_df["DOMAIN"]))

## Lexicon

In [8]:
lexicon = utilities.load_lexicon(["cogneuro", "dsm", "psychiatry"])
lexicon = sorted(list(set(lexicon).intersection(vsm.index).intersection(dtm.columns)))
len(lexicon)

2170

# Generate term lists

## 1. Identify semantically similar terms

In [9]:
import numpy as np
np.random.seed(42)
from scipy.spatial.distance import cdist

In [10]:
class_tkns = []
for dom in doms:
    class_tkns += set(seed_df.loc[seed_df["DOMAIN"] == dom, "TOKEN"])
unique = [tkn for tkn in class_tkns if class_tkns.count(tkn) == 1]

In [11]:
list_len = 25
n_terms = range(5, list_len+1)
lists = ontology.load_dsm_lists(lexicon, vsm, seed_df, n_terms=n_terms, verbose=True)
lists.to_csv("lists/lists_dsm.csv", index=None)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  seed_centroid = np.mean(vsm.loc[seed_tkns])


## 2. Optimize length by similarity to seeds

In [12]:
from scipy.spatial.distance import cosine

In [13]:
ops = []
df = pd.DataFrame(index=doms, columns=list_lens)
for dom in doms:
    seed_tkns = seed_df.loc[seed_df["DOMAIN"] == dom, "TOKEN"]
    seed_centroid = np.mean(vsm.loc[seed_tkns])
    for list_len in n_terms:
        len_tkns = lists.loc[lists["DOMAIN"] == dom, "TOKEN"][:list_len]
        len_centroid = np.mean(vsm.loc[len_tkns])
        df.loc[dom, list_len] = 1.0 - cosine(seed_centroid, len_centroid)
    sims = list(df.loc[dom])
    idx = sims.index(max(sims))
    ops.append(np.array(list_lens)[idx])
df["OPTIMAL"] = ops
df.head()

NameError: name 'list_lens' is not defined

In [None]:
columns = ["ORDER", "DOMAIN", "TOKEN", "SOURCE", "DISTANCE"]
new = pd.DataFrame(columns=columns)
for order, dom in enumerate(doms):
    list_len = df.loc[dom, "OPTIMAL"]
    dom_df = lists.loc[lists["DOMAIN"] == dom][:list_len]
    new = new.append(dom_df)

## 3. Filter domains by document frequency of terms

In [None]:
doms = list(OrderedDict.fromkeys(seed_df["DOMAIN"]))
filt_doms = []
for dom in doms: 
    tkns = set(new.loc[new["DOMAIN"] == dom, "TOKEN"])
    freq = sum([1.0 for doc in dtm_bin[tkns].sum(axis=1) if doc > 0]) / float(len(dtm))
    if freq > 0.05:
        print("{:20s} {:6.4f}".format(dom, freq))
        filt_doms.append(dom)
doms = filt_doms

In [None]:
new = new.loc[new["DOMAIN"].isin(filt_doms)]
new = new.loc[new["DISTANCE"] > 0]
print("Domains after filtering: N={}".format(len(set(new["DOMAIN"]))))

In [None]:
df.to_csv("data/df_dsm_opsim.csv")
new.to_csv("lists/lists_dsm_opsim.csv", index=None)

# Assess similarity to seeds

In [None]:
n_iter = 10000

### Centroids

In [None]:
seed_centroid = ontology.compute_centroid(seed_df, doms, vsm)
new_centroid = ontology.compute_centroid(new, doms, vsm)

### Bootstrap distribution

In [None]:
sim_boot = np.zeros((len(doms), n_iter))
for n in range(n_iter):
    sim_boot[:,n] = 1.0 - ontology.compute_sims_sample(new_centroid, seed_centroid, vsm)

### Null distribution

In [None]:
sim_null = np.empty((len(doms), n_iter))
for n in range(n_iter):
    sim_null[:,n] = 1.0 - ontology.compute_sims_shuffle(new_centroid, seed_centroid, vsm)

### Observed values

In [None]:
sim_obs = np.reshape(1.0 - ontology.compute_sims(new, seed_centroid, doms, vsm), (len(doms),1))

### Comparison test

Is similarity to DSM seed centroids higher for **new vs. null** models?

In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
alphas = [0.01, 0.001, 0.0001]

In [None]:
pvals = np.sum(np.less(sim_obs, sim_null), axis=1) / n_iter
fdrs = multipletests(pvals, method="fdr_bh")[1]
ontology.report_significance(fdrs, doms, alphas=alphas)

### Null confidence interval

In [None]:
interval = 0.95
lower = [sorted(sim_null[i,:])[int(n_iter*(1.0-interval))] for i in range(len(doms))]
upper = [sorted(sim_null[i,:])[int(n_iter*interval)] for i in range(len(doms))]

## Plot results

Comparison of new method vs. method of McCoy *et al.* in terms of similarity between the synonyms and RDoC seed terms. Statistics based on bootstrapping and permutation testing.

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm, font_manager, rcParams
%matplotlib inline

In [None]:
arial = "../style/Arial Unicode.ttf"
prop_md = font_manager.FontProperties(fname=arial, size=16)
prop_lg = font_manager.FontProperties(fname=arial, size=22)
prop_xlg = font_manager.FontProperties(fname=arial, size=25)
rcParams["axes.linewidth"] = 1.5

In [None]:
dom_names = [dom.replace("_", "-").title() for dom in doms]
palette = utilities.palettes["dsm"]

In [None]:
# Set up figure
fig = plt.figure(figsize=(3.5, 4.5))
ax = fig.add_axes([0,0,1,1])

# Null distributions as line plot
plt.plot(range(len(doms)), sim_null.mean(axis=1),
         "gray", linestyle="dashed", linewidth=2)
plt.fill_between(range(len(doms)), lower, y2=upper, 
                 color="gray", alpha=0.2)

# Bootstrap distributions as violin plots
for i, dom in enumerate(doms):
    for data, obs in [(sim_boot, sim_obs)]:
        data = sorted(data[i])
        v = ax.violinplot(data, positions=[i], 
                          showmeans=False, showmedians=False, widths=0.8)
        for pc in v["bodies"]:
            pc.set_facecolor(palette[i])
            pc.set_edgecolor(palette[i])
            pc.set_linewidth(0.5)
            pc.set_alpha(0.6)
        for line in ["cmaxes", "cmins", "cbars"]:
            v[line].set_edgecolor("none")
        plt.plot([i-0.33, i+0.33], [np.mean(obs[i]), np.mean(obs[i])], 
                 c=palette[i], alpha=1, lw=2)

# Set plot parameters
ax.set_xticks(range(len(doms)))
ax.set_xticklabels(dom_names, rotation=60, 
                   ha="right", fontproperties=prop_md)
plt.xticks(fontproperties=prop_md)
plt.yticks(fontproperties=prop_xlg)
ax.yaxis.set_label_coords(-0.35, 0.5)
plt.xlim([-1, len(doms)])
plt.ylim([-0.2, 1])
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
ax.xaxis.set_tick_params(width=1.5)
ax.yaxis.set_tick_params(width=1.5, length=5)

# Export figure
plt.savefig("figures/dsm_seed_sim.png", dpi=250, bbox_inches="tight")
plt.show()

# Visualize the term lists

In [None]:
dtm_bin = utilities.load_doc_term_matrix(version=dtm_version, binarize=True)

In [None]:
ontology.plot_wordclouds("dsm", doms, new, dtm_bin)

# Visualize the circuits

## Compute PPMI-weighted links

In [None]:
act_bin = utilities.load_coordinates()
print("Document N={}, Structure N={}".format(act_bin.shape[0], act_bin.shape[1]))

In [None]:
scores = utilities.score_lists(new, dtm_bin, label_var="DOMAIN")
print("Document N={}, Structure N={}".format(scores.shape[0], scores.shape[1]))

In [None]:
pmids = act_bin.index.intersection(scores.index)
len(pmids)

In [None]:
act_bin = act_bin.loc[pmids]
scores = scores.loc[pmids]

In [None]:
dom_links = ontology.compute_cooccurrences(act_bin, scores)

In [None]:
n_iter = 10000
dom_links_null = ontology.compute_cooccurrences_null(act_bin, scores, 
                                                     n_iter=n_iter, verbose=True)

In [None]:
p = pd.DataFrame(index=act_bin.columns, columns=scores.columns)
for i, struct in enumerate(act_bin.columns):
    for j, dom in enumerate(scores.columns):
        obs = dom_links.values[i,j]
        null = dom_links_null[i,j,:]
        p.loc[struct, dom] = np.sum(null > obs) / float(n_iter)

In [None]:
fdr = multipletests(p.values.ravel(), method="fdr_bh")[1]
fdr = pd.DataFrame(fdr.reshape(p.shape), 
                   index=act_bin.columns, columns=scores.columns)

In [None]:
dom_links_thres = dom_links[fdr < 0.01]
dom_links_thres = dom_links_thres.fillna(0.0)

In [None]:
dom_links_thres.to_csv("circuits/circuits_dsm.csv")

## Map PPMI-weighted links

In [None]:
atlas = utilities.load_atlas()

In [None]:
purples = utilities.make_cmap([(1,1,1), (0.365,0,0.878)])
chartreuses = utilities.make_cmap([(1,1,1), (0.345,0.769,0)])
magentas = utilities.make_cmap([(1,1,1), (0.620,0,0.686)])
yellows = utilities.make_cmap([(1,1,1), (0.937,0.749,0)])
browns = utilities.make_cmap([(1,1,1), (0.82,0.502,0)])
cmaps = [purples, chartreuses, "Oranges", "Blues", 
         "Reds", magentas, yellows, "Greens", browns]

In [None]:
utilities.map_plane(dom_links_thres, atlas, "figures/circuits/dsm", suffix="_z", 
                    cmaps=cmaps, plane="z", cbar=True, vmin=0.0, vmax=0.4,
                    verbose=True, print_fig=True, annotate=True)

In [None]:
for plane in ["x", "y"]:
    utilities.map_plane(dom_links_thres, atlas, "figures/circuits/dsm", suffix="_"+plane, 
                        cmaps=cmaps, plane=plane, cbar=True, vmin=0.0, vmax=0.4,
                        verbose=False, print_fig=False, annotate=True)