In [1]:
import sys
sys.path.append("..")
import hierarchy, style, utilities
from ontology import ontology

# Load the data

In [5]:
# Cerebellum option forwarded to the coordinate and ontology loaders below
# NOTE(review): the meaning of "seg" is defined by the project loaders — confirm there
cerebellum = "seg"

## Document-term matrix

In [2]:
# Load the "cogneuro" lexicon and display how many entries it contains
lexicon = utilities.load_lexicon(["cogneuro"])
len(lexicon)

2208

In [3]:
# Version tag of the document-term matrix to load
version = 190325
dtm_bin = utilities.load_doc_term_matrix(version=version, binarize=True)

# Keep only the lexicon entries that are also columns of the matrix, sorted
lexicon = sorted(set(lexicon) & set(dtm_bin.columns))
len(lexicon)

1683

In [4]:
# Restrict the document-term matrix to the filtered lexicon and report its shape
dtm_bin = dtm_bin[lexicon]
print("Document N={}, Term N={}".format(*dtm_bin.shape))

Document N=18155, Term N=1683


## Brain activation coordinates

In [6]:
# Load the activation matrix for the configured cerebellum option and report its shape
act_bin = utilities.load_coordinates(cerebellum=cerebellum)
print("Document N={}, Structure N={}".format(*act_bin.shape))

Document N=18155, Structure N=148


In [7]:
# Column labels of the activation matrix, kept as a plain list
structures = act_bin.columns.tolist()

## Data-driven domains

In [8]:
from collections import OrderedDict

In [9]:
# Number of domains (k) at each level of the hierarchy
domain_range = [6, 12, 24]
# Step in k from each level to the next one
domain_difs = [upper - lower for lower, upper in zip(domain_range[:-1], domain_range[1:])]
domain_difs

[6, 12]

In [10]:
def name_domains(lists, dtm):
    """Pick a unique upper-case name for every cluster in `lists`.

    Each cluster is named after its own term with the highest degree
    centrality (as returned by `ontology.term_degree_centrality`). Clusters
    are visited in ascending order; when a cluster's best remaining term is
    already the name of another cluster, that term is discarded for this
    cluster and its next-best term is tried on the following pass.

    Parameters
    ----------
    lists : DataFrame with "CLUSTER" and "TOKEN" columns. Cluster numbers are
        expected to run from 1 to the number of distinct clusters.
    dtm : document-term matrix passed through to the centrality helper,
        together with its index.

    Returns
    -------
    list of str, one name per cluster, where position i holds the name of
    cluster i+1.

    Raises
    ------
    ValueError if a cluster has no candidate term left to be named after.
    """

    k = len(set(lists["CLUSTER"]))
    names = [""] * k

    # Sorted so that the candidate order does not depend on set iteration order
    terms = {i: sorted(set(lists.loc[lists["CLUSTER"] == i + 1, "TOKEN"])) for i in range(k)}

    # The centrality inputs never change inside the loop, so compute them once
    centrality = {i: ontology.term_degree_centrality(i + 1, lists, dtm, dtm.index)
                  for i in range(k)}

    while "" in names:
        for i in range(k):

            # A cluster that already has a name keeps it
            if names[i]:
                continue

            if not terms[i]:
                raise ValueError(
                    "Cluster {} has no candidate term left for a unique name".format(i + 1))

            degrees = centrality[i].loc[terms[i]].sort_values(ascending=False)
            name = degrees.index[0].upper()

            if name not in names:
                names[i] = name
            else:
                # The name is taken: always drop the candidate so the loop makes
                # progress (previously it stalled forever when the holder's
                # recorded degree was 0). Compared case-insensitively so the
                # offending token is removed whatever its casing.
                terms[i] = [term for term in terms[i] if term.upper() != name]

    return names

In [21]:
# Location and file suffix of the ontology files
path = "../ontology/"
suffix = "_logreg"
# Term lists and circuits for every level, keyed by k
domains = {}

for k in domain_range:

    lists, circuit = ontology.load_ontology(k, path=path, suffix=suffix, cerebellum=cerebellum)

    # Name the clusters and rewrite the files unless both tables already have a DOMAIN column
    if not ("DOMAIN" in lists.columns and "DOMAIN" in circuit.columns):
        names = name_domains(lists, dtm_bin)

        # Cluster numbers are 1-based; names is 0-based
        lists["DOMAIN"] = [names[cluster - 1] for cluster in lists["CLUSTER"]]
        lists = lists[["DOMAIN", "CLUSTER", "TOKEN", "R"]]
        lists.to_csv("{}lists/lists_k{:02d}_oplen{}.csv".format(path, k, suffix), index=None)

        circuit["DOMAIN"] = [names[cluster - 1] for cluster in circuit["CLUSTER"]]
        circuit = circuit[["DOMAIN", "CLUSTER", "STRUCTURE"]]
        circuit.to_csv("{}circuits/circuits_k{:02d}.csv".format(path, k), index=None)

    domains[k] = dict(lists=lists, circuit=circuit)

# Re-index k=6 domains

In [22]:
# Distinct k=6 domain names, in order of first appearance in the term lists
list(OrderedDict.fromkeys(domains[6]["lists"]["DOMAIN"]))

['HEARING', 'VISION', 'MOVEMENT', 'REWARD', 'MEMORY', 'COGNITIVE']

In [23]:
# Maps each current k=6 cluster number to its new number
old2new = {1:6, 2:5, 3:4, 4:2, 5:1, 6:3}

In [24]:
# Renumber the k=6 clusters in the term lists and order the rows by the new numbers.
# NOTE(review): not idempotent — running this cell twice applies old2new twice.
domains[6]["lists"] = (
    domains[6]["lists"]
    .assign(CLUSTER=[old2new[old] for old in domains[6]["lists"]["CLUSTER"]])
    .sort_values("CLUSTER")
)
list(OrderedDict.fromkeys(domains[6]["lists"]["DOMAIN"]))

['MEMORY', 'REWARD', 'COGNITIVE', 'MOVEMENT', 'VISION', 'HEARING']

In [25]:
# Renumber the k=6 clusters in the circuit table and order the rows by the new numbers.
# NOTE(review): not idempotent — running this cell twice applies old2new twice.
domains[6]["circuit"] = (
    domains[6]["circuit"]
    .assign(CLUSTER=[old2new[old] for old in domains[6]["circuit"]["CLUSTER"]])
    .sort_values("CLUSTER")
)
list(OrderedDict.fromkeys(domains[6]["circuit"]["DOMAIN"]))

['MEMORY', 'REWARD', 'COGNITIVE', 'MOVEMENT', 'VISION', 'HEARING']

# Compute domain similarity

## Observed values

Similarity between domains at successive levels of the hierarchy: <i>k</i> = 6 and 12, then 12 and 24.

In [26]:
import pandas as pd
import numpy as np
np.random.seed(42)
from scipy.spatial.distance import dice, cdist

In [27]:
def load_system(k, domain, terms, structures):
    """Build a 0/1 membership vector for one domain at level k.

    The vector is indexed by `terms` followed by `structures`. Entries are 1
    for the tokens and structures that the level-k term lists and circuit
    table (read from the global `domains`) assign to `domain`, and 0 elsewhere.
    """

    term_table = domains[k]["lists"]
    structure_table = domains[k]["circuit"]

    # Every label that belongs to the requested domain
    members = list(term_table.loc[term_table["DOMAIN"] == domain, "TOKEN"])
    members += list(structure_table.loc[structure_table["DOMAIN"] == domain, "STRUCTURE"])

    systems = pd.Series(0, index=terms + structures)
    systems.loc[members] = 1

    return systems

In [28]:
def compute_sim_obs(k, dw):
    """Observed similarity between each level-k domain and each level-(k+dw) domain.

    Every domain is turned into a 0/1 vector over the union of both levels'
    tokens plus the global `structures`; similarity is 1 minus the Dice
    dissimilarity of two such vectors.

    Returns a DataFrame with level-k domain names as rows and level-(k+dw)
    domain names as columns.
    """

    lists_i = domains[k]["lists"]
    lists_j = domains[k+dw]["lists"]

    # Domain names in order of first appearance
    domains_ki = list(OrderedDict.fromkeys(lists_i["DOMAIN"]))
    domains_kj = list(OrderedDict.fromkeys(lists_j["DOMAIN"]))

    terms = sorted(set(lists_i["TOKEN"]).union(lists_j["TOKEN"]))
    systems_ki = {name: load_system(k, name, terms, structures) for name in domains_ki}
    systems_kj = {name: load_system(k+dw, name, terms, structures) for name in domains_kj}

    sims = pd.DataFrame(index=domains_ki, columns=domains_kj)
    for di, system_i in systems_ki.items():
        for dj, system_j in systems_kj.items():
            sims.loc[di, dj] = 1.0 - dice(system_i, system_j)

    return sims

In [29]:
# Observed similarities, keyed by the lower k of each pair of adjacent levels
sims = {k_low: compute_sim_obs(k_low, step)
        for k_low, step in zip(domain_range[:-1], domain_difs)}
sims[6]

Unnamed: 0,HEARING,VISION,EXECUTION,COGNITIVE,LANGUAGE,REST,MANIPULATION,EMOTION,VESTIBULAR,MOVEMENT,REWARD,MEMORY
MEMORY,0.0,0.235294,0.0,0.159091,0.025974,0.0253165,0.0,0.517241,0.0,0.0,0.0327869,0.666667
REWARD,0.0,0.0,0.0,0.253521,0.0,0.0,0.0,0.097561,0.0,0.0,0.5,0.0
COGNITIVE,0.025641,0.0217391,0.0,0.8,0.0,0.0,0.153846,0.0615385,0.0,0.0,0.294118,0.0263158
MOVEMENT,0.0,0.0,0.392857,0.0,0.0,0.450704,0.0263158,0.0,0.222222,0.458333,0.0,0.0
VISION,0.0,0.465517,0.168421,0.0672269,0.0,0.0363636,0.608696,0.0224719,0.0,0.0229885,0.0,0.06
HEARING,0.576923,0.0,0.0,0.0,0.482759,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Null distributions

In [30]:
def compute_sim_null(k, dw, n_iter=1000):
    """Null distribution of similarities between level-k and level-(k+dw) domains.

    On each of `n_iter` iterations the term/structure labels are drawn in a
    random order (numpy global RNG) and every level-(k+dw) domain vector is
    read out in that order before being compared with each level-k vector.

    Returns an array of shape (k, k+dw, n_iter) holding 1 minus the Dice
    dissimilarity for every domain pair and iteration.
    """

    print("Processing k={}".format(k))

    lists_i = domains[k]["lists"]
    lists_j = domains[k+dw]["lists"]

    # Domain names in order of first appearance
    domains_ki = list(OrderedDict.fromkeys(lists_i["DOMAIN"]))
    domains_kj = list(OrderedDict.fromkeys(lists_j["DOMAIN"]))

    terms = sorted(set(lists_i["TOKEN"]).union(lists_j["TOKEN"]))
    labels = terms + structures
    systems_ki = {name: load_system(k, name, terms, structures) for name in domains_ki}
    systems_kj = {name: load_system(k+dw, name, terms, structures) for name in domains_kj}

    sims_null_k = np.empty((k, k+dw, n_iter))
    for n in range(n_iter):
        # One random ordering of the labels per iteration, shared by all pairs
        null = np.random.choice(labels, size=len(labels), replace=False)
        for j, dj in enumerate(domains_kj):
            shuffled = systems_kj[dj].loc[null]
            for i, di in enumerate(domains_ki):
                sims_null_k[i, j, n] = 1.0 - dice(systems_ki[di], shuffled)

    return sims_null_k

In [31]:
# Null similarity arrays, keyed by the lower k of each pair of adjacent levels
sims_null = {k_low: compute_sim_null(k_low, step)
             for k_low, step in zip(domain_range[:-1], domain_difs)}

Processing k=6
Processing k=12


## False discovery rates

In [32]:
from statsmodels.stats.multitest import multipletests

In [33]:
def compute_sim_fdr(sims, sims_null):
    """Benjamini-Hochberg adjusted p-values for the observed similarities.

    Parameters
    ----------
    sims : DataFrame of observed similarities (may be object dtype).
    sims_null : array whose first two axes line up with the rows and columns
        of `sims` and whose third axis indexes the null iterations.

    Returns
    -------
    Float DataFrame with the same row and column labels as `sims`, holding
    the values returned by `multipletests(..., method="fdr_bh")`.
    """

    n_iter = sims_null.shape[2]

    domains_ki = sims.index
    domains_kj = sims.columns

    # Work on plain float arrays so that no object-dtype values reach multipletests
    observed = sims.values.astype(float)
    n_rows, n_cols = observed.shape

    # p-value: fraction of null iterations strictly greater than the observed value.
    # NOTE(review): the strict ">" ignores ties and allows p = 0; confirm that
    # this is intended before changing it, since it affects every downstream result.
    exceeds = sims_null[:n_rows, :n_cols, :] > observed[:, :, np.newaxis]
    pvals = exceeds.sum(axis=2) / float(n_iter)

    fdrs = multipletests(pvals.ravel(), method="fdr_bh")[1]
    fdrs = np.reshape(np.asarray(fdrs, dtype=float), pvals.shape)
    fdrs = pd.DataFrame(fdrs, index=domains_ki, columns=domains_kj)

    return fdrs

In [34]:
# Adjusted p-values, keyed by the lower k of each pair of adjacent levels
fdrs = {k_low: compute_sim_fdr(sims[k_low], sims_null[k_low])
        for k_low in domain_range[:-1]}
fdrs[6]

Unnamed: 0,HEARING,VISION,EXECUTION,COGNITIVE,LANGUAGE,REST,MANIPULATION,EMOTION,VESTIBULAR,MOVEMENT,REWARD,MEMORY
MEMORY,1,0.1584,1.0,1.0,1,1,1,0.0,1.0,1,1,0
REWARD,1,1.0,1.0,0.0308571,1,1,1,0.609882,1.0,1,0,1
COGNITIVE,1,1.0,1.0,0.0,1,1,1,1.0,1.0,1,0,1
MOVEMENT,1,1.0,0.0,1.0,1,0,1,1.0,0.0166154,0,1,1
VISION,1,0.0,0.3015,1.0,1,1,0,1.0,1.0,1,1,1
HEARING,0,1.0,1.0,1.0,0,1,1,1.0,1.0,1,1,1


# Build the hierarchy

In [35]:
import networkx as nx
from style import style

In [36]:
# Give every domain at every level a sequential integer id (starting at 1) and
# record lookups in both directions, plus the figure path stem for each node.
# The loop names avoid `id` (a builtin) and `image` (imported from nilearn in a
# later cell), so re-running this cell cannot break the plotting cells.
node_id = 1
id2node, id2k, id2image = {}, {}, {}
node2id = {k: {} for k in domain_range}
node_records = []
for k in domain_range:
    names = list(OrderedDict.fromkeys(domains[k]["lists"]["DOMAIN"]))
    for domain in names:
        image_path = "figures/k{:02d}/{}".format(k, domain)
        node2id[k][domain] = node_id
        id2k[node_id] = k
        id2node[node_id] = domain
        id2image[node_id] = image_path
        node_records.append({"node": node_id,
                             "name": domain,
                             "image": image_path})
        node_id += 1

# Build the table once: DataFrame.append no longer exists in pandas 2.0+
nodes = pd.DataFrame(node_records, columns=["node", "name", "image"])
nodes.head(10)

Unnamed: 0,node,name,image
0,1,MEMORY,figures/k06/MEMORY
1,2,REWARD,figures/k06/REWARD
2,3,COGNITIVE,figures/k06/COGNITIVE
3,4,MOVEMENT,figures/k06/MOVEMENT
4,5,VISION,figures/k06/VISION
5,6,HEARING,figures/k06/HEARING
6,7,HEARING,figures/k12/HEARING
7,8,VISION,figures/k12/VISION
8,9,EXECUTION,figures/k12/EXECUTION
9,10,COGNITIVE,figures/k12/COGNITIVE


In [37]:
# Connect domains at adjacent levels. An edge source -> target is kept only when
# the source is the most similar lower-level domain for that target, the
# similarity is positive, and the adjusted p-value is below 0.001.
columns = ["source", "source_name", "target", "target_name", "weight"]
edge_records = []
for k, dw in zip(domain_range[:-1], domain_difs):
    # Highest similarity in each target column, computed once per level
    best = {target: max(sims[k][target]) for target in sims[k].columns}
    for source in sims[k].index:
        for target in sims[k].columns:
            weight = sims[k].loc[source, target]
            fdr = fdrs[k].loc[source, target]
            if weight == best[target] and weight > 0 and fdr < 0.001:
                edge_records.append({"source": node2id[k][source],
                                     "source_name": source,
                                     "target": node2id[k+dw][target],
                                     "target_name": target,
                                     "weight": weight})

# Build the table once: DataFrame.append no longer exists in pandas 2.0+
edges = pd.DataFrame(edge_records, columns=columns)
edges.head(10)

Unnamed: 0,source,source_name,target,target_name,weight
0,1,MEMORY,14,EMOTION,0.517241
1,1,MEMORY,18,MEMORY,0.666667
2,2,REWARD,17,REWARD,0.5
3,3,COGNITIVE,10,COGNITIVE,0.8
4,4,MOVEMENT,9,EXECUTION,0.392857
5,4,MOVEMENT,12,REST,0.450704
6,4,MOVEMENT,16,MOVEMENT,0.458333
7,5,VISION,8,VISION,0.465517
8,5,VISION,13,MANIPULATION,0.608696
9,6,HEARING,7,HEARING,0.576923


In [38]:
# Directed graph of the retained edges, with the similarity stored as "weight"
G = nx.from_pandas_edgelist(edges, source="source", target="target",
                            edge_attr="weight", create_using=nx.DiGraph())

In [39]:
# One breadth-first tree per k=6 root domain; every node is annotated with its
# name, figure path, the root's palette colour and the weight of its incoming edge.
trees = []
for i in range(1, domain_range[0]+1):
    g = nx.bfs_tree(G, i)
    for node_id in g.nodes:
        # Weight of the first edge pointing at this node; 0 if there is none
        incoming = edges.loc[edges["target"] == node_id, "weight"].values
        weight = float(incoming[0]) if len(incoming) > 0 else 0
        g.nodes[node_id].update(name=id2node[node_id],
                                image=id2image[node_id],
                                color=style.palettes["data-driven"][i-1],
                                weight=weight)
    trees.append((g, i))
tree = nx.join(trees)

In [40]:
# Convert the joined tree to nested dicts rooted at node 0, wrapped in a list.
# NOTE(review): this rebinds `hierarchy`, hiding the `hierarchy` module imported in the first cell.
hierarchy = [nx.readwrite.json_graph.tree_data(tree, root=0)]
hierarchy[0]

{'id': 0,
 'children': [{'name': 'MEMORY',
   'image': 'figures/k06/MEMORY',
   'color': '#7597D0',
   'weight': 0,
   'id': 1,
   'children': [{'name': 'EMOTION',
     'image': 'figures/k12/EMOTION',
     'color': '#7597D0',
     'weight': 0.5172413793103448,
     'id': 2,
     'children': [{'name': 'VALENCE',
       'image': 'figures/k24/VALENCE',
       'color': '#7597D0',
       'weight': 1.0,
       'id': 4}]},
    {'name': 'MEMORY',
     'image': 'figures/k12/MEMORY',
     'color': '#7597D0',
     'weight': 0.6666666666666667,
     'id': 3,
     'children': [{'name': 'MEMORY',
       'image': 'figures/k24/MEMORY',
       'color': '#7597D0',
       'weight': 0.9811320754716981,
       'id': 5},
      {'name': 'EPISODIC_MEMORY',
       'image': 'figures/k24/EPISODIC_MEMORY',
       'color': '#7597D0',
       'weight': 0.43137254901960786,
       'id': 6}]}]},
  {'name': 'REWARD',
   'image': 'figures/k06/REWARD',
   'color': '#B07EB6',
   'weight': 0,
   'id': 7,
   'children': [{'

In [41]:
# Serialize with the json module rather than swapping quote characters in the
# Python repr: the repr is not valid JSON if any value contains an apostrophe
# or is None/True/False.
import json

text = json.dumps(hierarchy, ensure_ascii=False)
with open("hierarchy.json", "w") as outfile:
    outfile.write(text)

# Plot node images

In [42]:
import os
from nilearn import image, plotting
%matplotlib inline



## Brain slices

In [43]:
# Use the notebook-wide cerebellum setting so that the atlas always matches the
# coordinates and ontology loaded above
atlas = utilities.load_atlas(cerebellum=cerebellum)

In [44]:
# Custom colormaps built from two RGB tuples each
# NOTE(review): presumably a gradient from the first colour to the second — confirm in style.make_cmap
purples = style.make_cmap([(1,1,1), (0.365,0,0.878)])
magentas = style.make_cmap([(1,1,1), (0.620,0,0.686)])
yellows = style.make_cmap([(1,1,1), (0.937,0.749,0)])
# Colormaps looked up by the colour index stored in node2color_id
cmaps = ["Blues", magentas, yellows, "Greens", "Reds", purples]

In [45]:
# For every level, map each domain reachable from a k=6 root to that root's
# 0-based colour index
node2color_id = {k: {} for k in domain_range}
for i in range(1, domain_range[0]+1):
    g = nx.bfs_tree(G, i)
    for node_id in g.nodes:
        node2color_id[id2k[node_id]][id2node[node_id]] = i - 1

In [46]:
def map_domain(data, domain, atlas, path, cmap=cmaps[0], suffix="", plane="z", 
               cbar=False, annotate=False, vmin=None, vmax=None, print_fig=True):
    """Paint per-region values onto the atlas, plot one slice and save it as a PNG.

    Parameters
    ----------
    data : iterable of values; the value at position i is written to every
        voxel whose atlas label is i+1.
    domain : used, with `suffix`, to build the output file name.
    atlas : labeled image accepted by nilearn's `image.copy_img`.
    path : directory the PNG is written to.
    cmap, plane, cbar, annotate, vmax : passed to `plotting.plot_stat_map`.
    vmin : passed to `plotting.plot_stat_map` as its `threshold`.
    print_fig : when true, `plotting.show()` is called before saving.
    """

    # Read the region labels once as floats and never modify them. Painting
    # into a separate array means a value that happens to equal a later label
    # is not repainted, and fractional values are not truncated if the atlas
    # is stored with an integer dtype. Voxels whose label has no entry in
    # `data` keep their label value.
    labels = image.copy_img(atlas).get_fdata()
    stat_map = labels.copy()
    for i, value in enumerate(data):
        stat_map[labels == i+1] = value
    stat_map = image.new_img_like(atlas, stat_map)
    display = plotting.plot_stat_map(stat_map,
                                     cut_coords=1,
                                     display_mode=plane, 
                                     symmetric_cbar=False, colorbar=cbar,
                                     cmap=cmap, threshold=vmin, 
                                     vmax=vmax, alpha=0.5,
                                     annotate=annotate, draw_cross=False)
    if print_fig:
        plotting.show()
    file_name = "{}/{}{}.png".format(path, domain, suffix)
    display.savefig(file_name, dpi=250)
    utilities.transparent_background(file_name)
    display.close()

In [47]:
# Save one brain-slice figure per domain that belongs to a tree, coloured by its root.
# The per-domain selection is named `domain_structures` so that the global
# `structures` list (read by compute_sim_obs / compute_sim_null) is not overwritten.
for k in domain_range:
    path = "figures/k{:02}".format(k)
    os.makedirs(path, exist_ok=True)
    circuit = domains[k]["circuit"]
    names = list(OrderedDict.fromkeys(circuit["DOMAIN"]))
    circuit_mat = pd.DataFrame(0.0, index=act_bin.columns, columns=names)
    for name in names:
        if name in node2color_id[k]:
            domain_structures = circuit.loc[circuit["DOMAIN"] == name, "STRUCTURE"]
            for structure in domain_structures:
                circuit_mat.loc[structure, name] = 1.0
            map_domain(circuit_mat[name], name, atlas, path, 
                       cmap=cmaps[node2color_id[k][name]], suffix="_z", plane="z", cbar=False, 
                       vmin=0.0, vmax=2.0, print_fig=False, annotate=False)

  fraction * (x1 - x0), y1 - y0])


## Word clouds

In [48]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [49]:
def plot_wordclouds(k, domains, lists, dtm, colors, path="", suffix="", font=style.font):
    """Save one single-colour word cloud PNG per domain.

    For each domain, its tokens (from `lists`) are weighted by the column sums
    of `dtm`, underscores in tokens are shown as spaces, and every word is
    drawn in the colour at the same position in `colors`. Files are written to
    "{path}figures/k{k:02d}/{domain}_words{suffix}.png" and then passed to
    `utilities.transparent_background`.
    """

    for dom_idx, dom in enumerate(domains):

        def color_func(word, font_size, position, orientation, 
                       random_state=None, idx=0, **kwargs):
            # Same colour for every word of the current domain
            return colors[dom_idx]

        raw_tokens = lists.loc[lists["DOMAIN"] == dom, "TOKEN"]
        counts = dtm[raw_tokens].sum().values
        labels = [token.replace("_", " ") for token in raw_tokens]
        frequencies = dict(zip(labels, counts))

        cloud = WordCloud(background_color="rgba(255, 255, 255, 0)", mode="RGB", 
                          max_font_size=100, prefer_horizontal=1, scale=20, margin=3,
                          width=600, height=300, font_path=font, 
                          random_state=42)
        cloud = cloud.generate_from_frequencies(frequencies)

        plt.figure(1, figsize=(2,10))
        plt.axis("off")
        plt.imshow(cloud.recolor(color_func=color_func, random_state=42))
        file_name = "{}figures/k{:02d}/{}_words{}.png".format(path, k, dom, suffix)
        plt.savefig(file_name, dpi=800, bbox_inches="tight")
        utilities.transparent_background(file_name)
        plt.close()

In [52]:
# Word clouds for every domain that belongs to a tree, coloured by its root.
# The names are kept in `domain_names` so the `nodes` table built earlier is not overwritten.
for k in domain_range:
    domain_names = list(node2color_id[k].keys())
    colors = [style.palettes["data-driven"][node2color_id[k][name]] for name in domain_names]
    plot_wordclouds(k, domain_names, domains[k]["lists"], dtm_bin, colors, font="../style/Avenir.ttf", suffix="_viewer")