In [1]:
import sys
sys.path.append("..")
import style, utilities
from ontology import ontology

# Load the data

## Document-term matrix

In [2]:
# Seed vocabulary: terms from the "cogneuro" lexicon source.
lexicon_sources = ["cogneuro"]
lexicon = utilities.load_lexicon(lexicon_sources)
len(lexicon)

2208

In [3]:
# Load the binarized document-term matrix for this corpus version and
# keep only the lexicon terms that actually occur as matrix columns.
version = 190325
dtm_bin = utilities.load_doc_term_matrix(version=version, binarize=True)
shared_terms = set(lexicon) & set(dtm_bin.columns)
lexicon = sorted(shared_terms)
len(lexicon)

1683

In [4]:
# Restrict the matrix to the lexicon columns and report its dimensions.
dtm_bin = dtm_bin[lexicon]
n_docs, n_terms = dtm_bin.shape
print("Document N={}, Term N={}".format(n_docs, n_terms))

Document N=18155, Term N=1683


## Brain activation coordinates

In [5]:
# Load the document-by-structure activation matrix and report its dimensions.
act_bin = utilities.load_coordinates()
n_act_docs, n_structures = act_bin.shape
print("Document N={}, Structure N={}".format(n_act_docs, n_structures))

Document N=18155, Structure N=118


In [6]:
# Column labels of the activation matrix, used later as the structure half
# of each domain's membership vector.
structures = act_bin.columns.tolist()

## Data-driven domains

In [7]:
from collections import OrderedDict
import pandas as pd

In [8]:
# Levels of k to compare, and the step from each level to the next.
domain_range = [6, 12, 24]
domain_difs = [upper - lower for lower, upper in zip(domain_range[:-1], domain_range[1:])]
domain_difs

[6, 12]

In [9]:
# Load the word lists and circuits of the ontology at every level of k.
path, suffix = "../ontology/", "_logreg"
domains = {}

for k in domain_range:
    lists, circuit = ontology.load_ontology(k, path=path, suffix=suffix)
    domains[k] = dict(lists=lists, circuit=circuit)

# Re-index k=6 domains

In [10]:
# Domain names at k=6 in order of first appearance (before re-indexing).
k6_domain_column = domains[6]["lists"]["DOMAIN"]
list(OrderedDict.fromkeys(k6_domain_column))

['VISION', 'LANGUAGE', 'MANIPULATION', 'REWARD', 'COGNITION', 'MEMORY']

In [11]:
# Mapping from the original k=6 cluster index (1..6) to its new position.
old2new = dict(zip(range(1, 7), [4, 6, 5, 2, 3, 1]))

In [12]:
# Relabel the k=6 word-list clusters and sort the rows by the new labels.
# NOTE: not idempotent -- re-running this cell applies old2new a second time
# to labels that were already remapped; reload the ontology first.
k6_lists = domains[6]["lists"]
k6_lists["CLUSTER"] = list(map(old2new.__getitem__, k6_lists["CLUSTER"]))
domains[6]["lists"] = k6_lists.sort_values("CLUSTER")
list(OrderedDict.fromkeys(domains[6]["lists"]["DOMAIN"]))

['MEMORY', 'REWARD', 'COGNITION', 'VISION', 'MANIPULATION', 'LANGUAGE']

In [13]:
# Apply the same cluster relabelling to the k=6 circuit table.
# NOTE: not idempotent -- re-running this cell applies old2new a second time
# to labels that were already remapped; reload the ontology first.
k6_circuit = domains[6]["circuit"]
k6_circuit["CLUSTER"] = list(map(old2new.__getitem__, k6_circuit["CLUSTER"]))
domains[6]["circuit"] = k6_circuit.sort_values("CLUSTER")
list(OrderedDict.fromkeys(domains[6]["circuit"]["DOMAIN"]))

['MEMORY', 'REWARD', 'COGNITION', 'VISION', 'MANIPULATION', 'LANGUAGE']

# Compute domain similarity

## Observed values

Similarity between domains at successive levels of <i>k</i> (6 and 12, then 12 and 24).

In [14]:
import numpy as np
np.random.seed(42)
from scipy.spatial.distance import dice, cdist

In [15]:
def load_system(k, domain, terms, structures):
    """Return a 0/1 membership vector for one domain at level ``k``.

    The vector is indexed by ``terms + structures``; entries are set to 1 for
    the tokens listed under ``domain`` in ``domains[k]["lists"]`` and for the
    structures listed under ``domain`` in ``domains[k]["circuit"]``.

    Reads the notebook-level ``domains`` dictionary.
    """
    level = domains[k]
    lists, circuit = level["lists"], level["circuit"]

    domain_tokens = lists.loc[lists["DOMAIN"] == domain, "TOKEN"]
    domain_structures = circuit.loc[circuit["DOMAIN"] == domain, "STRUCTURE"]

    systems = pd.Series(0, index=terms+structures)
    for members in (domain_tokens, domain_structures):
        systems.loc[list(members)] = 1

    return systems

In [16]:
def compute_sim_obs(k, dw):
    """Compute observed Dice similarity between domains at ``k`` and ``k+dw``.

    Each domain is represented by its 0/1 membership vector over the union of
    the terms of both levels plus the brain structures (see ``load_system``).

    Parameters
    ----------
    k : int
        Lower level of the comparison (a key of ``domains``).
    dw : int
        Step to the higher level; ``k+dw`` must also be a key of ``domains``.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame with level-``k`` domains as rows and
        level-``k+dw`` domains as columns, holding ``1 - dice distance``.
    """
    domains_ki = list(OrderedDict.fromkeys(domains[k]["lists"]["DOMAIN"]))
    domains_kj = list(OrderedDict.fromkeys(domains[k+dw]["lists"]["DOMAIN"]))

    terms = sorted(set(domains[k]["lists"]["TOKEN"]).union(domains[k+dw]["lists"]["TOKEN"]))
    systems_ki = {domain: load_system(k, domain, terms, structures) for domain in domains_ki}
    systems_kj = {domain: load_system(k+dw, domain, terms, structures) for domain in domains_kj}

    # Allocate as float: an empty DataFrame defaults to object dtype, which
    # would leave the similarities stored as generic Python objects.
    sims = pd.DataFrame(index=domains_ki, columns=domains_kj, dtype=float)
    for di in domains_ki:
        for dj in domains_kj:
            sims.loc[di, dj] = 1.0 - dice(systems_ki[di], systems_kj[dj])

    return sims

In [17]:
# Observed similarity matrices, keyed by the lower level of each comparison.
sims = {}
for k, dw in zip(domain_range[:-1], domain_difs):
    sims[k] = compute_sim_obs(k, dw)
sims[6]

Unnamed: 0,HEARING,REACTION_TIME,MEMORY,EPISODIC_MEMORY,LANGUAGE,VISION,EXECUTION,MANIPULATION,REWARD,ANTICIPATION,REPRESENTATION,EMOTION
MEMORY,0.0,0.109589,0.666667,0.285714,0.0,0.0983607,0.0,0.030303,0.0344828,0.0,0.0,0.588235
REWARD,0.0,0.271186,0.0,0.0,0.0,0.0,0.0,0.0,0.409091,0.722222,0.0,0.0
COGNITION,0.0,0.592593,0.103896,0.128205,0.03125,0.0,0.0,0.378378,0.0909091,0.0344828,0.0555556,0.135593
VISION,0.0,0.0,0.24,0.421053,0.0322581,0.686567,0.0,0.0833333,0.0,0.0,0.0,0.0350877
MANIPULATION,0.0,0.075,0.0,0.0779221,0.0,0.0,0.327273,0.219178,0.0,0.0,0.732394,0.0
LANGUAGE,0.636364,0.0,0.0,0.0,0.603175,0.0,0.0,0.164384,0.0,0.0,0.0,0.0


## Null distributions

In [18]:
def compute_sim_null(k, dw, n_iter=1000):
    """Build a permutation null distribution of domain similarities.

    On each iteration the labels (terms + structures) are shuffled, the
    level-``k+dw`` membership vectors are reordered by that shuffle, and the
    Dice similarity to every unshuffled level-``k`` vector is recorded.

    Parameters
    ----------
    k : int
        Lower level of the comparison (a key of ``domains``).
    dw : int
        Step to the higher level; ``k+dw`` must also be a key of ``domains``.
    n_iter : int, optional
        Number of permutations.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n domains at k, n domains at k+dw, n_iter)``.

    Notes
    -----
    Draws from the global NumPy random state, so results depend on the seed
    set earlier in the notebook.
    """
    print("Processing k={}".format(k))

    domains_ki = list(OrderedDict.fromkeys(domains[k]["lists"]["DOMAIN"]))
    domains_kj = list(OrderedDict.fromkeys(domains[k+dw]["lists"]["DOMAIN"]))

    terms = sorted(set(domains[k]["lists"]["TOKEN"]).union(domains[k+dw]["lists"]["TOKEN"]))
    systems_ki = {domain: load_system(k, domain, terms, structures) for domain in domains_ki}
    systems_kj = {domain: load_system(k+dw, domain, terms, structures) for domain in domains_kj}

    # Loop-invariant label list, built once instead of on every iteration.
    labels = terms + structures

    # Size the output by the domains actually present rather than by k and
    # k+dw, so no uninitialised np.empty cells remain if the counts differ.
    sims_null_k = np.empty((len(domains_ki), len(domains_kj), n_iter))
    for n in range(n_iter):
        null = np.random.choice(labels, size=len(labels), replace=False)
        # Permute each target vector once per iteration, not once per (i, j).
        permuted_kj = [systems_kj[dj].loc[null] for dj in domains_kj]
        for i, di in enumerate(domains_ki):
            for j, permuted in enumerate(permuted_kj):
                sims_null_k[i, j, n] = 1.0 - dice(systems_ki[di], permuted)

    return sims_null_k

In [19]:
# Null similarity distributions, keyed by the lower level of each comparison.
sims_null = {}
for k, dw in zip(domain_range[:-1], domain_difs):
    sims_null[k] = compute_sim_null(k, dw)

Processing k=6
Processing k=12


## False discovery rates

In [20]:
from statsmodels.stats.multitest import multipletests

In [21]:
def compute_sim_fdr(sims, sims_null):
    """Convert observed similarities into Benjamini-Hochberg adjusted p-values.

    Parameters
    ----------
    sims : pandas.DataFrame
        Observed similarities (rows and columns are domain names).
    sims_null : numpy.ndarray
        Null similarities of shape ``(rows, columns, n_iter)``, where the first
        two axes follow the row and column order of ``sims``.

    Returns
    -------
    pandas.DataFrame
        FDR-adjusted p-values with the same index and columns as ``sims``.

    Notes
    -----
    The p-value is the fraction of null values strictly greater than the
    observed value, so it can be exactly 0.
    NOTE(review): a ``(count + 1) / (n_iter + 1)`` estimate would avoid zero
    p-values; left unchanged here to preserve the reported results.
    """
    n_iter = sims_null.shape[2]

    domains_ki = sims.index
    domains_kj = sims.columns

    # Work on a float array so the p-values handed to multipletests are
    # numeric rather than object dtype.
    observed = sims.values.astype(float)
    null = sims_null[:observed.shape[0], :observed.shape[1], :]

    # Fraction of null draws exceeding the observed value, for all pairs at once.
    exceed_counts = np.sum(null > observed[:, :, np.newaxis], axis=2)
    pvals = exceed_counts / float(n_iter)

    fdrs = multipletests(pvals.ravel(), method="fdr_bh")[1]
    fdrs = np.reshape(fdrs, pvals.shape)
    fdrs = pd.DataFrame(fdrs, index=domains_ki, columns=domains_kj)

    return fdrs

In [22]:
# FDR-adjusted p-values, keyed by the lower level of each comparison.
fdrs = {}
for k in domain_range[:-1]:
    fdrs[k] = compute_sim_fdr(sims[k], sims_null[k])
fdrs[6]

Unnamed: 0,HEARING,REACTION_TIME,MEMORY,EPISODIC_MEMORY,LANGUAGE,VISION,EXECUTION,MANIPULATION,REWARD,ANTICIPATION,REPRESENTATION,EMOTION
MEMORY,1,1.0,0.0,0.0154286,1,1,1,1.0,1,1,1,0.0
REWARD,1,0.00553846,1.0,1.0,1,1,1,1.0,0,0,1,1.0
COGNITION,1,0.0,1.0,1.0,1,1,1,0.0,1,1,1,0.436235
VISION,1,1.0,0.168,0.0,1,0,1,1.0,1,1,1,1.0
MANIPULATION,1,1.0,1.0,1.0,1,1,0,0.1845,1,1,0,1.0
LANGUAGE,0,1.0,1.0,1.0,0,1,1,0.936,1,1,1,1.0


# Build the hierarchy

In [23]:
import networkx as nx
from style import style

In [24]:
# Give every domain at every level of k a unique integer ID and build the
# lookup tables used when assembling the hierarchy.
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
# `node_id` / `image_path` avoid shadowing the builtin `id` and colliding with
# the `image` module imported from nilearn further down.
id2node, id2k, id2image = {}, {}, {}
node2id = {k: {} for k in domain_range}
node_records = []
node_id = 1
for k in domain_range:
    names = list(OrderedDict.fromkeys(domains[k]["lists"]["DOMAIN"]))
    for domain in names:
        image_path = "figures/k{:02d}/{}".format(k, domain)
        node2id[k][domain] = node_id
        id2k[node_id] = k
        id2node[node_id] = domain
        id2image[node_id] = image_path
        node_records.append({"node": node_id,
                             "name": domain,
                             "image": image_path})
        node_id += 1
nodes = pd.DataFrame(node_records, columns=["node", "name", "image"])
nodes.head(10)

Unnamed: 0,node,name,image
0,1,MEMORY,figures/k06/MEMORY
1,2,REWARD,figures/k06/REWARD
2,3,COGNITION,figures/k06/COGNITION
3,4,VISION,figures/k06/VISION
4,5,MANIPULATION,figures/k06/MANIPULATION
5,6,LANGUAGE,figures/k06/LANGUAGE
6,7,HEARING,figures/k12/HEARING
7,8,REACTION_TIME,figures/k12/REACTION_TIME
8,9,MEMORY,figures/k12/MEMORY
9,10,EPISODIC_MEMORY,figures/k12/EPISODIC_MEMORY


In [25]:
# Link each domain at k+dw to the domain at k it is most similar to, keeping
# the link only if the similarity is positive and passes the FDR threshold.
# Rows are collected in a list and turned into a DataFrame once:
# DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
columns = ["source", "source_name", "target", "target_name", "weight"]
fdr_threshold = 0.001
edge_records = []
for k, dw in zip(domain_range[:-1], domain_difs):
    # Highest similarity per target column, computed once per level.
    best_weight = {target: max(sims[k][target]) for target in sims[k].columns}
    for source in sims[k].index:
        for target in sims[k].columns:
            weight = sims[k].loc[source, target]
            fdr = fdrs[k].loc[source, target]
            if weight == best_weight[target] and weight > 0 and fdr < fdr_threshold:
                edge_records.append({"source": node2id[k][source],
                                     "source_name": source,
                                     "target": node2id[k+dw][target],
                                     "target_name": target,
                                     "weight": weight})
edges = pd.DataFrame(edge_records, columns=columns)
edges.head(10)

Unnamed: 0,source,source_name,target,target_name,weight
0,1,MEMORY,9,MEMORY,0.666667
1,1,MEMORY,18,EMOTION,0.588235
2,2,REWARD,15,REWARD,0.409091
3,2,REWARD,16,ANTICIPATION,0.722222
4,3,COGNITION,8,REACTION_TIME,0.592593
5,3,COGNITION,14,MANIPULATION,0.378378
6,4,VISION,10,EPISODIC_MEMORY,0.421053
7,4,VISION,12,VISION,0.686567
8,5,MANIPULATION,13,EXECUTION,0.327273
9,5,MANIPULATION,17,REPRESENTATION,0.732394


In [26]:
# Directed graph of the selected links, with similarity stored as edge weight.
G = nx.from_pandas_edgelist(
    edges,
    source="source",
    target="target",
    edge_attr="weight",
    create_using=nx.DiGraph(),
)

In [27]:
# Build one breadth-first tree per k=6 root domain, annotate its nodes, then
# join the trees under a common root.
# Weight of the first edge pointing at each node, looked up once instead of
# rebuilding and scanning list(edges["target"]) for every node.
incoming_weight = {}
for target, weight in zip(edges["target"], edges["weight"]):
    incoming_weight.setdefault(target, float(weight))

trees = []
for i in range(1, domain_range[0]+1):
    g = nx.bfs_tree(G, i)
    # Use the public node view rather than the private `_node` mapping.
    for node_id in g.nodes:
        attrs = g.nodes[node_id]
        attrs["name"] = id2node[node_id]
        attrs["image"] = id2image[node_id]
        # Every node in a tree inherits the colour of its k=6 root.
        attrs["color"] = style.palettes["data-driven"][i-1]
        # Roots have no incoming edge and keep a weight of 0.
        attrs["weight"] = incoming_weight.get(node_id, 0)
    trees.append((g, i))
tree = nx.join(trees)

In [28]:
# Nested-dict representation of the joined tree, starting from node 0,
# wrapped in a single-element list.
tree_json = nx.readwrite.json_graph.tree_data(tree, root=0)
hierarchy = [tree_json]
hierarchy[0]

{'id': 0,
 'children': [{'name': 'MEMORY',
   'image': 'figures/k06/MEMORY',
   'color': '#5B81BD',
   'weight': 0,
   'id': 1,
   'children': [{'name': 'MEMORY',
     'image': 'figures/k12/MEMORY',
     'color': '#5B81BD',
     'weight': 0.6666666666666667,
     'id': 2,
     'children': [{'name': 'MEMORY',
       'image': 'figures/k24/MEMORY',
       'color': '#5B81BD',
       'weight': 0.5098039215686274,
       'id': 4},
      {'name': 'EPISODIC_MEMORY',
       'image': 'figures/k24/EPISODIC_MEMORY',
       'color': '#5B81BD',
       'weight': 0.7540983606557377,
       'id': 5}]},
    {'name': 'EMOTION',
     'image': 'figures/k12/EMOTION',
     'color': '#5B81BD',
     'weight': 0.5882352941176471,
     'id': 3,
     'children': [{'name': 'EMOTION',
       'image': 'figures/k24/EMOTION',
       'color': '#5B81BD',
       'weight': 0.5641025641025641,
       'id': 6},
      {'name': 'RECALL',
       'image': 'figures/k24/RECALL',
       'color': '#5B81BD',
       'weight': 0.60465

In [29]:
# Serialise with the json module rather than swapping quote characters in the
# Python repr: the repr trick yields invalid JSON as soon as a value contains
# an apostrophe or is None/True/False.
import json


def _to_builtin(value):
    """Convert NumPy-style scalars (anything exposing ``.item()``) for json."""
    if hasattr(value, "item"):
        return value.item()
    raise TypeError("Cannot serialise {!r} to JSON".format(value))


text = json.dumps(hierarchy, default=_to_builtin, ensure_ascii=False)
with open("hierarchy.json", "w") as outfile:
    outfile.write(text)

# Plot node images

In [30]:
import os
from nilearn import image, plotting
%matplotlib inline



## Brain slices

In [31]:
# Atlas image from the project helper; it is passed to map_domain() below,
# which copies it and overwrites its label values with per-structure scores.
atlas = utilities.load_atlas()

In [32]:
# One colour map per entry of the data-driven palette.
colors = style.palettes["data-driven"]
cmaps = list(map(style.make_cmap, colors))

In [33]:
# Map every domain reachable from a k=6 root to that root's palette index.
# Uses the public node view (not the private `_node` mapping) and avoids
# shadowing the builtin `id`.
node2color_id = {k: {} for k in domain_range}
for i in range(1, domain_range[0]+1):
    g = nx.bfs_tree(G, i)
    for node_id in g.nodes:
        node2color_id[id2k[node_id]][id2node[node_id]] = i-1

In [34]:
def map_domain(data, domain, atlas, path, cmap=cmaps[0], suffix="", plane="z", 
               cbar=False, annotate=False, vmin=None, vmax=None, print_fig=True):
    """Paint per-structure values onto the atlas and save one brain-slice PNG.

    Parameters
    ----------
    data : iterable of numbers
        The ``i``-th value replaces atlas label ``i+1``.
    domain : str
        Used, together with ``suffix``, to build the output file name.
    atlas : image
        Label image accepted by ``nilearn.image.copy_img``.
    path : str
        Output directory; the file is written to ``{path}/{domain}{suffix}.png``.
    cmap, plane, cbar, annotate, vmin, vmax
        Forwarded to ``nilearn.plotting.plot_stat_map`` (``vmin`` is passed as
        its ``threshold`` argument, ``plane`` as ``display_mode``).
    print_fig : bool
        If true, call ``plotting.show()`` before saving.
    """
    # Keep the label array separate from the array being written. Comparing
    # against the array under modification would let a value written for one
    # label be overwritten again whenever it equals a later label index.
    labels = image.copy_img(atlas).get_data()
    stat_map = labels.copy()
    for i, value in enumerate(data):
        stat_map[labels == i+1] = value
    stat_map = image.new_img_like(atlas, stat_map)

    display = plotting.plot_stat_map(stat_map,
                                     cut_coords=1,
                                     display_mode=plane, 
                                     symmetric_cbar=False, colorbar=cbar,
                                     cmap=cmap, threshold=vmin, 
                                     vmax=vmax, alpha=0.5,
                                     annotate=annotate, draw_cross=False)
    if print_fig:
        plotting.show()
    file_name = "{}/{}{}.png".format(path, domain, suffix)
    display.savefig(file_name, dpi=250)
    # Post-process the saved file with the project helper
    # (presumably makes the background transparent -- confirm in utilities).
    utilities.transparent_background(file_name)
    display.close()

In [35]:
# Render one axial brain map per domain that belongs to the hierarchy.
for k in domain_range:

    path = "figures/k{:02}".format(k)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)

    lists = domains[k]["lists"]
    circuit = domains[k]["circuit"]
    names = list(OrderedDict.fromkeys(circuit["DOMAIN"]))

    scores = utilities.score_lists(lists, dtm_bin, label_var="DOMAIN").loc[act_bin.index]
    pmi = ontology.compute_cooccurrences(act_bin, scores, positive=True)
    pmi = ontology.threshold_pmi_by_circuits(pmi, circuit)

    for name in names:
        if name in node2color_id[k]:
            # Look the colour-scale maximum up by domain name. Indexing a list
            # of column maxima by position in `names` is only right when the
            # pmi columns happen to be in the same order as `names`.
            vmax = round(pmi[name].max(), 2)
            map_domain(pmi[name], name, atlas, path, 
                       cmap=cmaps[node2color_id[k][name]], suffix="_z", plane="z", cbar=False, 
                       vmin=0.0, vmax=vmax, print_fig=False, annotate=False)

  fraction * (x1 - x0), y1 - y0])


## Word clouds

In [36]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [37]:
# Draw a word cloud for every domain in the hierarchy, coloured by its root.
# Loop-local names are used so the `nodes` DataFrame and the `colors` palette
# defined earlier are not overwritten.
for k in domain_range:
    node_names = node2color_id[k].keys()
    lists = domains[k]["lists"]
    node_colors = [style.palettes["data-driven"][node2color_id[k][node]] for node in node_names]
    # suffix is simply empty: the original "".format(k) was a no-op.
    ontology.plot_wordclouds("figures/k{:02d}".format(k), node_names, lists, metric="R", 
                             path="", suffix="", palette=node_colors,
                             height=300, width=600, min_font_size=0, max_font_size=50,
                             brightness_offset=0.15, darkness_offset=-0.35, n_offsets=25,
                             font="../style/Avenir.ttf", print_fig=False)