In [53]:
cd /private/home/ccaucheteux/hasson-syntaxe-vs-semantics/

/private/home/ccaucheteux/hasson-syntaxe-vs-semantics


In [149]:
import pandas as pd
import numpy as np
from src import paths

In [150]:
from nilearn import plotting
import nilearn
import seaborn as sns
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

In [151]:
from mne.stats import fdr_correction
from scipy.stats import wilcoxon 

def get_pvals(r, corrected=True, alpha=0.05):
    """Run a Wilcoxon signed-rank test on each row of ``r`` and flag significant rows.

    Despite its name, this returns a boolean significance mask (one entry per
    row of ``r``), not the p-values themselves.

    Parameters
    ----------
    r : iterable of 1-D samples
        Noted by the original author as being of shape (dim, n_samples).
    corrected : bool
        If True, return the first output of ``fdr_correction`` (method
        'indep') computed at level ``alpha``; otherwise return
        ``pvals <= alpha`` without correction.
    alpha : float
        Significance level.
    """
    raw_pvals = []
    for samples in r:
        raw_pvals.append(wilcoxon(samples).pvalue)

    if not corrected:
        return np.array(raw_pvals) <= alpha

    reject, _ = fdr_correction(raw_pvals, alpha=alpha, method='indep')
    return reject

def set_ticks(ax, x_values = None, y_values = None):
    """Set tick positions and compact two-decimal labels on ``ax``.

    Labels drop the leading zero of values whose integer part is zero
    (0.50 -> ".50", -0.25 -> "-.25"); other values keep all their digits
    (10.25 -> "10.25").

    Parameters
    ----------
    ax : object exposing set_xticks/set_xticklabels/set_yticks/set_yticklabels
    x_values, y_values : sequence of numbers or None
        Tick positions for each axis; an axis is left untouched when None.
    """
    def _fmt(value):
        text = f"{value:.2f}"
        # Only strip a *leading* zero. A blanket replace("0.", ".") would
        # also eat the last integer digit of values such as 10.25.
        if text.startswith("0."):
            return text[1:]
        if text.startswith("-0."):
            return "-" + text[2:]
        return text

    if x_values is not None:
        ax.set_xticks(x_values)
        ax.set_xticklabels([_fmt(x) for x in x_values])
    if y_values is not None:
        ax.set_yticks(y_values)
        ax.set_yticklabels([_fmt(y) for y in y_values])

In [693]:
# Tasks kept by the `task in @TASKS` filter; commented-out entries are excluded.
TASKS = [
    'forgot',
    'black',
    'merlin',
    'sherlock',
    # 'shapessocial',
    # 'shapesphysical',
    'piemanpni',
    'bronx',
    # '21styear',
    # 'prettymouth',
    'slumlordreach',
]

# bronx, piemanpni, black, and forgot

# Main features of interest.
FEATS = [
    "3_phone_features",
    'sum-gpt2-0',
    'sum-gpt2-9',
    # 'sum-gpt2-6',
]

# FEATS plus every "sum-gpt2-<i>" for i in 0..12, de-duplicated and sorted
# (np.unique returns a sorted array).
ALL_FEATS = np.unique(FEATS + [f"sum-gpt2-{i}" for i in range(13)])

# Display name for each feature identifier.
LABELS = {
    '3_phone_features': "Phonological",
    'phone_sum-gpt2-0': "Word embedding",
    'phone_sum-gpt2-9.equiv-random-mean-10': "GPT29 - syntax",
    'phone_sum-gpt2-9': "GPT2 (layer 9)",
    'sum-gpt2-9': "GPT2 (layer 9)",
    'sum-gpt2-0': "Word embedding",
    'phone_sum-gpt2-9.shuffle_in_sentence': "GPT29 - scrambled sentences",
    'wordpos': "Word position",
}
# Layer 9 is already labelled explicitly above.
for i in range(1, 13):
    if i == 9:
        continue
    LABELS[f'phone_sum-gpt2-{i}'] = f'GPT2 (layer {i})'
    LABELS[f'sum-gpt2-{i}'] = f'GPT2 (layer {i})'

# Plot colour for each feature identifier.
COLORS = {
    '3_phone_features': "b",
    'phone_sum-gpt2-0': "g",
    'sum-gpt2-0': "g",
    'phone_sum-gpt2-9': "r",
}
palette = sns.color_palette("Reds", 13)

# Layers 1..12 take their colour from the "Reds" palette. This also replaces
# the "r" entry set above for 'phone_sum-gpt2-9'.
for i in range(1, 13):
    COLORS[f'phone_sum-gpt2-{i}'] = palette[i]
    COLORS[f'sum-gpt2-{i}'] = palette[i]

# Gather results

In [694]:
# Name of the experiment folder (under paths.scores) analysed below.
# Alternatives that were previously selected here:
#   "multisubjects-0130"
#   "concat-multisubjects-0201-valid"
#   "100-concat-multisubjects-0206-wordemb"
#   "regressout-multisubjects-0201-valid"
EXP_NAME = "concat-single-task-0206"
CONCAT = True

In [695]:
ls $paths.scores/ #concat-multisubjects-0201-newformat

/bin/bash: /public/apps/anaconda3/2020.11/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m[01;34m0222-gpt2-errors-multisubjects[0m/          [01;34mmedian-multisubjects-0130[0m/
[01;34m0222-gpt2-errors-multisubjects-5folds[0m/   [01;34mmultisubjects[0m/
[01;34m0223-gpt2-errors-multisubjects-10folds[0m/  [01;34mmultisubjects-0130[0m/
[01;34m0223-gpt2-errors-singlesubjects[0m/         [01;34mmultisubjects-concat[0m/
[01;34m100-concat-multisubjects-0206-wordemb[0m/   [01;34mmultisubjects-control[0m/
[01;34m200-concat-multisubjects-0201-valid[0m/     [01;34mmultisubjects-controls[0m/
[01;34mconcat-multisubjects-0130[0m/               [01;34mregressout-multisubjects-0201-valid[0m/
[01;34mconcat-multisubjects-0201[0m/               [01;34mregressout-single-subjects-0206[0m/
[01;34mconcat-multisubjects-0201-newformat[0m/     [01;34mregressout-single-subjects-0209[0m/
[01;34mconcat-multisubjects-0201-seeds[0m/     

In [696]:
from collections import defaultdict

In [697]:
import matplotlib.pyplot as plt
from scipy.stats import sem
import seaborn as sns
from collections import defaultdict

def gather_scores(scores):
    """Stack per-entry score dictionaries into a single array.

    Parameters
    ----------
    scores : iterable of dict
        Each item may hold an "L" and/or "R" entry mapping a path-like key
        (anything with a ``.name`` attribute) to a score array. The iterable
        is walked once per hemisphere, so it must be re-iterable (list,
        Series, ...). It is not modified.

    Returns
    -------
    scores : np.ndarray
        Shape (n_keys, 2, n_entries, ...); axis 1 is ordered (L, R).
    names : list of str
        Key names with everything from ".pth" onwards removed.

    Raises
    ------
    AssertionError
        If the two hemispheres do not expose the same key names in the same order.
    """
    hemis = ["L", "R"]
    # default_factory is called with no arguments, hence `list`
    # (a one-argument lambda would raise TypeError on the first new key).
    result = {hemi: defaultdict(list) for hemi in hemis}
    names = {}
    stacked = {}  # kept separate so the caller's `scores` is never written to

    for hemi in hemis:
        for d in scores:
            if hemi in d:
                for k, v in d[hemi].items():
                    result[hemi][k].append(v)

        names[hemi] = [p.name.split(".pth")[0] for p in result[hemi]]
        stacked[hemi] = np.stack(list(result[hemi].values()))

    # Both hemispheres must describe the same features in the same order.
    assert names["L"] == names["R"], "feature names differ between hemispheres"
    return np.stack([stacked["L"], stacked["R"]], axis=1), list(names["L"])

In [698]:
from pathlib import Path
import json
from src.task_dataset import get_task_df
# Table of result files for this experiment; keep only rows whose save_file
# exists on disk, ordered by feature file and without duplicate rows.
df_path = pd.read_csv(paths.scores / EXP_NAME / "results_path.csv")
df_path["is_file"] = df_path.save_file.apply(lambda x : Path(x).is_file())
df_path = df_path.query("is_file")
df_path = df_path.sort_values("feature_file")
df_path = df_path.drop_duplicates()

df = get_task_df()

# Add duration
# Read the event metadata inside a context manager so the file handle is closed.
with open(paths.event_meta_path) as event_meta_file:
    duration = json.load(event_meta_file)
# Flatten the two-level mapping to {inner key: its "duration"}.
duration = {k:v["duration"] for _, v1 in duration.items() for k, v in v1.items()}
# "slumlordreach" gets the summed durations of "slumlord" and "reach".
duration["slumlordreach"] = duration["slumlord"] + duration["reach"]
duration = pd.DataFrame(duration, index=["duration"]).T
duration = duration.reset_index().rename(columns={"index":"audio_task"})
df = pd.merge(df, duration, on="audio_task", how="left")


# Attach the task table to each result row (matching result `task` to `audio_task`).
df_path = pd.merge(df_path, df, left_on=["subject", "task"], right_on=["subject", "audio_task"], how="left")
# Feature name = file name of feature_file up to ".pth".
df_path["feat"] = [Path(p).name.split(".pth")[0] for p in df_path.feature_file]

df_path = df_path.query("feat in @ALL_FEATS and task in @TASKS")

In [699]:
# Load every result file; each stores one pickled object in a 0-d array,
# which .item() unwraps.
# NOTE(review): allow_pickle=True unpickles on load - only use on trusted files.
scores = df_path.save_file.apply(
    lambda x: np.load(x, allow_pickle=True).item()
)

# Keep only the rows whose file held an actual object, in both tables.
idx = np.where([loaded is not None for loaded in scores])
scores, df_path = scores.iloc[idx], df_path.iloc[idx]

In [700]:
# Keep the per-file dictionaries under `result`; `scores` becomes a dense
# array with one slab per hemisphere ("L" first, then "R"), built from the
# first value stored in each file's hemisphere dictionary.
result = scores.copy()
scores = np.stack([
    np.stack([list(file_scores[hemi].values())[0] for file_scores in result])
    for hemi in ("L", "R")
])
scores.shape

(2, 3318, 40962)

In [701]:
def split_parc(xyz, label, n, axis='y'):
    """Split ``label`` into ``n`` sub-labels of equal extent along one axis.

    Parameters
    ----------
    xyz : np.ndarray
        Coordinates of the label's vertices, one row per entry of
        ``label.vertices``; columns are indexed as x=0, y=1, z=2.
    label : object with ``copy()``, ``name`` and ``vertices``
        Label to split; it is copied, never modified.
    n : int
        Number of bins (must be >= 1).
    axis : {'x', 'y', 'z'}
        Axis along which the coordinate range is cut.

    Returns
    -------
    list
        ``n`` copies of ``label`` named "<k>_<label.name>" for k = 1..n, each
        holding the vertices that fall in the k-th bin. A bin without
        vertices still yields an (empty) label so that the numbering is
        stable. Together the bins cover every input vertex.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    axes = dict(x=0, y=1, z=2)
    coords = xyz[:, axes[axis]]

    m = coords.min()
    M = coords.max()
    bounds = (M-m) * np.linspace(0, 1., 1+n) + m

    # np.digitize yields 1..n for coordinates in [m, M) but n + 1 for the
    # maximum itself; fold that into the last bin so no vertex is dropped.
    groups = np.digitize(coords, bounds)
    groups[groups > n] = n

    labels = list()
    # Bin ids start at 1: digitize never returns 0 here because every
    # coordinate is >= bounds[0].
    for group_id in range(1, n + 1):
        label_ = label.copy()
        label_.name = f'{group_id}_' + label.name
        label_.vertices = label.vertices[groups==group_id]
        labels.append(label_)
    return labels


In [702]:
def split_labels(all_labels, areas, subjects_dir="", surf = 'pial'):
    """Split every label holding more than 400 vertices, label by label.

    NOTE: a later definition of ``split_labels`` in this notebook (with a
    different signature) replaces this one once it has run.

    Parameters
    ----------
    all_labels : iterable of labels exposing ``vertices`` and ``hemi``
    areas : unused
    subjects_dir : str or Path
        Directory containing ``fsaverage6/surf/<hemi>.<surf>``.
    surf : str
        Surface file suffix.

    Returns
    -------
    list
        Small labels unchanged; larger ones replaced by the output of
        ``split_parc`` with ``n = n_vertices // 400``.
    """
    surf_pattern = str(Path(subjects_dir) / 'fsaverage6' / 'surf' / f'%s.{surf}')
    # First output of read_geometry for each hemisphere, indexed below by vertex id.
    xyz = {hemi: nib.freesurfer.read_geometry(surf_pattern % hemi)[0]
           for hemi in ("rh", "lh")}

    new_labels = []
    for label in all_labels:
        n_vertices = len(label.vertices)
        if n_vertices <= 400:
            new_labels.append(label)
            continue
        coords = xyz[label.hemi][label.vertices]
        new_labels.extend(split_parc(coords, label, n_vertices // 400))
    return new_labels

In [703]:
def split_labels(all_labels, subjects_dir="", surf = 'pial', max_vertices=350):
    """Split large areas into sub-labels, using the same number of pieces in both hemispheres.

    An "area" is a label name without its trailing "-lh"/"-rh". For each
    area, the larger of its two hemisphere labels decides whether and how
    finely both are split, so sub-label names stay paired across hemispheres.

    Parameters
    ----------
    all_labels : iterable of labels exposing ``name`` and ``vertices``
        Must contain exactly one "<area>-lh" and one "<area>-rh" per area,
        in any order.
    subjects_dir : str or Path
        Directory containing ``fsaverage6/surf/<hemi>.<surf>``.
    surf : str
        Surface file suffix.
    max_vertices : int
        Areas whose larger hemisphere label exceeds this size are split into
        ``n_vertices // max_vertices`` pieces via ``split_parc``.

    Returns
    -------
    list
        Labels area by area (areas sorted by name), left hemisphere first.

    Raises
    ------
    AssertionError
        If an area does not have exactly one label per hemisphere.
    """
    surf_pattern = str(Path(subjects_dir) / 'fsaverage6' / 'surf' / f'%s.{surf}')
    # First output of read_geometry for each hemisphere, indexed below by vertex id.
    xyz = {hemi: nib.freesurfer.read_geometry(surf_pattern % hemi)[0]
           for hemi in ("rh", "lh")}

    # Index labels by name once instead of rescanning all_labels for every area.
    by_name = {}
    for l in all_labels:
        by_name.setdefault(l.name, []).append(l)

    areas = np.unique(["-".join(name.split("-")[:-1]) for name in by_name])

    new_labels = []
    for area in areas:
        # Look each hemisphere up by name rather than relying on the order
        # of all_labels, so geometry and label can never be mismatched.
        found = {hemi: by_name.get(f"{area}-{hemi}", []) for hemi in ("lh", "rh")}
        assert all(len(v) == 1 for v in found.values()), \
            f"expected exactly one lh and one rh label for area {area!r}"
        pair = {hemi: v[0] for hemi, v in found.items()}

        n = max(len(l.vertices) for l in pair.values())
        if n > max_vertices:
            n_parts = n // max_vertices
            for hemi in ("lh", "rh"):
                l = pair[hemi]
                new_labels.extend(split_parc(xyz[hemi][l.vertices], l, n_parts))
        else:
            new_labels.extend(pair[hemi] for hemi in ("lh", "rh"))
    return new_labels

In [704]:
from matplotlib.gridspec import GridSpec
from scipy.stats import wilcoxon
from scipy.stats import pearsonr, spearmanr

from pathlib import Path
import numpy as np
import mne
import nibabel as nib
import matplotlib.pyplot as plt

In [705]:
import mne
subjects_dir = "../narratives/derivatives/freesurfer/"
# Alternative parcellation previously tried: parc='Yeo2011_17Networks_N1000'
all_labels = mne.read_labels_from_annot('fsaverage6', parc='aparc.a2009s',
                                        subjects_dir=subjects_dir, verbose=False)

# Break large areas into smaller sub-labels.
all_labels = split_labels(all_labels, subjects_dir=subjects_dir)

# Vertex indices per label name, and one colour per area taken from its
# left-hemisphere label. Match the "-lh" suffix exactly: a substring test
# would also catch any name that merely contains "lh".
rois = {l.name : l.vertices for l in all_labels}
rois_colors = {"-".join(l.name.split("-")[:-1]) : l.color
               for l in all_labels if l.name.endswith("-lh")}

# Area = label name without its trailing hemisphere suffix.
areas = np.unique(["-".join(name.split("-")[:-1]) for name in rois])

# Average the vertex scores within each area, per hemisphere.
scores_rois = np.zeros((*scores.shape[:-1], len(areas)))
for i, area in enumerate(areas):
    for h, hemi in enumerate(["lh", "rh"]):
        # Integer + slice + index-array indexing moves the vertex axis first,
        # so the mean over vertices is taken along axis 0. A label without
        # vertices yields NaN (numpy warns about the empty slice).
        scores_rois[h, :, i] = np.nanmean(scores[h, :, rois[f"{area}-{hemi}"]], 0)

  scores_rois[h, :, i] = np.nanmean(scores[h, :, rois[f"{area}-{hemi}"]], 0)


In [None]:
cd /private/home/ccaucheteux/hasson-syntaxe-vs-semantics/

import pandas as pd
import numpy as np
from src import paths

from nilearn import plotting
import nilearn
import seaborn as sns
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'

from mne.stats import fdr_correction
from scipy.stats import wilcoxon 

def get_pvals(r, corrected=True, alpha=0.05):
    """Run a Wilcoxon signed-rank test on each row of ``r`` and flag significant rows.

    Despite its name, this returns a boolean significance mask (one entry per
    row of ``r``), not the p-values themselves.

    Parameters
    ----------
    r : iterable of 1-D samples
        Noted by the original author as being of shape (dim, n_samples).
    corrected : bool
        If True, return the first output of ``fdr_correction`` (method
        'indep') computed at level ``alpha``; otherwise return
        ``pvals <= alpha`` without correction.
    alpha : float
        Significance level.
    """
    raw_pvals = []
    for samples in r:
        raw_pvals.append(wilcoxon(samples).pvalue)

    if not corrected:
        return np.array(raw_pvals) <= alpha

    reject, _ = fdr_correction(raw_pvals, alpha=alpha, method='indep')
    return reject

def set_ticks(ax, x_values = None, y_values = None):
    """Set tick positions and compact two-decimal labels on ``ax``.

    Labels drop the leading zero of values whose integer part is zero
    (0.50 -> ".50", -0.25 -> "-.25"); other values keep all their digits
    (10.25 -> "10.25").

    Parameters
    ----------
    ax : object exposing set_xticks/set_xticklabels/set_yticks/set_yticklabels
    x_values, y_values : sequence of numbers or None
        Tick positions for each axis; an axis is left untouched when None.
    """
    def _fmt(value):
        text = f"{value:.2f}"
        # Only strip a *leading* zero. A blanket replace("0.", ".") would
        # also eat the last integer digit of values such as 10.25.
        if text.startswith("0."):
            return text[1:]
        if text.startswith("-0."):
            return "-" + text[2:]
        return text

    if x_values is not None:
        ax.set_xticks(x_values)
        ax.set_xticklabels([_fmt(x) for x in x_values])
    if y_values is not None:
        ax.set_yticks(y_values)
        ax.set_yticklabels([_fmt(y) for y in y_values])

# Tasks kept by the `task in @TASKS` filter; commented-out entries are excluded.
TASKS = [
    'forgot',
    'black',
    'merlin',
    'sherlock',
    # 'shapessocial',
    # 'shapesphysical',
    'piemanpni',
    'bronx',
    # '21styear',
    # 'prettymouth',
    'slumlordreach',
]

# bronx, piemanpni, black, and forgot

# Main features of interest.
FEATS = [
    "3_phone_features",
    'sum-gpt2-0',
    'sum-gpt2-9',
    # 'sum-gpt2-6',
]

# FEATS plus every "sum-gpt2-<i>" for i in 0..12, de-duplicated and sorted
# (np.unique returns a sorted array).
ALL_FEATS = np.unique(FEATS + [f"sum-gpt2-{i}" for i in range(13)])

# Display name for each feature identifier.
LABELS = {
    '3_phone_features': "Phonological",
    'phone_sum-gpt2-0': "Word embedding",
    'phone_sum-gpt2-9.equiv-random-mean-10': "GPT29 - syntax",
    'phone_sum-gpt2-9': "GPT2 (layer 9)",
    'sum-gpt2-9': "GPT2 (layer 9)",
    'sum-gpt2-0': "Word embedding",
    'phone_sum-gpt2-9.shuffle_in_sentence': "GPT29 - scrambled sentences",
    'wordpos': "Word position",
}
# Layer 9 is already labelled explicitly above.
for i in range(1, 13):
    if i == 9:
        continue
    LABELS[f'phone_sum-gpt2-{i}'] = f'GPT2 (layer {i})'
    LABELS[f'sum-gpt2-{i}'] = f'GPT2 (layer {i})'

# Plot colour for each feature identifier.
COLORS = {
    '3_phone_features': "b",
    'phone_sum-gpt2-0': "g",
    'sum-gpt2-0': "g",
    'phone_sum-gpt2-9': "r",
}
palette = sns.color_palette("Reds", 13)

# Layers 1..12 take their colour from the "Reds" palette. This also replaces
# the "r" entry set above for 'phone_sum-gpt2-9'.
for i in range(1, 13):
    COLORS[f'phone_sum-gpt2-{i}'] = palette[i]
    COLORS[f'sum-gpt2-{i}'] = palette[i]

# Gather results

# Name of the experiment folder (under paths.scores) analysed below.
# Alternatives that were previously selected here:
#   "multisubjects-0130"
#   "concat-multisubjects-0201-valid"
#   "100-concat-multisubjects-0206-wordemb"
#   "regressout-multisubjects-0201-valid"
EXP_NAME = "concat-single-task-0206"
CONCAT = True

ls $paths.scores/ #concat-multisubjects-0201-newformat

from collections import defaultdict

import matplotlib.pyplot as plt
from scipy.stats import sem
import seaborn as sns
from collections import defaultdict

def gather_scores(scores):
    """Stack per-entry score dictionaries into a single array.

    Parameters
    ----------
    scores : iterable of dict
        Each item may hold an "L" and/or "R" entry mapping a path-like key
        (anything with a ``.name`` attribute) to a score array. The iterable
        is walked once per hemisphere, so it must be re-iterable (list,
        Series, ...). It is not modified.

    Returns
    -------
    scores : np.ndarray
        Shape (n_keys, 2, n_entries, ...); axis 1 is ordered (L, R).
    names : list of str
        Key names with everything from ".pth" onwards removed.

    Raises
    ------
    AssertionError
        If the two hemispheres do not expose the same key names in the same order.
    """
    hemis = ["L", "R"]
    # default_factory is called with no arguments, hence `list`
    # (a one-argument lambda would raise TypeError on the first new key).
    result = {hemi: defaultdict(list) for hemi in hemis}
    names = {}
    stacked = {}  # kept separate so the caller's `scores` is never written to

    for hemi in hemis:
        for d in scores:
            if hemi in d:
                for k, v in d[hemi].items():
                    result[hemi][k].append(v)

        names[hemi] = [p.name.split(".pth")[0] for p in result[hemi]]
        stacked[hemi] = np.stack(list(result[hemi].values()))

    # Both hemispheres must describe the same features in the same order.
    assert names["L"] == names["R"], "feature names differ between hemispheres"
    return np.stack([stacked["L"], stacked["R"]], axis=1), list(names["L"])

from pathlib import Path
import json
from src.task_dataset import get_task_df
# Table of result files for this experiment; keep only rows whose save_file
# exists on disk, ordered by feature file and without duplicate rows.
df_path = pd.read_csv(paths.scores / EXP_NAME / "results_path.csv")
df_path["is_file"] = df_path.save_file.apply(lambda x : Path(x).is_file())
df_path = df_path.query("is_file")
df_path = df_path.sort_values("feature_file")
df_path = df_path.drop_duplicates()

df = get_task_df()

# Add duration
# Read the event metadata inside a context manager so the file handle is closed.
with open(paths.event_meta_path) as event_meta_file:
    duration = json.load(event_meta_file)
# Flatten the two-level mapping to {inner key: its "duration"}.
duration = {k:v["duration"] for _, v1 in duration.items() for k, v in v1.items()}
# "slumlordreach" gets the summed durations of "slumlord" and "reach".
duration["slumlordreach"] = duration["slumlord"] + duration["reach"]
duration = pd.DataFrame(duration, index=["duration"]).T
duration = duration.reset_index().rename(columns={"index":"audio_task"})
df = pd.merge(df, duration, on="audio_task", how="left")


# Attach the task table to each result row (matching result `task` to `audio_task`).
df_path = pd.merge(df_path, df, left_on=["subject", "task"], right_on=["subject", "audio_task"], how="left")
# Feature name = file name of feature_file up to ".pth".
df_path["feat"] = [Path(p).name.split(".pth")[0] for p in df_path.feature_file]

df_path = df_path.query("feat in @ALL_FEATS and task in @TASKS")

# Load every result file; each stores one pickled object in a 0-d array,
# which .item() unwraps.
# NOTE(review): allow_pickle=True unpickles on load - only use on trusted files.
scores = df_path.save_file.apply(
    lambda x: np.load(x, allow_pickle=True).item()
)

# Keep only the rows whose file held an actual object, in both tables.
idx = np.where([loaded is not None for loaded in scores])
scores, df_path = scores.iloc[idx], df_path.iloc[idx]

# Keep the per-file dictionaries under `result`; `scores` becomes a dense
# array with one slab per hemisphere ("L" first, then "R"), built from the
# first value stored in each file's hemisphere dictionary.
result = scores.copy()
scores = np.stack([
    np.stack([list(file_scores[hemi].values())[0] for file_scores in result])
    for hemi in ("L", "R")
])
scores.shape

def split_parc(xyz, label, n, axis='y'):
    """Split ``label`` into ``n`` sub-labels of equal extent along one axis.

    Parameters
    ----------
    xyz : np.ndarray
        Coordinates of the label's vertices, one row per entry of
        ``label.vertices``; columns are indexed as x=0, y=1, z=2.
    label : object with ``copy()``, ``name`` and ``vertices``
        Label to split; it is copied, never modified.
    n : int
        Number of bins (must be >= 1).
    axis : {'x', 'y', 'z'}
        Axis along which the coordinate range is cut.

    Returns
    -------
    list
        ``n`` copies of ``label`` named "<k>_<label.name>" for k = 1..n, each
        holding the vertices that fall in the k-th bin. A bin without
        vertices still yields an (empty) label so that the numbering is
        stable. Together the bins cover every input vertex.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    axes = dict(x=0, y=1, z=2)
    coords = xyz[:, axes[axis]]

    m = coords.min()
    M = coords.max()
    bounds = (M-m) * np.linspace(0, 1., 1+n) + m

    # np.digitize yields 1..n for coordinates in [m, M) but n + 1 for the
    # maximum itself; fold that into the last bin so no vertex is dropped.
    groups = np.digitize(coords, bounds)
    groups[groups > n] = n

    labels = list()
    # Bin ids start at 1: digitize never returns 0 here because every
    # coordinate is >= bounds[0].
    for group_id in range(1, n + 1):
        label_ = label.copy()
        label_.name = f'{group_id}_' + label.name
        label_.vertices = label.vertices[groups==group_id]
        labels.append(label_)
    return labels


# Imported here because this code runs before the notebook's own nibabel
# import further down; without it a fresh top-to-bottom run fails on `nib`.
import nibabel as nib

subjects_dir = "../narratives/derivatives/freesurfer/"
surf = 'pial'
# Path template with a "%s" placeholder for the hemisphere ("lh"/"rh").
surf = Path(subjects_dir) / 'fsaverage6' / 'surf' / f'%s.{surf}'
# First output of read_geometry for each hemisphere surface.
xyz = {"rh": nib.freesurfer.read_geometry(str(surf) % "rh")[0],
       "lh": nib.freesurfer.read_geometry(str(surf) % "lh")[0]}

def split_labels(all_labels, areas, subjects_dir="", surf = 'pial'):
    """Split every label holding more than 400 vertices, label by label.

    NOTE: a later definition of ``split_labels`` in this notebook (with a
    different signature) replaces this one once it has run.

    Parameters
    ----------
    all_labels : iterable of labels exposing ``vertices`` and ``hemi``
    areas : unused
    subjects_dir : str or Path
        Directory containing ``fsaverage6/surf/<hemi>.<surf>``.
    surf : str
        Surface file suffix.

    Returns
    -------
    list
        Small labels unchanged; larger ones replaced by the output of
        ``split_parc`` with ``n = n_vertices // 400``.
    """
    surf_pattern = str(Path(subjects_dir) / 'fsaverage6' / 'surf' / f'%s.{surf}')
    # First output of read_geometry for each hemisphere, indexed below by vertex id.
    xyz = {hemi: nib.freesurfer.read_geometry(surf_pattern % hemi)[0]
           for hemi in ("rh", "lh")}

    new_labels = []
    for label in all_labels:
        n_vertices = len(label.vertices)
        if n_vertices <= 400:
            new_labels.append(label)
            continue
        coords = xyz[label.hemi][label.vertices]
        new_labels.extend(split_parc(coords, label, n_vertices // 400))
    return new_labels

def split_labels(all_labels, subjects_dir="", surf = 'pial', max_vertices=350):
    """Split large areas into sub-labels, using the same number of pieces in both hemispheres.

    An "area" is a label name without its trailing "-lh"/"-rh". For each
    area, the larger of its two hemisphere labels decides whether and how
    finely both are split, so sub-label names stay paired across hemispheres.

    Parameters
    ----------
    all_labels : iterable of labels exposing ``name`` and ``vertices``
        Must contain exactly one "<area>-lh" and one "<area>-rh" per area,
        in any order.
    subjects_dir : str or Path
        Directory containing ``fsaverage6/surf/<hemi>.<surf>``.
    surf : str
        Surface file suffix.
    max_vertices : int
        Areas whose larger hemisphere label exceeds this size are split into
        ``n_vertices // max_vertices`` pieces via ``split_parc``.

    Returns
    -------
    list
        Labels area by area (areas sorted by name), left hemisphere first.

    Raises
    ------
    AssertionError
        If an area does not have exactly one label per hemisphere.
    """
    surf_pattern = str(Path(subjects_dir) / 'fsaverage6' / 'surf' / f'%s.{surf}')
    # First output of read_geometry for each hemisphere, indexed below by vertex id.
    xyz = {hemi: nib.freesurfer.read_geometry(surf_pattern % hemi)[0]
           for hemi in ("rh", "lh")}

    # Index labels by name once instead of rescanning all_labels for every area.
    by_name = {}
    for l in all_labels:
        by_name.setdefault(l.name, []).append(l)

    areas = np.unique(["-".join(name.split("-")[:-1]) for name in by_name])

    new_labels = []
    for area in areas:
        # Look each hemisphere up by name rather than relying on the order
        # of all_labels, so geometry and label can never be mismatched.
        found = {hemi: by_name.get(f"{area}-{hemi}", []) for hemi in ("lh", "rh")}
        assert all(len(v) == 1 for v in found.values()), \
            f"expected exactly one lh and one rh label for area {area!r}"
        pair = {hemi: v[0] for hemi, v in found.items()}

        n = max(len(l.vertices) for l in pair.values())
        if n > max_vertices:
            n_parts = n // max_vertices
            for hemi in ("lh", "rh"):
                l = pair[hemi]
                new_labels.extend(split_parc(xyz[hemi][l.vertices], l, n_parts))
        else:
            new_labels.extend(pair[hemi] for hemi in ("lh", "rh"))
    return new_labels

from matplotlib.gridspec import GridSpec
from scipy.stats import wilcoxon
from scipy.stats import pearsonr, spearmanr

from pathlib import Path
import numpy as np
import mne
import nibabel as nib
import matplotlib.pyplot as plt

import mne
subjects_dir = "../narratives/derivatives/freesurfer/"
# Alternative parcellation previously tried: parc='Yeo2011_17Networks_N1000'
all_labels = mne.read_labels_from_annot('fsaverage6', parc='aparc.a2009s',
                                        subjects_dir=subjects_dir, verbose=False)

# Break large areas into smaller sub-labels.
all_labels = split_labels(all_labels, subjects_dir=subjects_dir)

# Vertex indices per label name, and one colour per area taken from its
# left-hemisphere label. Match the "-lh" suffix exactly: a substring test
# would also catch any name that merely contains "lh".
rois = {l.name : l.vertices for l in all_labels}
rois_colors = {"-".join(l.name.split("-")[:-1]) : l.color
               for l in all_labels if l.name.endswith("-lh")}

# Area = label name without its trailing hemisphere suffix.
areas = np.unique(["-".join(name.split("-")[:-1]) for name in rois])

# Average the vertex scores within each area, per hemisphere.
scores_rois = np.zeros((*scores.shape[:-1], len(areas)))
for i, area in enumerate(areas):
    for h, hemi in enumerate(["lh", "rh"]):
        # Integer + slice + index-array indexing moves the vertex axis first,
        # so the mean over vertices is taken along axis 0. A label without
        # vertices yields NaN (numpy warns about the empty slice).
        scores_rois[h, :, i] = np.nanmean(scores[h, :, rois[f"{area}-{hemi}"]], 0)