# Apply volumetric (MNI space) HCP-MMP1 atlas to Chinese and English LPP (MNI space) data and extract time series

### Imports

In [1]:
import math

import pandas as pd
from pandas import Series

import numpy as np

from nilearn.input_data import NiftiMasker, MultiNiftiMasker, NiftiLabelsMasker, NiftiMapsMasker

import nibabel as nb

import glob, os, natsort

In [2]:
# Path to the volumetric HCP-MMP1 atlas image; it is handed to the labels masker below
hcp_mmp1_atlas = "./HCP-MMP_1mm.nii.gz"

In [3]:
# hcp-mmp1 labels (left hemisphere)
# The color table is read as whitespace-delimited text without a header row;
# column 1 holds the ROI names, so only that column is kept.
# The separator is a raw string: '\s' inside a normal literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
left_labels = pd.read_csv('./lh.hcp-mmp-b_colortab.txt', header=None, sep=r'\s+')[1]

left_labels

0         L_V1_ROI
1        L_MST_ROI
2         L_V6_ROI
3         L_V2_ROI
4         L_V3_ROI
          ...     
175    L_STSva_ROI
176     L_TE1m_ROI
177       L_PI_ROI
178    L_a32pr_ROI
179      L_p24_ROI
Name: 1, Length: 180, dtype: object

In [4]:
# hcp-mmp1 labels (right hemisphere)
# The color table is read as whitespace-delimited text without a header row;
# column 1 holds the ROI names, so only that column is kept.
# The separator is a raw string: '\s' inside a normal literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
right_labels = pd.read_csv('./rh.hcp-mmp-b_colortab.txt', header=None, sep=r'\s+')[1]

right_labels

0         R_V1_ROI
1        R_MST_ROI
2         R_V6_ROI
3         R_V2_ROI
4         R_V3_ROI
          ...     
175    R_STSva_ROI
176     R_TE1m_ROI
177       R_PI_ROI
178    R_a32pr_ROI
179      R_p24_ROI
Name: 1, Length: 180, dtype: object

In [5]:
# Stack the ROI names into one array: all left-hemisphere names first, then the right-hemisphere ones
hemisphere_label_arrays = [left_labels.to_numpy(), right_labels.to_numpy()]
labels = np.concatenate(hemisphere_label_arrays)
print(labels)

['L_V1_ROI' 'L_MST_ROI' 'L_V6_ROI' 'L_V2_ROI' 'L_V3_ROI' 'L_V4_ROI'
 'L_V8_ROI' 'L_4_ROI' 'L_3b_ROI' 'L_FEF_ROI' 'L_PEF_ROI' 'L_55b_ROI'
 'L_V3A_ROI' 'L_RSC_ROI' 'L_POS2_ROI' 'L_V7_ROI' 'L_IPS1_ROI' 'L_FFC_ROI'
 'L_V3B_ROI' 'L_LO1_ROI' 'L_LO2_ROI' 'L_PIT_ROI' 'L_MT_ROI' 'L_A1_ROI'
 'L_PSL_ROI' 'L_SFL_ROI' 'L_PCV_ROI' 'L_STV_ROI' 'L_7Pm_ROI' 'L_7m_ROI'
 'L_POS1_ROI' 'L_23d_ROI' 'L_v23ab_ROI' 'L_d23ab_ROI' 'L_31pv_ROI'
 'L_5m_ROI' 'L_5mv_ROI' 'L_23c_ROI' 'L_5L_ROI' 'L_24dd_ROI' 'L_24dv_ROI'
 'L_7AL_ROI' 'L_SCEF_ROI' 'L_6ma_ROI' 'L_7Am_ROI' 'L_7PL_ROI' 'L_7PC_ROI'
 'L_LIPv_ROI' 'L_VIP_ROI' 'L_MIP_ROI' 'L_1_ROI' 'L_2_ROI' 'L_3a_ROI'
 'L_6d_ROI' 'L_6mp_ROI' 'L_6v_ROI' 'L_p24pr_ROI' 'L_33pr_ROI'
 'L_a24pr_ROI' 'L_p32pr_ROI' 'L_a24_ROI' 'L_d32_ROI' 'L_8BM_ROI'
 'L_p32_ROI' 'L_10r_ROI' 'L_47m_ROI' 'L_8Av_ROI' 'L_8Ad_ROI' 'L_9m_ROI'
 'L_8BL_ROI' 'L_9p_ROI' 'L_10d_ROI' 'L_8C_ROI' 'L_44_ROI' 'L_45_ROI'
 'L_47l_ROI' 'L_a47r_ROI' 'L_6r_ROI' 'L_IFJa_ROI' 'L_IFJp_ROI'
 'L_IFSp_ROI' 'L_IFSa_ROI' 'L_p9

## Chinese

In [8]:
# Collect every entry under the Chinese data folder, then put the paths in natural sort order
chinese_glob_hits = glob.glob('/scratch/dgd45125/openNeuro_LPP/chinese/*')
chinese_subject_dirs = natsort.natsorted(chinese_glob_hits)

chinese_subject_dirs

['/scratch/dgd45125/openNeuro_LPP/chinese/subj001',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj002',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj003',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj004',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj005',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj006',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj007',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj008',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj009',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj010',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj011',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj013',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj014',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj015',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj016',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj017',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj018',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj019',
 '/scratch/dgd45125/openNeuro_LPP/chinese/subj020',
 '/scratch/d

In [None]:
# Extract ROI-averaged time series for every Chinese subject and each of the 9 sections.
#
# Per-section frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole accumulated
# frame on every call.
#
# A subject is kept only if all 9 sections succeed; if any section fails, the
# subject is reported and skipped so no partial subjects end up in the result.
chinese_subject_frames = []

for subject_dir in chinese_subject_dirs:

    #pull the subject ID (last path component)
    subject_ID = subject_dir.split("/")[-1]

    section_frames = []
    section_num = 0  # keeps the error message well-defined if listing the files fails

    try:
        # list the subject's files once, in natural sort order, instead of
        # re-globbing the directory twice per section
        section_files = natsort.natsorted(glob.glob(subject_dir + '/*'))

        for section_num in range(9):
            section_file = section_files[section_num]

            #print what we are working with as a double-check
            print(section_file)

            # The hcp-mmp1 atlas provides 360 roi labels.
            # fit_transform takes 1 section and returns a n_volumes x 360 matrix
            # where each column is the average brain signal from the roi.
            parcelation_masker = NiftiLabelsMasker(labels_img=hcp_mmp1_atlas, standardize=False)
            time_series = parcelation_masker.fit_transform(section_file)

            #place into a dataframe with labels, adding the section number and
            #subject ID as constant string columns
            df = pd.DataFrame(data=time_series, columns=labels).assign(
                section=str(section_num + 1),
                subj=str(subject_ID),
            )
            section_frames.append(df)

            print("subject: {}, section {} done".format(subject_ID, section_num + 1))

    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the run;
        # the error itself is reported instead of being silently discarded.
        print("something has gone wrong with subject: {}, section {}: {!r}".format(subject_ID, section_num + 1, err))
        continue

    #only reached when every section succeeded: keep this subject's frames
    chinese_subject_frames.extend(section_frames)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing succeeded
if chinese_subject_frames:
    chinese_subject_section_signal_df = pd.concat(chinese_subject_frames)
else:
    chinese_subject_section_signal_df = pd.DataFrame()

print(len(chinese_subject_section_signal_df))

In [29]:
# Display the combined Chinese frame: one column per ROI label plus the 'section' and 'subj' columns
chinese_subject_section_signal_df

Unnamed: 0,L_V1_ROI,L_MST_ROI,L_V6_ROI,L_V2_ROI,L_V3_ROI,L_V4_ROI,L_V8_ROI,L_4_ROI,L_3b_ROI,L_FEF_ROI,...,R_MBelt_ROI,R_LBelt_ROI,R_A4_ROI,R_STSva_ROI,R_TE1m_ROI,R_PI_ROI,R_a32pr_ROI,R_p24_ROI,section,subj
0,20970.509766,32363.722656,22072.015625,21524.480469,19842.257812,26929.847656,25023.796875,21782.199219,24009.671875,19728.091797,...,22378.644531,24388.744141,30328.886719,22578.099609,21326.632812,21327.818359,18679.517578,17540.800781,1,subj001
1,20989.734375,32394.181641,22085.658203,21539.802734,19834.539062,26891.623047,24911.287109,21795.296875,24024.554688,19729.476562,...,22412.132812,24423.171875,30417.744141,22581.240234,21365.771484,21334.253906,18702.291016,17552.728516,1,subj001
2,21034.566406,32384.101562,22118.087891,21576.035156,19841.039062,26891.128906,24884.201172,21774.867188,23999.869141,19720.871094,...,22367.466797,24412.246094,30500.259766,22607.654297,21393.220703,21338.154297,18706.412109,17539.457031,1,subj001
3,20995.007812,32356.816406,22113.861328,21574.765625,19832.996094,26890.021484,24894.576172,21762.361328,23977.798828,19710.011719,...,22408.982422,24428.216797,30511.464844,22584.158203,21387.482422,21330.771484,18712.199219,17575.261719,1,subj001
4,21011.873047,32405.814453,22148.408203,21578.833984,19845.353516,26888.210938,24863.789062,21776.935547,23988.681641,19736.501953,...,22400.101562,24415.591797,30494.103516,22588.023438,21391.726562,21325.822266,18711.240234,17585.156250,1,subj001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,17988.826172,25359.945312,28000.234375,18985.851562,18844.376953,21883.666016,23222.880859,14687.437500,16181.128906,15288.067383,...,16913.685547,18746.308594,19820.697266,17915.525391,16566.876953,17308.503906,19205.695312,18410.947266,9,subj037
397,17982.875000,25347.013672,28009.347656,18983.103516,18846.119141,21877.130859,23213.615234,14667.217773,16155.383789,15284.856445,...,16904.062500,18741.851562,19797.812500,17918.957031,16567.441406,17284.277344,19217.960938,18391.517578,9,subj037
398,17990.166016,25360.398438,28024.722656,18977.203125,18842.070312,21867.267578,23213.871094,14668.295898,16154.747070,15274.808594,...,16831.257812,18713.724609,19737.367188,17921.722656,16586.876953,17277.082031,19234.814453,18444.556641,9,subj037
399,17992.384766,25361.148438,28033.119141,18971.830078,18845.005859,21866.849609,23184.494141,14678.887695,16152.396484,15287.773438,...,16841.794922,18728.447266,19727.650391,17952.734375,16612.406250,17263.800781,19260.152344,18488.943359,9,subj037


In [30]:
# Count how many rows each subject contributed to the combined frame
chinese_subject_section_signal_df['subj'].value_counts()

subj037    2977
subj015    2977
subj020    2977
subj017    2977
subj001    2977
subj016    2977
subj004    2977
subj033    2977
subj014    2977
subj023    2977
subj010    2977
subj006    2977
subj028    2977
subj027    2977
subj011    2977
subj002    2977
subj034    2977
subj003    2977
subj026    2977
subj007    2977
subj005    2977
subj022    2977
subj032    2977
subj009    2977
subj036    2977
subj031    2977
subj025    2977
subj013    2977
subj021    2977
subj030    2977
subj029    2977
subj024    2977
subj008    2977
Name: subj, dtype: int64

In [31]:
# Volume count for each of the 9 sections; the printed total is the expected number of rows per subject
n_chinese_scans = [
    283, 322, 322,
    307, 293, 392,
    364, 293, 401,
]
total_chinese_volumes = sum(n_chinese_scans)
print(total_chinese_volumes)

2977


In [35]:
# Expected total row count: 2977 volumes per subject x 33 subjects
2977*33

98241

In [38]:
# Write the Chinese ROI signals to a gzip-compressed CSV, leaving out the row index
chinese_output_path = "lpp_cn_HCPMMP1_volumetric_roi_signals.csv.gz"
chinese_subject_section_signal_df.to_csv(chinese_output_path, index=False, compression="gzip")

## English

In [39]:
# Collect every entry under the English data folder, then put the paths in natural sort order
english_glob_hits = glob.glob('/scratch/dgd45125/openNeuro_LPP/english/*')
english_subject_dirs = natsort.natsorted(english_glob_hits)

english_subject_dirs

['/scratch/dgd45125/openNeuro_LPP/english/subj057',
 '/scratch/dgd45125/openNeuro_LPP/english/subj058',
 '/scratch/dgd45125/openNeuro_LPP/english/subj059',
 '/scratch/dgd45125/openNeuro_LPP/english/subj061',
 '/scratch/dgd45125/openNeuro_LPP/english/subj062',
 '/scratch/dgd45125/openNeuro_LPP/english/subj063',
 '/scratch/dgd45125/openNeuro_LPP/english/subj064',
 '/scratch/dgd45125/openNeuro_LPP/english/subj065',
 '/scratch/dgd45125/openNeuro_LPP/english/subj067',
 '/scratch/dgd45125/openNeuro_LPP/english/subj068',
 '/scratch/dgd45125/openNeuro_LPP/english/subj069',
 '/scratch/dgd45125/openNeuro_LPP/english/subj070',
 '/scratch/dgd45125/openNeuro_LPP/english/subj072',
 '/scratch/dgd45125/openNeuro_LPP/english/subj073',
 '/scratch/dgd45125/openNeuro_LPP/english/subj074',
 '/scratch/dgd45125/openNeuro_LPP/english/subj075',
 '/scratch/dgd45125/openNeuro_LPP/english/subj076',
 '/scratch/dgd45125/openNeuro_LPP/english/subj077',
 '/scratch/dgd45125/openNeuro_LPP/english/subj078',
 '/scratch/d

In [None]:
# Extract ROI-averaged time series for every English subject and each of the 9 sections.
#
# Per-section frames are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole accumulated
# frame on every call.
#
# A subject is kept only if all 9 sections succeed; if any section fails, the
# subject is reported and skipped so no partial subjects end up in the result.
english_subject_frames = []

for subject_dir in english_subject_dirs:

    #pull the subject ID (last path component)
    subject_ID = subject_dir.split("/")[-1]

    section_frames = []
    section_num = 0  # keeps the error message well-defined if listing the files fails

    try:
        # list the subject's files once, in natural sort order, instead of
        # re-globbing the directory twice per section
        section_files = natsort.natsorted(glob.glob(subject_dir + '/*'))

        for section_num in range(9):
            section_file = section_files[section_num]

            #print what we are working with as a double-check
            print(section_file)

            # The hcp-mmp1 atlas provides 360 roi labels.
            # fit_transform takes 1 section and returns a n_volumes x 360 matrix
            # where each column is the average brain signal from the roi.
            parcelation_masker = NiftiLabelsMasker(labels_img=hcp_mmp1_atlas, standardize=False)
            time_series = parcelation_masker.fit_transform(section_file)

            #place into a dataframe with labels, adding the section number and
            #subject ID as constant string columns
            df = pd.DataFrame(data=time_series, columns=labels).assign(
                section=str(section_num + 1),
                subj=str(subject_ID),
            )
            section_frames.append(df)

            print("subject: {}, section {} done".format(subject_ID, section_num + 1))

    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still stops the run;
        # the error itself is reported instead of being silently discarded.
        print("something has gone wrong with subject: {}, section {}: {!r}".format(subject_ID, section_num + 1, err))
        continue

    #only reached when every section succeeded: keep this subject's frames
    english_subject_frames.extend(section_frames)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing succeeded
if english_subject_frames:
    english_subject_section_signal_df = pd.concat(english_subject_frames)
else:
    english_subject_section_signal_df = pd.DataFrame()

print(len(english_subject_section_signal_df))

In [44]:
# Display the combined English frame: one column per ROI label plus the 'section' and 'subj' columns
english_subject_section_signal_df

Unnamed: 0,L_V1_ROI,L_MST_ROI,L_V6_ROI,L_V2_ROI,L_V3_ROI,L_V4_ROI,L_V8_ROI,L_4_ROI,L_3b_ROI,L_FEF_ROI,...,R_MBelt_ROI,R_LBelt_ROI,R_A4_ROI,R_STSva_ROI,R_TE1m_ROI,R_PI_ROI,R_a32pr_ROI,R_p24_ROI,section,subj
0,26625.875000,28256.269531,27572.228516,23967.046875,21809.552734,21326.917969,23981.503906,19494.130859,20594.912109,20117.564453,...,19431.826172,19705.191406,19793.453125,21005.134766,15158.911133,17178.005859,18817.300781,19335.115234,1,subj057
1,26645.158203,28208.568359,27559.839844,23984.261719,21821.507812,21333.027344,23964.783203,19512.042969,20615.638672,20124.361328,...,19517.623047,19784.169922,19831.808594,21010.591797,15190.319336,17167.640625,18811.296875,19300.099609,1,subj057
2,26702.320312,28235.791016,27628.578125,24011.669922,21847.220703,21334.322266,23958.460938,19517.166016,20627.525391,20169.037109,...,19595.984375,19865.808594,19914.951172,21061.570312,15190.010742,17252.740234,18842.640625,19310.097656,1,subj057
3,26682.503906,28228.847656,27625.558594,23991.298828,21837.798828,21316.173828,23938.763672,19512.941406,20621.871094,20161.560547,...,19588.650391,19904.529297,19939.128906,21045.611328,15187.864258,17210.443359,18818.988281,19295.185547,1,subj057
4,26655.669922,28238.394531,27584.083984,23972.058594,21811.632812,21318.542969,23954.650391,19527.304688,20614.615234,20136.976562,...,19618.445312,19904.519531,19952.458984,21045.628906,15214.512695,17236.728516,18756.230469,19244.400391,1,subj057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,20180.173828,23413.185547,20449.201172,18905.566406,16899.671875,16630.492188,20574.824219,16068.533203,17600.779297,17487.531250,...,19211.718750,20622.134766,23296.964844,21900.210938,24024.908203,18627.472656,19804.296875,19320.472656,9,subj115
364,20159.859375,23401.609375,20443.171875,18887.689453,16880.033203,16612.886719,20485.986328,16068.931641,17598.337891,17485.546875,...,19214.791016,20611.628906,23278.164062,21853.337891,24041.027344,18587.308594,19800.515625,19294.503906,9,subj115
365,20193.199219,23432.224609,20476.296875,18911.714844,16896.550781,16651.175781,20552.679688,16064.453125,17602.246094,17501.158203,...,19163.226562,20565.746094,23227.328125,21868.800781,24056.929688,18585.306641,19823.640625,19338.669922,9,subj115
366,20227.945312,23433.611328,20491.894531,18940.218750,16930.878906,16691.535156,20683.769531,16094.185547,17630.523438,17516.712891,...,19129.880859,20586.050781,23188.246094,21871.421875,24090.203125,18628.242188,19854.017578,19384.468750,9,subj115


In [41]:
# Count how many rows each subject contributed to the combined frame
english_subject_section_signal_df['subj'].value_counts()

subj099    2816
subj101    2816
subj092    2816
subj061    2816
subj103    2816
subj089    2816
subj070    2816
subj077    2816
subj074    2816
subj091    2816
subj105    2816
subj096    2816
subj106    2816
subj076    2816
subj069    2816
subj058    2816
subj075    2816
subj087    2816
subj065    2816
subj073    2816
subj100    2816
subj097    2816
subj095    2816
subj115    2816
subj113    2816
subj072    2816
subj083    2816
subj068    2816
subj062    2816
subj078    2816
subj059    2816
subj082    2816
subj079    2816
subj108    2816
subj057    2816
subj086    2816
subj067    2816
subj109    2816
subj064    2816
subj094    2816
subj114    2816
subj063    2816
subj104    2816
subj110    2816
subj098    2816
subj088    2816
subj084    2816
subj081    2816
Name: subj, dtype: int64

In [42]:
# Volume count for each of the 9 sections; the printed total is the expected number of rows per subject
n_english_scans = [
    282, 298, 340,
    303, 265, 343,
    325, 292, 368,
]
total_english_volumes = sum(n_english_scans)
print(total_english_volumes)

2816


In [43]:
# Expected total row count: 2816 volumes per subject x 48 subjects
2816*48

135168

In [46]:
# Write the English ROI signals to a gzip-compressed CSV, leaving out the row index
english_output_path = "lpp_en_HCPMMP1_volumetric_roi_signals.csv.gz"
english_subject_section_signal_df.to_csv(english_output_path, index=False, compression="gzip")