# Large Dataset
Create a NeuroTK dataset of tau-stained images covering the four brain regions required for Braak staging: amygdala, hippocampus, temporal cortex, and occipital cortex.

In [None]:
# Load IPython's autoreload extension and use mode 2, so that imported modules
# are reloaded before each cell executes (edits to project code are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [52]:
import sys
sys.path.append('../..')

from pandas import DataFrame, concat
from tqdm.notebook import tqdm

from neurotk import login
from neurotk.girder_utils import get_items

In [6]:
# Authenticate client.
# NOTE(review): the API URL and username are hard-coded; presumably `login`
# obtains the password interactively - confirm against neurotk.login.
# NOTE(review): the name `gc` is also the name of the stdlib garbage-collector
# module; that module is not imported in this notebook, so nothing is shadowed here.
gc = login('https://megabrain.neurology.emory.edu/api/v1', username='jvizcar')

In [55]:
# Gather the whole-slide images from the Emory ADRC collection: keep only
# items that carry a 'largeImage' entry and whose file name ends in one of
# the slide extensions below.
slide_extensions = ('.svs', '.ndpi')

items = [
    record
    for record in get_items(gc, '641ba814867536bb7a225533')
    if 'largeImage' in record and record['name'].endswith(slide_extensions)
]

print(f'{len(items)} images found.')

44490 images found.


In [56]:
# Flatten each item's metadata into one table row per image.
# Keys read from meta['npSchema'] and meta['npClinical'], in column order.
schema_keys = ('caseID', 'regionName', 'blockID', 'stainID')
clinical_keys = (
    'CERAD', 'Braak Stage', 'Thal', 'ABC', 'Clin Dx', 'Age at Onset',
    'Age at Death/Bx', 'Duration (years)', 'Sex', 'Race',
)

column_names = [
    '_id', 'name', 'case', 'region', 'block', 'stain', 'CERAD',
    'Braak Stage', 'Thal', 'ABC', 'Clinical Diagnosis', 'Age at onset',
    'Age at death', 'Duration (years)', 'Sex', 'Race', 'item', 'Region',
]

rows = []

for entry in items:
    # Items without metadata (or without either sub-dict) yield '' for
    # every derived column.
    entry_meta = entry.get('meta', {})
    np_schema = entry_meta.get('npSchema', {})
    np_clinical = entry_meta.get('npClinical', {})

    rows.append(
        [entry['_id'], entry['name']]
        + [np_schema.get(key, '') for key in schema_keys]
        + [np_clinical.get(key, '') for key in clinical_keys]
        # Keep the raw item dict, and start the harmonised 'Region' as ''.
        + [entry, '']
    )

item_df = DataFrame(rows, columns=column_names)

item_df.head()

Unnamed: 0,_id,name,case,region,block,stain,CERAD,Braak Stage,Thal,ABC,Clinical Diagnosis,Age at onset,Age at death,Duration (years),Sex,Race,item,Region
0,641ba833867536bb7a225539,E19-35_3_BIELS.svs,E19-35,Frontal cortex,3,Biels,3,VI,5,3,AD,49,59,10,m,w,"{'_id': '641ba833867536bb7a225539', 'baseParen...",
1,641ba833867536bb7a22553b,E19-35_9_TAU.svs,E19-35,Insular cortex,9,Tau,3,VI,5,3,AD,49,59,10,m,w,"{'_id': '641ba833867536bb7a22553b', 'baseParen...",
2,641ba834867536bb7a22553d,E19-35_12_SYN.svs,E19-35,Hypothalamus,12,Syn,3,VI,5,3,AD,49,59,10,m,w,"{'_id': '641ba834867536bb7a22553d', 'baseParen...",
3,641ba834867536bb7a22553f,E19-35_11_SYN.svs,E19-35,Posterior basal ganglia,11,Syn,3,VI,5,3,AD,49,59,10,m,w,"{'_id': '641ba834867536bb7a22553f', 'baseParen...",
4,641ba835867536bb7a225541,E19-35_3_UBIQ.svs,E19-35,Frontal cortex,3,Ubiq,3,VI,5,3,AD,49,59,10,m,w,"{'_id': '641ba835867536bb7a225541', 'baseParen...",


In [57]:
# Group together some regions into a single name.
# Each canonical name (written to the 'Region' column) lists the raw 'region'
# labels that are folded into it.
region_groups = {
    'Temporal cortex': (
        'Temporal cortex',
        'Left Temporal cortex',
        'Temporal cortex, superior and middle gyri',
        'Temporal cortex, inferior and middle gyri',
        'Temporal cortex, left',
        'Left Temporal cortex and straight gyrus',
        'Right temporal cortex, bullet tract',
        'Left temporal cortex, bullet tract',
        'Right temporal cortex',
    ),
    'Occipital cortex': (
        'Occipital cortex',
        'Right occipital cortex',
        'Left Occipital cortex',
        'Occipital cortex, left',
    ),
    'Hippocampus': (
        'Hippocampus',
        'Left hippocampus',
        'Right hippocampus',
        'Left Hippocampus',
        'Right Hippocampus',
        'Hippocampus, left',
        'Posterior hippocampus',
    ),
    'Amygdala': (
        'Amygdala',
        'Left Amygdala',
        'Right Amygdala',
        'Left amygdala',
        'Amygdala, left',
        'Right amygdala',
    ),
}

# Invert to raw label -> canonical name so the whole column can be translated
# with a single lookup. No raw label appears in more than one group.
region_lookup = {
    raw_label: canonical
    for canonical, raw_labels in region_groups.items()
    for raw_label in raw_labels
}

# Vectorised replacement for a row-by-row `iterrows()` loop with scalar `.loc`
# writes. Labels that are not in the lookup map to NaN and are left alone, so
# those rows keep whatever 'Region' value they already had.
# Assumes every raw 'region' value is hashable (e.g. a string) - TODO confirm.
canonical_region = item_df['region'].map(region_lookup)
is_grouped = canonical_region.notna()
item_df.loc[is_grouped, 'Region'] = canonical_region[is_grouped]

In [67]:
# Narrow the table to Tau-stained images that have a usable Braak stage and
# belong to one of the four harmonised regions.
excluded_stages = ['', 'na', 'I-II']
target_regions = ['Amygdala', 'Hippocampus', 'Occipital cortex', 'Temporal cortex']

df = (
    item_df
    .loc[lambda frame: frame['stain'] == 'Tau']
    .loc[lambda frame: ~frame['Braak Stage'].isin(excluded_stages)]
    .loc[lambda frame: frame['Region'].isin(target_regions)]
)

# Disabled normalisation of parenthesised stages, kept for reference.
# NOTE(review): if re-enabled, '(IV)' -> 'VI' looks like a typo for 'IV'.
# df['Braak Stage'] = df['Braak Stage'].replace({
#     '(VI)': 'VI', '(II)': 'II', '(III)': 'III', '(IV)': 'VI', '(V)': 'V'
# })

# Copy the table's Braak stage and harmonised region into each item's
# metadata dicts. With the upload call below commented out, this only updates
# the in-memory item dicts; nothing is sent to the server.
for _, row in tqdm(df.iterrows(), total=len(df)):
    item_meta = row['item'].get('meta', {})

    clinical = item_meta.get('npClinical', {})
    clinical['Braak Stage'] = row['Braak Stage']

    schema = item_meta.get('npSchema', {})
    schema['region'] = row['Region']

    # _ = gc.addMetadataToItem(row['_id'], {'npSchema': schema, 'npClinical': clinical})

  0%|          | 0/1349 [00:00<?, ?it/s]

In [68]:
# A case is valid only when its images cover exactly these four regions.
region_list = sorted([
    'Amygdala', 'Hippocampus', 'Occipital cortex', 'Temporal cortex'
])

n = 0  # running count of valid cases
data = []

for case in df.case.unique():
    case_df = df[df.case == case]

    # Skip any case whose set of regions is not exactly the required four.
    if sorted(case_df.Region.unique()) != region_list:
        continue

    n += 1

    if len(case_df) != 4:
        # Some region has several images: keep one per region, taking the
        # first by file name, in `region_list` order.
        by_name = case_df.sort_values(by='name')
        case_df = DataFrame([
            by_name[by_name.Region == region].iloc[0]
            for region in region_list
        ])

    data.append(case_df)

data = concat(data, ignore_index=True)
print(f'{n} valid cases found.')

266 valid cases found.


In [73]:
# Build the list of entries for the NeuroTK dataset item: one dict per
# selected image, holding a subset of its npSchema / npClinical metadata.
schema_fields = ('stainID', 'caseID', 'region')
clinical_fields = (
    'Braak Stage', 'Age at Death/Bx', 'Age at Onset', 'Clin Dx',
    'Duration (years)', 'Race', 'Sex',
)

data_list = []

for _id in tqdm(data._id.tolist()):
    # Look the item up through the client by id. Every key below is accessed
    # by direct indexing, so a missing key (including npSchema['region'])
    # raises KeyError.
    item = gc.getItem(_id)
    item_meta = item['meta']
    schema = item_meta['npSchema']
    clinical = item_meta['npClinical']

    entry = {
        '_id': _id,
        'name': item['name'],
        'npSchema': {field: schema[field] for field in schema_fields},
        'npClinical': {field: clinical[field] for field in clinical_fields},
    }
    data_list.append(entry)

  0%|          | 0/1064 [00:00<?, ?it/s]

In [74]:
# Store the dataset entries on the dataset item, together with the filter
# values that describe how the entries were selected.
_ = gc.addMetadataToItem(
    '65131a2c6b4fa9ed76c6d47a',
    {
        'data': data_list,
        'filters': {
            # The entries were selected from rows with stain == 'Tau', so the
            # recorded stain filter must say 'Tau' (it previously said 'aBeta').
            'npSchema.stainID': ['Tau'],
            'npSchema.region': [
                'Amygdala', 'Hippocampus', 'Temporal cortex', 'Occipital cortex'
            ],
            'npClinical.Braak Stage': ['0', 'I', 'II', 'III', 'IV', 'V', 'VI'],
        },
    },
)