In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.append('../')

In [3]:
import pandas as pd
from pathlib import Path
from polygenic_layers import process_data

## Get data for each of 4 fetal human brains

In [4]:
#fem 21pcw
donor1 = process_data.RAW_DATA / 'lmd_matrix_12566'
#fem 21pcw
donor2 = process_data.RAW_DATA / 'lmd_matrix_12690'
#male 15pcw
donor3 = process_data.RAW_DATA / 'lmd_matrix_12840'
#fem 16pcw
donor4 = process_data.RAW_DATA / 'lmd_matrix_14751'

In [5]:
%%time
exp1, samples1, probes1 = process_data.get_donor_data(list(donor1.glob('*.csv')))
exp2, samples2, probes2 = process_data.get_donor_data(list(donor2.glob('*.csv')))
exp3, samples3, probes3 = process_data.get_donor_data(list(donor3.glob('*.csv')))
exp4, samples4, probes4 = process_data.get_donor_data(list(donor4.glob('*.csv')))

CPU times: user 19.9 s, sys: 1.07 s, total: 21 s
Wall time: 21.1 s


### For each donor: merge expression data with probes information, group and take mean of probes by gene symbol

In [6]:
samples_gene_exp1 = process_data.get_exp_by_genes(probes1, exp1)
samples_gene_exp2 = process_data.get_exp_by_genes(probes2, exp2)
samples_gene_exp3 = process_data.get_exp_by_genes(probes3, exp3)
samples_gene_exp4 = process_data.get_exp_by_genes(probes4, exp4)

In [7]:
# there is a different number of columns for each donor, because different number of samples were taken
samples_gene_exp1.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,301,302,303,304,305,306,307,308,309,310
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
61E3.4,9.377013,9.175427,8.462613,9.164672,9.130459,9.000264,8.562635,9.405844,8.580369,9.356545,...,10.232404,11.258528,8.813283,8.696339,8.591509,9.331922,8.736812,8.787964,9.866223,9.981537
A1BG,4.249742,5.73107,4.902168,5.853085,4.010567,4.529675,4.743105,4.525157,4.845123,5.271694,...,4.357892,4.684222,4.828907,3.469966,4.687083,4.326251,5.296695,5.748596,4.2946,4.689516
A1CF,1.930052,3.431931,1.495802,2.499032,1.475365,1.48559,1.561409,1.940703,1.745391,3.124172,...,1.598939,1.951121,2.301461,1.819679,2.008151,3.879183,2.117793,2.091166,1.901383,1.622361
A2LD1,5.859647,4.674554,4.896483,4.840452,5.520156,5.699487,4.811058,4.320661,4.494216,5.27323,...,3.469073,3.946986,5.275123,6.208502,5.408643,5.631577,5.364709,5.228869,4.875429,4.160496
A2M,6.893537,6.270214,6.572636,6.527202,7.869559,6.944047,7.515969,7.098298,5.750414,7.139198,...,6.267216,6.424419,7.17251,7.369312,7.314822,8.808608,6.150774,5.783926,8.562068,6.597781


### Merge in sample information for each sample (adds name of structure from which it was sampled).
Also reshapes the df: columns are gene_symbols and samples are rows.

In [8]:
# generate annotated sample exp by gene
annotated_samples_exp1 = process_data.merge_sampleinfo_gene_expression(samples_gene_exp1, samples1)
annotated_samples_exp2 = process_data.merge_sampleinfo_gene_expression(samples_gene_exp2, samples2)
annotated_samples_exp3 = process_data.merge_sampleinfo_gene_expression(samples_gene_exp3, samples3)
annotated_samples_exp4 = process_data.merge_sampleinfo_gene_expression(samples_gene_exp4, samples4)

In [9]:
annotated_samples_exp1.head()

Unnamed: 0_level_0,structure_name,61E3.4,A1BG,A1CF,A2LD1,A2M,A2ML1,A3GALT2P,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,na
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,stratum pyramidale of caudal CA3,9.377013,4.249742,1.930052,5.859647,6.893537,2.734098,1.843554,2.140563,2.48696,...,6.745094,11.654925,5.211952,5.384706,1.228563,10.800131,5.124591,6.013523,6.608995,3.7734
2,outer CP in caudal cingulate cortex,9.175427,5.73107,3.431931,4.674554,6.270214,2.469216,1.659043,2.195799,3.189385,...,5.13536,11.920003,4.650096,5.920737,3.252287,10.498512,5.070245,6.222056,5.869048,3.090765
3,stratum pyramidale of caudal CA2,8.462613,4.902168,1.495802,4.896483,6.572636,3.310686,1.2778,2.472009,2.668977,...,6.62924,11.772713,5.925422,5.56108,3.288921,10.845009,3.153245,5.590283,6.391939,3.227571
4,inner CP in midcingulate cortex,9.164672,5.853085,2.499032,4.840452,6.527202,3.047076,1.879411,3.421815,1.69774,...,5.161912,12.06131,4.916696,5.276199,1.260221,10.492222,3.332065,6.044515,6.919192,3.57249
5,VZ in posterosuperior (dorsal) parietal cortex,9.130459,4.010567,1.475365,5.520156,7.869559,2.999476,1.251861,2.521641,2.038449,...,6.810025,11.900962,4.677629,5.810593,0.719841,10.078728,3.540345,5.673266,6.98162,3.049124


In [10]:
print(annotated_samples_exp1.shape)
print(annotated_samples_exp2.shape)
print(annotated_samples_exp3.shape)
print(annotated_samples_exp4.shape)

(310, 29177)
(226, 29177)
(327, 29177)
(340, 29177)


### Concatenate all samples into a single df
- Also create separate dfs for subsets of data (the 15/16 pcw fetal brains and the older 21pcw brains)

In [11]:
all_annotated_samples = pd.concat([annotated_samples_exp1, annotated_samples_exp2, annotated_samples_exp3, annotated_samples_exp4])

In [12]:
young_annotated_samples = pd.concat([annotated_samples_exp3, annotated_samples_exp4])
old_annotated_samples = pd.concat([annotated_samples_exp1, annotated_samples_exp2])

In [13]:
#drop index which has overlapping sample ids from the different donors
all_annotated_samples = all_annotated_samples.reset_index(drop=True)
young_annotated_samples = young_annotated_samples.reset_index(drop=True)
old_annotated_samples = old_annotated_samples.reset_index(drop=True)

In [14]:
print(all_annotated_samples.shape)
print(young_annotated_samples.shape)
print(old_annotated_samples.shape)

(1203, 29177)
(667, 29177)
(536, 29177)


In [15]:
all_annotated_samples.head()

Unnamed: 0,structure_name,61E3.4,A1BG,A1CF,A2LD1,A2M,A2ML1,A3GALT2P,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,na
0,stratum pyramidale of caudal CA3,9.377013,4.249742,1.930052,5.859647,6.893537,2.734098,1.843554,2.140563,2.48696,...,6.745094,11.654925,5.211952,5.384706,1.228563,10.800131,5.124591,6.013523,6.608995,3.7734
1,outer CP in caudal cingulate cortex,9.175427,5.73107,3.431931,4.674554,6.270214,2.469216,1.659043,2.195799,3.189385,...,5.13536,11.920003,4.650096,5.920737,3.252287,10.498512,5.070245,6.222056,5.869048,3.090765
2,stratum pyramidale of caudal CA2,8.462613,4.902168,1.495802,4.896483,6.572636,3.310686,1.2778,2.472009,2.668977,...,6.62924,11.772713,5.925422,5.56108,3.288921,10.845009,3.153245,5.590283,6.391939,3.227571
3,inner CP in midcingulate cortex,9.164672,5.853085,2.499032,4.840452,6.527202,3.047076,1.879411,3.421815,1.69774,...,5.161912,12.06131,4.916696,5.276199,1.260221,10.492222,3.332065,6.044515,6.919192,3.57249
4,VZ in posterosuperior (dorsal) parietal cortex,9.130459,4.010567,1.475365,5.520156,7.869559,2.999476,1.251861,2.521641,2.038449,...,6.810025,11.900962,4.677629,5.810593,0.719841,10.078728,3.540345,5.673266,6.98162,3.049124


In [16]:
all_annotated_samples.shape

(1203, 29177)

In [17]:
all_annotated_samples.groupby('structure_name').mean()

Unnamed: 0_level_0,61E3.4,A1BG,A1CF,A2LD1,A2M,A2ML1,A3GALT2P,A4GALT,A4GNT,AAAS,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,na
structure_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CP in caudal hippocampus,9.523061,4.767558,1.482926,4.782950,6.161790,2.532048,1.018464,2.404139,2.363384,8.854258,...,5.920398,11.386611,4.441848,6.101826,0.270588,10.392077,3.458293,6.264513,6.230141,2.887916
CP in caudal subicular cortex,8.276139,5.074995,2.298040,4.604232,6.952659,3.185611,1.945917,3.731330,2.311243,7.420626,...,5.112704,12.110389,5.258399,5.358638,2.018612,10.188603,2.100623,6.044459,7.122289,3.649878
CP in midlateral temporal cortex,8.953307,5.324436,2.180877,3.237411,6.532958,3.142125,1.829818,3.023866,2.235644,7.988862,...,4.459734,12.360515,5.301258,5.728014,3.702751,10.352593,3.553873,5.977050,7.098290,3.610568
CP in retrosplenial cortex,9.377530,5.412700,2.080123,4.826490,6.728611,2.819511,1.929728,2.800599,3.187157,8.955702,...,5.175566,11.933817,5.094244,5.663539,2.060827,10.259518,3.938113,6.396453,6.478038,3.138449
CP in rostral hippocampus,9.019278,5.867002,2.604324,4.917606,6.280484,3.876027,1.889324,2.408340,2.170276,7.968181,...,4.933706,12.199141,5.438430,5.466607,2.195360,10.290804,3.265081,6.039330,7.362587,3.459836
CP in temporal polar cortex,10.218990,5.412873,1.920514,5.169319,6.301188,2.311639,1.336329,2.936981,1.727422,6.892830,...,4.701984,11.327086,4.714441,5.649830,3.178321,10.389451,3.128658,6.014566,5.843980,3.075446
Dorsal claustrum,9.303352,4.613743,1.951403,3.592620,7.148480,2.748649,1.728705,2.819903,2.522568,8.629537,...,5.190845,11.419635,4.871324,5.762080,1.317876,10.431926,3.975338,6.117962,6.483335,3.236299
Edinger-Westphal nucleus (accessory oculomotor nucleus),9.158385,3.161407,1.578070,4.728864,7.104139,2.828207,1.288004,2.442327,2.451310,7.604028,...,5.920621,10.851610,5.555564,5.008340,0.118892,10.232386,2.223051,5.395534,6.885218,3.489804
IZ in caudal cingulate cortex,9.345787,4.780362,2.388353,5.118303,8.407341,3.264649,2.215342,3.226511,2.875786,8.624662,...,6.452832,12.188659,5.155326,5.528849,2.047879,10.040729,3.703834,6.230790,6.719254,3.701903
IZ in caudal midinferior temporal cortex (area TF),9.041097,4.583274,3.052268,4.918447,7.992466,3.532807,2.714306,3.237165,4.201650,8.363184,...,6.136522,11.927952,5.371269,5.382917,1.560775,9.857874,3.529329,6.189424,8.618711,3.150913


In [18]:
print(all_annotated_samples.groupby('structure_name').mean().shape)
print(young_annotated_samples.groupby('structure_name').mean().shape)
print(old_annotated_samples.groupby('structure_name').mean().shape)

(516, 29176)
(435, 29176)
(348, 29176)


### Extract out the samples which can be identified by the structure_name as being part of the the developing cortical plates/zones:
- SG
- MZ
- CP (inner and outer?)
- SP 
- IZ
- SZ (inner and outer?)

In [19]:
layer_samples = process_data.get_layer_samples(all_annotated_samples)
young_layer_samples = process_data.get_layer_samples(young_annotated_samples)
old_layer_samples = process_data.get_layer_samples(old_annotated_samples)

In [20]:
print(layer_samples.shape)
print(young_layer_samples.shape)
print(old_layer_samples.shape)

(683, 29177)
(345, 29177)
(338, 29177)


In [21]:
# how many of the different brain structures were sampled multiple times
layer_samples.groupby('structure_name').count().iloc[:, 0].value_counts()

4    76
3    64
1    59
2    54
5     4
Name: 61E3.4, dtype: int64

In [22]:
# probably shouldn't group at this point
# instead, select out layer markers as a categorical variable and groupby that
layer_samples_grouped = layer_samples.groupby('structure_name').mean()
young_layer_samples_grouped = young_layer_samples.groupby('structure_name').mean()
old_layer_samples_grouped = old_layer_samples.groupby('structure_name').mean()

In [23]:
print(layer_samples_grouped.shape)
print(young_layer_samples_grouped.shape)
print(old_layer_samples_grouped.shape)

(257, 29176)
(213, 29176)
(205, 29176)


In [24]:
layer_samples_grouped

Unnamed: 0_level_0,61E3.4,A1BG,A1CF,A2LD1,A2M,A2ML1,A3GALT2P,A4GALT,A4GNT,AAAS,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,na
structure_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CP in caudal subicular cortex,8.276139,5.074995,2.298040,4.604232,6.952659,3.185611,1.945917,3.731330,2.311243,7.420626,...,5.112704,12.110389,5.258399,5.358638,2.018612,10.188603,2.100623,6.044459,7.122289,3.649878
CP in midlateral temporal cortex,8.953307,5.324436,2.180877,3.237411,6.532958,3.142125,1.829818,3.023866,2.235644,7.988862,...,4.459734,12.360515,5.301258,5.728014,3.702751,10.352593,3.553873,5.977050,7.098290,3.610568
CP in retrosplenial cortex,9.377530,5.412700,2.080123,4.826490,6.728611,2.819511,1.929728,2.800599,3.187157,8.955702,...,5.175566,11.933817,5.094244,5.663539,2.060827,10.259518,3.938113,6.396453,6.478038,3.138449
CP in temporal polar cortex,10.218990,5.412873,1.920514,5.169319,6.301188,2.311639,1.336329,2.936981,1.727422,6.892830,...,4.701984,11.327086,4.714441,5.649830,3.178321,10.389451,3.128658,6.014566,5.843980,3.075446
IZ in caudal cingulate cortex,9.345787,4.780362,2.388353,5.118303,8.407341,3.264649,2.215342,3.226511,2.875786,8.624662,...,6.452832,12.188659,5.155326,5.528849,2.047879,10.040729,3.703834,6.230790,6.719254,3.701903
IZ in caudal midinferior temporal cortex (area TF),9.041097,4.583274,3.052268,4.918447,7.992466,3.532807,2.714306,3.237165,4.201650,8.363184,...,6.136522,11.927952,5.371269,5.382917,1.560775,9.857874,3.529329,6.189424,8.618711,3.150913
IZ in caudal perirhinal cortex,10.450880,5.550590,3.096018,2.491018,7.900211,3.770636,2.183331,3.940967,2.547928,7.641301,...,4.609684,12.182280,5.226634,5.961595,2.478577,10.259223,3.785930,6.071788,8.371401,3.764838
IZ in dorsolateral prefrontal cortex,9.348826,5.032285,2.457079,4.346956,8.032896,3.505778,1.668642,4.317183,2.207095,8.439130,...,7.688096,12.635718,6.202562,5.602972,0.661935,10.665405,4.418211,5.829885,6.471474,3.147947
IZ in dorsomedial extrastriate cortex (V2),9.703969,5.119407,2.513564,4.978707,8.411569,3.779262,2.326465,3.281004,2.992385,8.644139,...,6.248806,12.029736,5.684386,5.862393,1.975726,10.014787,4.172209,6.349983,6.438203,3.968634
IZ in dorsomedial frontal cortex,10.024616,5.110928,1.566811,5.202416,7.960082,3.055386,1.465504,2.973452,1.653284,9.418638,...,6.860085,12.185034,5.497264,5.131763,0.794174,9.700212,3.787178,5.904066,6.009488,3.267896


In [25]:
layer_samples.structure_name

1                     outer CP in caudal cingulate cortex
3                         inner CP in midcingulate cortex
4          VZ in posterosuperior (dorsal) parietal cortex
7                             SP in primary visual cortex
8            inner CP in medial temporal-occipital cortex
11      inner SZ in posterosuperior (dorsal) parietal ...
12                         VZ in caudal entorhinal cortex
14             VZ in dorsomedial extrastriate cortex (V2)
15                            VZ in primary visual cortex
16            inner SZ in dorsomedial extrastriate cortex
17            outer SZ in dorsomedial extrastriate cortex
18                      inner SZ in primary visual cortex
19                      outer SZ in primary visual cortex
20             IZ in dorsomedial extrastriate cortex (V2)
21                            IZ in primary visual cortex
22             SP in dorsomedial extrastriate cortex (V2)
23            inner CP in dorsomedial extrastriate cortex
24            

In [26]:
# extract layer markers from ['structure_name'] into categorical variable
# this works since no nulls and 
print(f"number of nulls: {layer_samples.structure_name.str.extract(r'(outer? [A-Z]{2,}|inner? [A-Z]{2,}|[A-Z]{2,})', expand=False).isnull().sum()}")
print(layer_samples.structure_name.str.extract(r'(outer? [A-Z]{2,}|inner? [A-Z]{2,}|[A-Z]{2,})', expand=False).unique())

number of nulls: 0
['outer CP' 'inner CP' 'VZ' 'SP' 'inner SZ' 'outer SZ' 'IZ' 'MZ' 'SG' 'SZ'
 'CP' 'LGE']


In [27]:
# how many samples from the LGE-VZ border region?
layer_samples.structure_name[layer_samples.structure_name.str.contains('LGE')]

652    LGE-VZ border region
965    LGE-VZ border region
Name: structure_name, dtype: object

In [28]:
# the LGE-VZ samples come from the younger fetal brains
young_annotated_samples.structure_name[young_annotated_samples.structure_name.str.contains('LGE')]

116    LGE-VZ border region
429    LGE-VZ border region
Name: structure_name, dtype: object

In [29]:
# one each from 15pcw and from 16pcw donors
annotated_samples_exp3.structure_name[annotated_samples_exp3.structure_name.str.contains('LGE')]

sample_id
117    LGE-VZ border region
Name: structure_name, dtype: object

In [30]:
annotated_samples_exp4.structure_name[annotated_samples_exp4.structure_name.str.contains('LGE')]

sample_id
103    LGE-VZ border region
Name: structure_name, dtype: object

In [31]:
#add a space to the last regex ([A-Z]) and you will select the VZ and not LGE from the 'LGE-VZ border region' structure
# is this what we actually want?
print(layer_samples.structure_name.str.extract(r'(outer? [A-Z]{2,}|inner? [A-Z]{2,}|[A-Z]{2,} )', expand=False).unique())

['outer CP' 'inner CP' 'VZ ' 'SP ' 'inner SZ' 'outer SZ' 'IZ ' 'MZ ' 'SG '
 'SZ ' 'CP ']


In [32]:
layer_samples['layer_marker'] = layer_samples.loc[:, 'structure_name'].str.extract(r'(outer? [A-Z]{2,}|inner? [A-Z]{2,}|[A-Z]{2,} )', expand=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
# no null layer markers
layer_samples.layer_marker.isnull().sum()

0

In [34]:
layer_samples.layer_marker.value_counts()

SP          99
inner CP    98
outer CP    96
VZ          83
IZ          75
MZ          72
outer SZ    60
inner SZ    58
SG          22
SZ          14
CP           6
Name: layer_marker, dtype: int64

## Final data matrix:
TODO:
- add/remove any structures prior to groupby (eg: what to do with 'LGE-VZ border region', or the 23 rows identified below such as "IZ in posterior parahippocampal cortex"
- remove the unused gene symbols
- determine what to do with the 6 samples from CP that are not defined as "outer CP" or "inner CP"

In [35]:
data_matrix = layer_samples.groupby('layer_marker').mean()

In [36]:
layer_samples.groupby('layer_marker').structure_name.count()

layer_marker
CP           6
IZ          75
MZ          72
SG          22
SP          99
SZ          14
VZ          83
inner CP    98
inner SZ    58
outer CP    96
outer SZ    60
Name: structure_name, dtype: int64

In [37]:
data_matrix

Unnamed: 0_level_0,61E3.4,A1BG,A1CF,A2LD1,A2M,A2ML1,A3GALT2P,A4GALT,A4GNT,AAAS,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,na
layer_marker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CP,9.263504,5.341734,2.106633,4.581739,6.662107,2.849651,1.816874,3.015662,2.639297,8.194904,...,4.966853,11.93324,5.092805,5.621183,2.513694,10.284867,3.432916,6.204239,6.583112,3.291873
IZ,9.721449,5.044096,2.381348,4.783662,8.210714,3.43086,2.08395,3.448331,2.6589,8.6453,...,6.251915,12.080024,5.247504,5.796943,1.742699,10.113893,3.876353,6.183512,6.718946,3.608881
MZ,10.153932,5.032147,2.490686,4.647577,7.695129,3.391987,2.09154,3.112252,2.52529,8.684954,...,5.934518,11.928072,5.106107,6.305739,2.15066,10.092704,3.893056,6.188307,6.593633,3.481697
SG,9.49314,4.495447,2.130108,5.575499,8.418385,3.052261,2.145772,3.672317,2.085803,8.728263,...,6.707415,11.717334,4.93922,5.755067,1.562925,9.815353,3.619593,5.993121,6.521102,3.48691
SP,9.422381,5.153091,2.132368,4.386287,7.452764,3.023764,2.064839,2.852441,2.475055,8.357874,...,5.506006,12.006968,5.12932,5.856573,1.616925,10.354681,3.673724,6.117645,6.48197,3.325019
SZ,9.203677,4.229729,2.112373,5.629858,7.530069,3.139615,1.740439,2.847862,2.76972,9.200567,...,7.757726,11.912856,5.099656,5.854383,1.416432,9.838449,3.513602,5.795867,6.513303,3.390654
VZ,8.888887,3.916393,1.901858,5.767095,7.308448,2.981158,1.524764,2.60396,2.067596,9.142339,...,7.882126,11.780346,4.954271,5.871684,1.502222,9.619547,3.523906,5.607129,6.949982,3.189814
inner CP,9.265179,5.305051,2.288865,3.84561,6.525049,2.972453,1.883755,2.592038,2.603016,8.517455,...,4.954797,12.016049,5.173295,5.81924,1.717434,10.421845,3.723881,6.186948,6.362026,3.253188
inner SZ,9.015022,4.034066,2.073784,5.624158,7.621815,3.15126,1.734557,2.790072,2.519314,9.260109,...,7.952739,11.766265,5.105962,6.134173,1.548699,9.802632,3.654338,5.774385,6.789792,3.394654
outer CP,9.322415,5.187203,2.496094,3.969812,6.284804,3.033176,1.924765,2.501182,2.647756,8.815236,...,4.740961,12.129538,5.218246,5.898055,1.796166,10.448065,3.991349,6.314328,6.492832,3.256586


## Further inspecting layer_samples data
- There are 200 layer samples that contain the marker "CP", but not all are classfied as either "outer CP" (96) or "inner CP" (98). Which samples are these?

In [38]:
layer_samples.structure_name[(layer_samples.structure_name.str.contains('CP')) & (~layer_samples.structure_name.str.contains('outer')) & (~layer_samples.structure_name.str.contains('inner'))]

58            CP in retrosplenial cortex
342           CP in retrosplenial cortex
554          CP in temporal polar cortex
567     CP in midlateral temporal cortex
592        CP in caudal subicular cortex
1081          CP in retrosplenial cortex
Name: structure_name, dtype: object

### NOTE:
- Samples from the subicular cortex appear. These may be considered part of archicortex
- Q: How many samples were taken from subicular cortex? Should these be kept? Are there any from presubiculum too?

In [39]:
layer_samples.structure_name[layer_samples.structure_name.str.contains('subicu')]

592    CP in caudal subicular cortex
732    VZ in caudal subicular cortex
759    SP in caudal subicular cortex
Name: structure_name, dtype: object

## Take a look at samples that were not selected as cortical layer samples. Was anything missed?
- should VZ be included in cortical layers?
- rostral migratory stream?

In [40]:
layer_samples.shape

(683, 29178)

In [41]:
# these would be samples/structures that are not considered part of the cortical layers
not_layer_samples = all_annotated_samples[~(all_annotated_samples.structure_name.str.contains('SG|MZ|CP|SP|IZ|SZ|VZ'))]

In [42]:
not_layer_samples.shape

(497, 29177)

In [43]:
not_layer_samples

Unnamed: 0,structure_name,61E3.4,A1BG,A1CF,A2LD1,A2M,A2ML1,A3GALT2P,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,na
0,stratum pyramidale of caudal CA3,9.377013,4.249742,1.930052,5.859647,6.893537,2.734098,1.843554,2.140563,2.486960,...,6.745094,11.654925,5.211952,5.384706,1.228563,10.800131,5.124591,6.013523,6.608995,3.773400
2,stratum pyramidale of caudal CA2,8.462613,4.902168,1.495802,4.896483,6.572636,3.310686,1.277800,2.472009,2.668977,...,6.629240,11.772713,5.925422,5.561080,3.288921,10.845009,3.153245,5.590283,6.391939,3.227571
5,pyramidal cells of caudal CA4,9.000264,4.529675,1.485590,5.699487,6.944047,2.797022,1.383018,2.186168,2.101121,...,8.012986,11.959759,4.646307,5.659454,2.043347,10.104067,3.140946,5.397309,7.089670,3.405972
6,lateral posterior nucleus of thalamus,8.562635,4.743105,1.561409,4.811058,7.515969,3.581545,1.373177,2.343040,3.054178,...,7.413261,11.579441,5.831313,5.313867,2.615047,10.526476,4.898312,5.920426,6.039466,3.554722
10,stratum lacunosum-moleculare of caudal CA1,10.574462,5.538220,2.356150,6.397130,7.823384,3.644281,2.489221,3.066482,1.883053,...,7.238009,10.490450,5.054913,5.526805,1.679262,9.355275,3.545557,5.782888,6.235455,3.398985
13,stratum lacunosum-moleculare of rostral CA1,10.065136,4.355705,1.984735,5.545660,7.969812,3.575142,1.962408,1.933998,2.281512,...,6.592704,11.812115,4.586670,5.178637,1.272198,9.657946,3.105636,5.566064,6.098637,2.907514
73,caudal presubiculum (postsubiculum),9.233283,5.535746,1.858805,4.588649,6.801419,2.370045,1.790506,1.455610,1.891234,...,5.807603,11.998372,5.331329,5.869557,1.402896,10.528844,3.323656,6.118657,6.476673,3.424603
74,polymorphic layer of caudal subiculum,9.179915,5.039373,2.138232,5.273030,7.425699,2.965403,2.142848,3.436019,1.755759,...,6.991967,12.536423,5.587574,5.738258,1.838488,10.555064,2.343740,6.003568,8.175087,3.711451
75,pyramidal layer of caudal subiculum,8.670561,4.797912,1.717747,5.090680,7.004094,2.468162,1.525590,2.925865,3.134569,...,6.569283,11.728751,4.542226,5.166232,1.208969,9.993448,3.128463,6.072138,6.216741,3.434267
76,molecular layer of caudal subiculum,10.558395,5.403184,1.635162,5.438049,8.202671,3.249849,1.492493,2.328908,1.104926,...,6.254745,11.191271,4.334670,5.864572,1.501257,10.173680,3.311147,5.862451,5.871991,3.164845


In [44]:
print(not_layer_samples.structure_name.unique().shape)

(246,)


In [45]:
# inspect these for any that don't belong
for name in not_layer_samples.structure_name.unique():
    print(name)

stratum pyramidale of caudal CA3
stratum pyramidale of caudal CA2
pyramidal cells of caudal CA4
lateral posterior nucleus of thalamus
stratum lacunosum-moleculare of caudal CA1
stratum lacunosum-moleculare of rostral CA1
caudal presubiculum (postsubiculum)
polymorphic layer of caudal subiculum
pyramidal layer of caudal subiculum
molecular layer of caudal subiculum
stratum oriens of caudal CA1
stratum pyramidale of caudal CA1
stratum radiatum of caudal CA1
subgranular zone of caudal dentate gyrus
granular layer of caudal dentate gyrus
superior colliculus
central nucleus of inferior colliculus
inferior nucleus of pulvinar
lateral nucleus of pulvinar
medial nucleus of pulvinar
lateral parabrachial nucleus
layer VI of area 35c
layer V of area 35c
layer IIIu of area 35c
layer III of area 35c
layer II of area 35c
nucleus coeruleus
medial parabrachial nucleus
periaqueductal gray substance, dorsolateral portion
rostral presubiculum
pyramidal cells of rostral CA4
layer VI of caudal entorhinal c

In [46]:
# which rows are not part of the layers_samples_df nor those that don't contain the layer info?
497+683

1180

### 23 other brain areas that were excluded:
- often contain layer markers, but are also part of allocortex

In [47]:
all_annotated_samples[(~all_annotated_samples.index.isin(not_layer_samples.index)) & (~all_annotated_samples.index.isin(layer_samples.index))]

Unnamed: 0,structure_name,61E3.4,A1BG,A1CF,A2LD1,A2M,A2ML1,A3GALT2P,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,na
9,MZ in posterior parahippocampal cortex,9.356545,5.271694,3.124172,5.27323,7.139198,3.985868,3.558682,3.751302,2.916571,...,6.416089,12.081693,5.570489,6.511685,2.558061,10.404612,3.114751,6.675051,8.403867,3.97846
64,IZ in posterior parahippocampal cortex,9.922014,5.002598,2.392573,4.808052,7.906407,3.415423,2.175206,2.999757,2.013918,...,6.203957,11.940437,5.519395,5.695201,1.674436,9.641493,3.676458,6.652623,5.912702,3.508994
65,SP in posterior parahippocampal cortex,9.60983,4.816781,1.942333,4.332929,7.130082,2.923953,1.642763,2.854657,3.069735,...,5.4817,12.155681,5.003442,5.739156,1.594491,10.300234,4.400127,5.943354,6.095682,3.502495
67,outer CP in posterior parahippocampal cortex,8.742081,5.303762,4.545296,4.226079,6.314282,2.698372,2.050292,2.146126,2.730736,...,5.80445,12.460551,5.057379,5.366566,2.061449,10.244296,3.844901,6.306328,6.04985,3.159107
70,VZ in posterior parahippocampal cortex,8.711403,4.278586,1.545458,5.875086,7.441561,3.501665,1.48183,2.76009,2.755906,...,7.07579,11.61985,5.700169,6.111549,2.545074,9.576741,3.276225,5.842342,5.933756,3.971651
72,SZ in posterior parahippocampal cortex,8.994894,4.648288,2.052584,6.330385,7.106441,3.08339,2.04683,2.236986,2.898786,...,7.712512,12.381219,5.154612,6.291172,2.320694,9.865293,2.97887,6.025324,6.412626,3.757621
168,inner CP in posterior parahippocampal cortex,8.866994,5.160505,2.196297,4.73686,6.37809,2.593506,2.138725,2.826343,3.08673,...,5.602192,12.004928,5.172568,5.940504,2.846103,10.243197,3.72342,6.457394,6.354918,3.535268
560,VZ in posterior parahippocampal cortex,8.632376,3.258824,1.752862,6.112296,6.065953,3.72823,1.463905,2.506027,1.811569,...,8.301008,11.832863,4.974109,5.759735,1.471811,9.360964,3.161225,5.342718,8.326822,3.157492
561,inner CP in posterior parahippocampal cortex,9.393758,4.359741,1.914076,3.250613,7.994045,3.180778,1.591452,2.565236,1.990557,...,5.07335,12.785655,5.079853,5.604897,1.919829,10.459587,3.54804,6.137617,5.854169,3.11392
589,SP in caudal hippocampal proper,8.681311,5.094963,2.398815,4.987837,7.692091,3.072596,1.780741,3.361591,2.152374,...,5.582862,12.639965,5.225737,5.313863,1.933853,10.270293,4.512285,6.026204,7.836637,3.736189
