In [1]:
import pandas as pd
import os

from maca import clean_annotation, clean_labels

metadata_folder = os.path.join('..', 'metadata' )

annotation_folder = os.path.join(metadata_folder, 'manual_annotations')
# annotation_folder

summary_folder = os.path.join(metadata_folder, 'summary')
! mkdir $summary_folder

%load_ext autoreload
%autoreload 2


mkdir: ../metadata/summary: File exists


In [2]:
from maca import clean_annotation, clean_labels
import glob
import os

# Cleaned per-tissue annotation tables, plus the untouched originals which are
# kept so the two can be compared afterwards
dfs = []
original_dfs = []

# Tissue folder names to leave out; an empty list means every tissue is processed
skip_tissues = []

globber = os.path.join(annotation_folder, '*', '*.csv')


def _join_labels(row):
    """Return "annotation: subannotation", or just the annotation when there is no subannotation."""
    if pd.notnull(row['subannotation']):
        return '{annotation}: {subannotation}'.format(**row)
    return row['annotation']


# glob returns paths in arbitrary, filesystem-dependent order. Sorting makes the
# concatenation order reproducible, which matters because duplicated cell ids are
# resolved elsewhere by keeping the first occurrence.
for csv in sorted(glob.glob(globber)):
    # Only files with "10x" somewhere in their path are handled here
    if '10x' not in csv.lower():
        continue
    basename = os.path.basename(csv)
    if basename == 'MACA_Metadata.csv':
        continue

    # The tissue is the name of the folder that contains the csv. os.path is used
    # instead of splitting on '/' so this also works with Windows path separators.
    tissue = os.path.basename(os.path.dirname(csv))
    if tissue.startswith('annotation_cleaning_double_check'):
        continue
    if tissue in skip_tissues:
        continue

    print('\n-', tissue,  '-')
    if tissue == "Lung":
        # Only the V1/V2 columns are read for Lung: the first becomes the index
        # and the remaining one is renamed to 'annotation'
        df = pd.read_csv(csv, usecols=["V1", "V2"], index_col=0)
        df.columns = ['annotation']
    else:
        df = pd.read_csv(csv, index_col=0)

    # Summarise the raw labels, falling back to coarser summaries when the
    # 'subannotation' and/or 'annotation' columns are missing
    try:
        sizes = df.fillna('.').groupby(['annotation', 'subannotation']).size()
    except KeyError:
        try:
            sizes = df.fillna('.').groupby(['annotation']).size()
        except KeyError:
            sizes = df.head()
            print('-- no "annotation" or "subannotation" columns --', )

    print('-- before cleaning --\n', sizes)

    original_dfs.append(df)

    # NOTE(review): debug=True presumably makes clean_annotation print its
    # intermediate tables -- confirm against maca.clean_annotation
    df = clean_annotation(df, tissue, debug=True)
    sizes = df.fillna('..').groupby(['annotation', 'subannotation']).size()
    print('--- after cleaning ---\n', sizes)
    df['tissue'] = tissue

    dfs.append(df)


combined = pd.concat(dfs)

# Duplicated cell ids are NOT dropped from `combined` here. The original note
# mentioned Aorta cells that also carry a Heart annotation; if such duplicates
# ever reappear, `combined.loc[~combined.index.duplicated()]` keeps only the
# first occurrence of each id.

combined['annotation_subannotation'] = combined.apply(_join_labels, axis=1)

combined.head()


- Bladder -
-- before cleaning --
 annotation         subannotation
Basal              .                266
Endothelial cells  .                 68
Immune cells       .                 57
LuminalA1          .                313
LuminalA2          .                219
LuminalB           .                391
MesenchymalA1      .                169
MesenchymalA2      .                501
MesenchymalB1      .                187
MesenchymalB2      .                329
dtype: int64
		--- after cleaning labels ---
annotation         subannotation
basal              -                266
endothelial_cells  -                 68
immune_cells       -                 57
luminala           -                532
luminalb           -                391
mesenchymala       -                670
mesenchymalb       -                516
dtype: int64
			---- After tissue-specific cleaning ----
annotation         subannotation
basal cells        -                266
endothelial_cells  -                 68
imm

--- after cleaning ---
 annotation      subannotation   
b_cells         ..                   205
ciliated        ..                    41
club            ..                     5
dendritic       ..                   225
endothelial     ..                   425
epithelial      alveolar_type_ii      89
immune          ..                   151
macrophages     alveolar             345
                interstitial         111
mast            ..                    22
mesothelial     ..                    24
monocytes       ..                   145
natural_killer  ..                   832
stromal         ..                  2534
t_cells         ..                   246
unknown         ..                    49
dtype: int64

- Mammary -
-- before cleaning --
 annotation                                   subannotation
B cells                                      .                 764
Basal Epithelial cells                       .                 393
Endothelial cells                            

			---- After tissue-specific cleaning ----
annotation       subannotation           
b_cells          -                            282
                 follicular                  4906
                 marginal_zone               1440
                 proliferative                 98
dendritic_cells  -                            102
                 plasmocytoid                  42
macrophages      -                            395
t_cells          -                             92
                 cd4+                         905
                 cd8+                         946
                 invariant_natural_killer     120
                 natural_killer_cells         224
dtype: int64
--- after cleaning ---
 annotation   subannotation           
b_cells      ..                           282
             follicular                  4906
             marginal_zone               1440
             proliferative                 98
dendritic    ..                           102
         

Unnamed: 0,annotation,subannotation,tissue,annotation_subannotation
10X_P4_3_AAAGTAGAGATGCCAG,mesenchymal,b,Bladder,mesenchymal: b
10X_P4_3_AACCGCGTCCAACCAA,mesenchymal,a,Bladder,mesenchymal: a
10X_P4_3_AACTCCCGTCGGGTCT,mesenchymal,a,Bladder,mesenchymal: a
10X_P4_3_AACTCTTAGTTGCAGG,luminal,a,Bladder,luminal: a
10X_P4_3_AACTCTTTCATAACCG,mesenchymal,a,Bladder,mesenchymal: a


In [3]:
from IPython.display import display, HTML

# Stack the untouched per-tissue tables into one frame indexed by cell id
original_annotations = pd.concat(original_dfs)
print(original_annotations.shape)

# Heart and aorta cells were annotated twice; keep the first row seen per cell id
is_repeat = original_annotations.index.duplicated()
original_annotations = original_annotations.loc[~is_repeat]
print(original_annotations.shape)

# Put cleaned and original labels side by side, one row per cell
combined_with_original = combined.join(
    original_annotations, lsuffix='_cleaned', rsuffix='_original')
print(combined_with_original.shape)

cols = [
    'tissue',
    'annotation_original',
    'subannotation_original',
    'annotation_cleaned',
    'subannotation_cleaned',
]

# Count cells for every (tissue, original label, cleaned label) combination.
# Missing labels are filled with '.' first so they are counted as a group.
combined_sizes = (
    combined_with_original
    .fillna('.')
    .groupby(cols)
    .size()
    .reset_index(name='n_cells')
)
print(combined_sizes.shape)

# One table per tissue so each mapping can be checked by eye
for tissue, df in combined_sizes.groupby('tissue'):
    display(df)

(54837, 2)
(54837, 2)
(54837, 6)
(125, 6)


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
0,Bladder,Basal,.,basal,.,266
1,Bladder,Endothelial cells,.,endothelial,.,68
2,Bladder,Immune cells,.,immune,.,57
3,Bladder,LuminalA1,.,luminal,a,313
4,Bladder,LuminalA2,.,luminal,a,219
5,Bladder,LuminalB,.,luminal,b,391
6,Bladder,MesenchymalA1,.,mesenchymal,a,169
7,Bladder,MesenchymalA2,.,mesenchymal,a,501
8,Bladder,MesenchymalB1,.,mesenchymal,b,187
9,Bladder,MesenchymalB2,.,mesenchymal,b,329


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
10,Heart,Blood_Cells,.,erythrocytes,.,55
11,Heart,CMs,.,cardiomyocytes,.,83
12,Heart,Coronary_vascular_EDC,.,endothelial,coronary_vascular,178
13,Heart,Endocardial_EDC,.,endothelial,endocardial,65
14,Heart,Fb,.,fibroblasts,.,157
15,Heart,Fb_SMC,Fibroblast,fibroblasts,.,65
16,Heart,Fb_SMC,Smooth_Muscle_Cells,smooth_muscle,.,21


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
17,Kidney,Collecting duct cells,.,collecting_duct,.,72
18,Kidney,Feneserated capillary cells,.,fenestrated_capillary,.,345
19,Kidney,Fibroblasts,.,fibroblasts,.,113
20,Kidney,Macrophages,.,macrophages,.,181
21,Kidney,Proximal ascending tubule,.,tubule,proximal_ascending,352
22,Kidney,Smooth muscle cells,.,smooth_muscle,.,50
23,Kidney,Thick Ascending tubule,.,tubule,thick_ascending,473
24,Kidney,connecting tubule cells,.,tubule,connecting,245
25,Kidney,endothelial cells,.,endothelial,.,45
26,Kidney,other immune,.,immune,.,54


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
28,Liver,F-hep-midlobule,F-Hamp-high,hepatocytes,midlobular,50
29,Liver,F-hep-pericentral,"F-Cyp2e1, Glul",hepatocytes,pericentral,258
30,Liver,F-hep-periportal,F-Cyp2f2,hepatocytes,periportal,203
31,Liver,M-hep-midlobule,M-Mup-high,hepatocytes,midlobular,214
32,Liver,M-hep-pericentral,"M-Cyp2e1, Glul",hepatocytes,pericentral,115
33,Liver,M-hep-periportal,M-Cyp2f2,hepatocytes,periportal,166
34,Liver,endothelial,.,endothelial,.,20


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
35,Lung,Alveolar Macrophages,.,macrophages,alveolar,345
36,Lung,B Cells,.,b_cells,.,205
37,Lung,Ciliated Cells,.,ciliated,.,41
38,Lung,Club Cells,.,club,.,5
39,Lung,Dendritic Cells,.,dendritic,.,225
40,Lung,Endothelial Cell Type I,.,endothelial,.,349
41,Lung,Endothelial Cell Type IV,.,endothelial,.,76
42,Lung,Interstital Macrophages,.,macrophages,interstitial,111
43,Lung,Mast Cells,.,mast,.,22
44,Lung,Mesothelial Cells,.,mesothelial,.,24


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
57,Mammary,B cells,.,b_cells,.,764
58,Mammary,Basal Epithelial cells,.,basal,.,393
59,Mammary,Endothelial cells,.,endothelial,.,251
60,Mammary,Hormone responsive luminal epithelial cells,.,luminal,hormone_responsive,190
61,Mammary,Luminal epithelial progenitors Cd14+,.,luminal,progenitors,240
62,Mammary,Macrophages,.,macrophages,.,186
63,Mammary,S100a4+/Ccl5+ cells,.,stromal,s100a4+_and_ccl5+,57
64,Mammary,Stromal cells,.,stromal,.,581
65,Mammary,Stromal cells Mustn1+/Procr+,.,stromal,mustn1+_and_procr+,59
66,Mammary,T cells,.,t_cells,.,1467


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
68,Marrow,Erythrocytes,.,erythrocytes,.,142
69,Marrow,Erythroid-progenitors_Erythroblasts,.,progenitors,erythroid,268
70,Marrow,Granulocyte-progenitors,.,progenitors,granulocyte,330
71,Marrow,Granulocytes,.,granulocytes,.,770
72,Marrow,Immature-B,.,b_cells,immature,299
73,Marrow,Macrophages,.,macrophages,.,222
74,Marrow,Mature-B,.,b_cells,mature,91
75,Marrow,Megakaryocyte-progenitors,.,progenitors,megakaryocyte,62
76,Marrow,Monocyte-progenitors,.,progenitors,mature,322
77,Marrow,Monocytes,.,monocytes,.,522


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
81,Muscle,.,.,unknown,.,312
82,Muscle,B cells,.,b_cells,.,463
83,Muscle,T cells,.,t_cells,.,323
84,Muscle,chondrogenic cells,.,chondrogenic,.,380
85,Muscle,endothelial cells,.,endothelial,.,1290
86,Muscle,fibro-/adipogenic progenitors,.,mesenchymal_stem,.,1111
87,Muscle,macrophages,.,macrophages,.,308
88,Muscle,satellite cells,.,satellite,.,349


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
89,Spleen,10. Dendritic cells,.,dendritic,.,102
90,Spleen,11. Proliferative B cells,.,b_cells,proliferative,98
91,Spleen,12. Macrophages 2,.,macrophages,.,96
92,Spleen,13 T cells,.,t_cells,.,92
93,Spleen,14. Plasmocytoid Dendritic cells,.,dendritic,plasmocytoid,42
94,Spleen,2. Marginal Zone B cells,.,b_cells,marginal_zone,1440
95,Spleen,3. CD8+ T Cells,.,t_cells,cd8+,946
96,Spleen,4. Cd4+ T cells,.,t_cells,cd4+,905
97,Spleen,6. Macrophages 1,.,macrophages,.,299
98,Spleen,7. B cells,.,b_cells,.,282


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
102,Thymus,DN-Stage 4c,.,t_cells,double_negative-stage4c,43
103,Thymus,DN-Stage3,.,t_cells,double_negative-stage3,35
104,Thymus,DN-Stage4b,.,t_cells,double_negative-stage4b,211
105,Thymus,immature SP Cd8,.,t_cells,immature_single_positive_cd8+,194
106,Thymus,mix of cells,"Single positive Cd8, possibly mature",t_cells,"single_positive_cd8+,_possibly_mature",117
107,Thymus,mix of cells,stromal_mesenchymal cells,stromal_mesenchymal,.,81
108,Thymus,stromal mesenchymal cell,.,stromal_mesenchymal,.,31
109,Thymus,thymocyte-DP,.,t_cells,double_positive,381
110,Thymus,thymocyte-DP-proliferating,.,t_cells,double_positive-proliferating,336


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
111,Tongue,Basal_cells,.,basal,basal,4242
112,Tongue,Maturing/nonkeratinized,.,keratinocytes,suprabasal,1454
113,Tongue,Proliferating cells,.,basal,proliferating,843
114,Tongue,keratinocytes,.,basal,keratinocytes,962
115,Tongue,unknown_cell,.,unknown,.,37


Unnamed: 0,tissue,annotation_original,subannotation_original,annotation_cleaned,subannotation_cleaned,n_cells
116,Trachea,Epcam,.,epithelial,.,246
117,Trachea,Epcam,Foxj1,epithelial,ciliated_cells,20
118,Trachea,Epcam,Krt5,epithelial,basal_cells,36
119,Trachea,Epcam,Krt5/Krt14,epithelial,krt5/krt14,443
120,Trachea,Epcam,Scgb1a1,epithelial,secretory_cells,140
121,Trachea,Pdgfrb,.,stromal,.,7904
122,Trachea,Pdgfrb + Ptprc,.,neuroendocrine,.,543
123,Trachea,Pecam1,.,endothelial,.,1039
124,Trachea,Ptprc,.,immune,.,898


### Write combined original vs cleaned

In [4]:
# Destination for the per-tissue original-vs-cleaned label counts
csv = os.path.join(summary_folder, 'maca_3month_original_vs_cleaned_10x.csv')
# index=False: the row index is only a running counter and is not worth saving
combined_sizes.to_csv(path_or_buf=csv, index=False)

## Make sure there are no duplicated cell ids

In [5]:
# Every cell id must appear exactly once in the combined table. Fail loudly
# instead of relying on someone reading the displayed count.
n_duplicated = combined.index.duplicated().sum()
assert n_duplicated == 0, \
    '{} duplicated cell ids in the combined annotations'.format(n_duplicated)
n_duplicated

0

## Make sure there are 12 tissues

In [6]:
# Alphabetical list of every tissue present in the combined annotations
tissues = sorted(combined['tissue'].unique())
# Enforce the expected tissue count rather than only displaying the list
assert len(tissues) == 12, \
    'Expected 12 tissues, found {}: {}'.format(len(tissues), tissues)
tissues

['Bladder',
 'Heart',
 'Kidney',
 'Liver',
 'Lung',
 'Mammary',
 'Marrow',
 'Muscle',
 'Spleen',
 'Thymus',
 'Tongue',
 'Trachea']

In [7]:
# Show the tissue list as a numbered column
pd.Series(data=tissues)

0     Bladder
1       Heart
2      Kidney
3       Liver
4        Lung
5     Mammary
6      Marrow
7      Muscle
8      Spleen
9      Thymus
10     Tongue
11    Trachea
dtype: object

    0                    Aorta
    1                  Bladder
    2     Brain_FACS_microglia
    3       Brain_FACS_neurons
    4                    Colon
    5                Diaphragm
    6                      Fat
    7                    Heart
    8                   Kidney
    9                    Liver
    10                    Lung
    11           Mammary_Gland
    12                  Marrow
    13                  Muscle
    14                Pancreas
    15                    Skin
    16                  Spleen
    17                  Thymus
    18                  Tongue
    19                 Trachea
    dtype: object

In [8]:
# Number of annotated cells in each tissue
combined.groupby(by='tissue').size()


tissue
Bladder     2500
Heart        624
Kidney      2781
Liver       1026
Lung        5449
Mammary     4481
Marrow      3652
Muscle      4536
Spleen      9552
Thymus      1429
Tongue      7538
Trachea    11269
dtype: int64

In [9]:
# Let the tables displayed below show up to 100 rows before pandas truncates them
pd.set_option('display.max_rows', 100)

In [10]:
# Cells per cleaned annotation across all tissues, largest group first
annotation_sizes = (
    combined
    .groupby('annotation')
    .size()
    .sort_values(ascending=False)
    .to_frame(name='n_cells')
)
print(annotation_sizes.shape)
annotation_sizes

(34, 1)


Unnamed: 0_level_0,n_cells
annotation,Unnamed: 1_level_1
stromal,11135
b_cells,8611
basal,6706
t_cells,6095
endothelial,3381
tubule,1921
macrophages,1748
keratinocytes,1454
progenitors,1381
luminal,1353


In [11]:
# Save the per-annotation cell counts; the index is kept because it holds the
# annotation labels
csv = os.path.join(summary_folder, 'maca_3month_annotation_counts_10x.csv')
annotation_sizes.to_csv(path_or_buf=csv)

In [12]:
# Number of distinct tissues in the combined annotations
combined['tissue'].nunique()

12

### Write annotations

In [13]:
# The full per-cell annotation table goes in the metadata folder itself, not
# the summary subfolder; the index is kept because it holds the cell ids
csv = os.path.join(metadata_folder, 'maca_3month_annotations_10x.csv')
combined.to_csv(path_or_buf=csv)

In [14]:
def _label_length(group):
    """Return the number of characters in the annotation label of ``group``'s first row."""
    return len(group['annotation'].iloc[0])


# Character length of every cleaned annotation label, longest first
combined.groupby('annotation').apply(_label_length).sort_values(ascending=False)

annotation
fenestrated_capillary    21
stromal_mesenchymal      19
mesenchymal_stem         16
collecting_duct          15
neuroendocrine           14
cardiomyocytes           14
natural_killer           14
keratinocytes            13
smooth_muscle            13
granulocytes             12
chondrogenic             12
erythrocytes             12
progenitors              11
mesothelial              11
mesenchymal              11
macrophages              11
endothelial              11
hepatocytes              11
fibroblasts              11
epithelial               10
satellite                 9
monocytes                 9
dendritic                 9
ciliated                  8
unknown                   7
luminal                   7
stromal                   7
t_cells                   7
b_cells                   7
immune                    6
tubule                    6
basal                     5
club                      4
mast                      4
dtype: int64