In [6]:
from io import StringIO

import pandas as pd
import scanpy as sc
import sourmash

### Clean bat BAM channel metadata file to join onto adata

In [52]:
# Load the BAM channel metadata table from disk.
bam_channel_metadata = pd.read_csv(
    filepath_or_buffer='/home/olga/data_sm/batlas/Bat_Bams/metadata.csv',
)

In [53]:
# Show the raw column labels as an array so any unexpected whitespace in a
# header is visible before the columns are used as join keys.
bam_channel_metadata.columns.to_numpy()

array(['Folder', 'Sex', 'Tissue', 'Animal ID'], dtype=object)

In [54]:
# Report (rows, columns), then preview the first five rows.
print(bam_channel_metadata.shape)
bam_channel_metadata.head(n=5)

(34, 4)


Unnamed: 0,Folder,Sex,Tissue,Animal ID
0,bat2-BL,M,bladder,2
1,bat2-BM,M,bone marrow,2
2,bat2-BR,M,brain,2
3,bat2-FTB,M,brown fat,2
4,bat2-KD,M,kidney,2


In [75]:
# Rows for channels that are absent from metadata.csv, written inline as CSV.
# The column order differs from the file on disk; pd.concat aligns by column
# name, so the order here does not matter.
missing_channels_csv = """Folder,Animal ID,Tissue,Sex
bat2-FTW,2,white fat,M
bat2-CO,2,colon,M
bat3-FTB,3,brown fat,M
bat3-FTW,3,white fat,M
"""

missing_channel_metadata = pd.read_csv(StringIO(missing_channels_csv))

# ignore_index=True renumbers the combined rows 0..n-1. Without it the
# appended rows keep their own labels 0-3, which duplicate labels already
# present in bam_channel_metadata.
bam_channel_metadata_full = pd.concat(
    [bam_channel_metadata, missing_channel_metadata],
    ignore_index=True,
)

print(bam_channel_metadata_full.shape)
bam_channel_metadata_full.head()

(38, 4)


Unnamed: 0,Folder,Sex,Tissue,Animal ID
0,bat2-BL,M,bladder,2
1,bat2-BM,M,bone marrow,2
2,bat2-BR,M,brain,2
3,bat2-FTB,M,brown fat,2
4,bat2-KD,M,kidney,2


In [76]:
# Re-index the folder table by the three keys that are later matched against
# adata.obs, leaving Folder as the only data column.
bat_metadata_index = bam_channel_metadata_full.set_index(["Animal ID", "Tissue", "Sex"])
# A bare `.shape` expression in the middle of a cell is evaluated and thrown
# away, so print it explicitly (as the other cells do).
print(bat_metadata_index.shape)
bat_metadata_index.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Folder
Animal ID,Tissue,Sex,Unnamed: 3_level_1
2,bladder,M,bat2-BL
2,bone marrow,M,bat2-BM
2,brain,M,bat2-BR
2,brown fat,M,bat2-FTB
2,kidney,M,bat2-KD


### Read in and clean adata

In [119]:
# Location of the raw droplet AnnData file.
h5ad = '/home/olga/data_sm/batlas/baca-droplet-raw.h5ad'
# Read it, then end the cell on the object so its summary is displayed.
adata = sc.read(filename=h5ad)
adata

AnnData object with n_obs × n_vars = 24201 × 11878
    obs: 'animalID', 'batch', 'sex', 'tissue', 'method', 'n_genes', 'n_counts'
    var: 'n_cells'

In [58]:
# Preview the first five rows of the per-cell annotations.
adata.obs.head(n=5)

Unnamed: 0_level_0,animalID,batch,sex,tissue,method,n_genes,n_counts,louvain
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAACCTGAGAGGGCTT-1-0,3,0,male,Whole Blood,droplet,1443,10000.0,21
AAAGATGTCAGCTTAG-1-0,3,0,male,Whole Blood,droplet,1607,10000.0,10
AACCATGGTTTGTTTC-1-0,3,0,male,Whole Blood,droplet,1873,10000.000977,10
AACTCCCTCGCATGAT-1-0,3,0,male,Whole Blood,droplet,1328,10000.000977,10
AAGGCAGCAAGCGCTC-1-0,3,0,male,Whole Blood,droplet,1427,10000.0,21


In [111]:
# Preview the first five rows of the per-gene annotations.
adata.var.head(n=5)

Unnamed: 0_level_0,n_cells,means,dispersions,dispersions_norm,highly_variable
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GORAB,3053,0.21183,0.765241,-0.32828,False
PRRX1,1415,0.210433,1.767581,1.722931,True
MROH9,69,0.006136,1.164431,0.488631,False
FMO3,1877,0.410142,2.888616,2.947184,True
FMO2,3036,0.908601,3.398133,2.438222,True


In [59]:
# Derive join keys on adata.obs that follow the BAM metadata conventions:
#   tissue_lower - tissue name in lower case
#   sex_abbrev   - first letter of sex, upper-cased ("male" -> "M")
#   animalID     - cast to int
adata.obs["tissue_lower"] = adata.obs["tissue"].str.lower()
adata.obs["sex_abbrev"] = adata.obs["sex"].str.get(0).str.upper()
adata.obs["animalID"] = adata.obs["animalID"].astype(int)
adata.obs.head(n=5)

Unnamed: 0_level_0,animalID,batch,sex,tissue,method,n_genes,n_counts,louvain,tissue_lower,sex_abbrev
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAACCTGAGAGGGCTT-1-0,3,0,male,Whole Blood,droplet,1443,10000.0,21,whole blood,M
AAAGATGTCAGCTTAG-1-0,3,0,male,Whole Blood,droplet,1607,10000.0,10,whole blood,M
AACCATGGTTTGTTTC-1-0,3,0,male,Whole Blood,droplet,1873,10000.000977,10,whole blood,M
AACTCCCTCGCATGAT-1-0,3,0,male,Whole Blood,droplet,1328,10000.000977,10,whole blood,M
AAGGCAGCAAGCGCTC-1-0,3,0,male,Whole Blood,droplet,1427,10000.0,21,whole blood,M


In [77]:
# Look up each cell's BAM folder by matching the (animalID, tissue_lower,
# sex_abbrev) columns of adata.obs against the (Animal ID, Tissue, Sex) index
# levels of bat_metadata_index. DataFrame.join is a left join by default, so a
# cell without a match keeps its row and gets NaN in Folder.
adata_joined_bat_bam = adata.obs.join(
    bat_metadata_index,
    on=["animalID", "tissue_lower", "sex_abbrev"],
)

# A duplicated key in bat_metadata_index would silently repeat cells in the
# result; fail loudly here rather than carry duplicated barcodes forward.
assert len(adata_joined_bat_bam) == len(adata.obs), (
    "join changed the number of cells; check bat_metadata_index for duplicate keys"
)

print(adata_joined_bat_bam.shape)
adata_joined_bat_bam.head()

(24201, 11)


Unnamed: 0_level_0,animalID,batch,sex,tissue,method,n_genes,n_counts,louvain,tissue_lower,sex_abbrev,Folder
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAACCTGAGAGGGCTT-1-0,3,0,male,Whole Blood,droplet,1443,10000.0,21,whole blood,M,bat3-WB
AAAGATGTCAGCTTAG-1-0,3,0,male,Whole Blood,droplet,1607,10000.0,10,whole blood,M,bat3-WB
AACCATGGTTTGTTTC-1-0,3,0,male,Whole Blood,droplet,1873,10000.000977,10,whole blood,M,bat3-WB
AACTCCCTCGCATGAT-1-0,3,0,male,Whole Blood,droplet,1328,10000.000977,10,whole blood,M,bat3-WB
AAGGCAGCAAGCGCTC-1-0,3,0,male,Whole Blood,droplet,1427,10000.0,21,whole blood,M,bat3-WB


In [78]:
# Select the cells whose folder lookup produced no match and count them per
# animal and tissue. (Despite its name, `null_columns` holds rows.)
folder_is_missing = adata_joined_bat_bam["Folder"].isna()
null_columns = adata_joined_bat_bam.loc[folder_is_missing]
null_columns.groupby(["animalID", "tissue"], observed=True).size()


Series([], dtype: int64)

### make new adata columns

In [118]:
# Build a per-cell identifier of the form <Folder>__<row index>, e.g.
# bat3-WB__AAACCTGAGAGGGCTT-1-0, and store it as a new column on the joined
# table.
cell_barcodes = adata_joined_bat_bam.index
adata_joined_bat_bam["bam_channel_barcode"] = (
    adata_joined_bat_bam["Folder"] + "__" + cell_barcodes
)
print(adata_joined_bat_bam.shape)
adata_joined_bat_bam.head(n=5)


(24201, 12)


Unnamed: 0_level_0,animalID,batch,sex,tissue,method,n_genes,n_counts,louvain,tissue_lower,sex_abbrev,Folder,bam_channel_barcode
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AAACCTGAGAGGGCTT-1-0,3,0,male,Whole Blood,droplet,1443,10000.0,21,whole blood,M,bat3-WB,bat3-WB__AAACCTGAGAGGGCTT-1-0
AAAGATGTCAGCTTAG-1-0,3,0,male,Whole Blood,droplet,1607,10000.0,10,whole blood,M,bat3-WB,bat3-WB__AAAGATGTCAGCTTAG-1-0
AACCATGGTTTGTTTC-1-0,3,0,male,Whole Blood,droplet,1873,10000.000977,10,whole blood,M,bat3-WB,bat3-WB__AACCATGGTTTGTTTC-1-0
AACTCCCTCGCATGAT-1-0,3,0,male,Whole Blood,droplet,1328,10000.000977,10,whole blood,M,bat3-WB,bat3-WB__AACTCCCTCGCATGAT-1-0
AAGGCAGCAAGCGCTC-1-0,3,0,male,Whole Blood,droplet,1427,10000.0,21,whole blood,M,bat3-WB,bat3-WB__AAGGCAGCAAGCGCTC-1-0


In [112]:
# Load the cell annotation table, using the first CSV column as the row index.
bat_annotate = pd.read_csv(
    "/home/olga/data_sm/batlas/bat_annotate.csv",
    index_col=0,
)
print(bat_annotate.shape)
bat_annotate.head(n=5)

(82924, 15)


Unnamed: 0_level_0,animalID,batch,sex,tissue,method,n_genes,n_counts,louvain,clustertype,cluster,celltype,celltype_tiss,clustercellnumber,newcelltype,clustertonumber
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACCTGAGAGGGCTT-1-0,3,0,male,Whole Blood,10x,2370,6673.0,0,T cell,Whole Blood_T cell,Immune cell,Whole Blood_Immune cell,354,T cell,181
AAACGGGAGACGCAAC-1-0,3,0,male,Whole Blood,10x,1196,2132.0,12,Non-classical monocyte,Whole Blood_Non-classical monocyte,Immune cell,Whole Blood_Immune cell,488,Other immune cell,180
AAAGATGTCAGCTTAG-1-0,3,0,male,Whole Blood,10x,2647,7631.0,8,Classical monocyte,Whole Blood_Classical monocyte,Immune cell,Whole Blood_Immune cell,229,Other immune cell,178
AAAGCAAAGCCCAACC-1-0,3,0,male,Whole Blood,10x,556,1048.0,5,Classical monocyte,Whole Blood_Classical monocyte,Immune cell,Whole Blood_Immune cell,229,Other immune cell,178
AAAGCAATCAACACCA-1-0,3,0,male,Whole Blood,10x,661,1256.0,0,T cell,Whole Blood_T cell,Immune cell,Whole Blood_Immune cell,354,T cell,181


In [None]:
# Display the joined per-cell table (obs columns plus Folder and bam_channel_barcode).
adata_joined_bat_bam

In [None]:
# Attach the annotation columns to each cell by matching row indexes
# (DataFrame.join defaults to an index-on-index left join).
# Both tables contain animalID, batch, sex, tissue, method, n_genes, n_counts
# and louvain. DataFrame.join raises "columns overlap but no suffix specified"
# for shared column names unless a suffix is supplied, so the copies coming
# from bat_annotate are suffixed with "_annotate".
adata_joined_bat_bam.join(
    bat_annotate,
    rsuffix="_annotate",
)

In [113]:
# Load the concatenated human / lemur / mouse lung AnnData object so the bat
# cluster types can be compared against its cell_ontology_class labels.
adata_mhl = sc.read(
    filename="/home/olga/data_sm/tabula-microcebus/data-objects/cross-species/concatenated__human-lung--lemur-lung--mouse-lung__10x__one2one_orthologs__unified_compartments__bbknn.h5ad"
)



This is where adjacency matrices should go now.

This is where adjacency matrices should go now.


In [114]:
# Print the AnnData summary: dimensions and the names of its annotation fields.
print(adata_mhl)


AnnData object with n_obs × n_vars = 87909 × 13350
    obs: 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'mouse.id', 'patient', 'possibly_contaminated_barcode', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'n_counts', 'log_counts', 'sqrt_counts', 'n_genes'
    var: 'mouse_lemur__gene_name', 'mouse__gene_name', 'gene_ids-lemur', 'n_cells-mouse', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'age_colors', 'broad_group_colors', 'cell_ontology_class_colors', 'compartment_group_colors', 'individual_colors', 'narrow_group_colors', 'neighbors', 'pca', 'sex_colors', 'species_batch_colors', 'species_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances

In [116]:
# Distinct cell_ontology_class labels present in the cross-species object.
set(adata_mhl.obs["cell_ontology_class"])

{'B cell',
 'CD4-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta T cell',
 'NK cell',
 'T cell',
 'adventitial cell',
 'alveolar macrophage',
 'bronchial smooth muscle cell',
 'capillary endothelial cell',
 'ciliated columnar cell of tracheobronchial tree',
 'classical monocyte',
 'conventional dendritic cell',
 'endothelial cell of artery',
 'endothelial cell of lymphatic vessel',
 'fibroblast',
 'fibroblast of lung',
 'immature NK T cell',
 'intermediate monocyte',
 'lung ciliated cell',
 'lung endothelial cell',
 'lymphocyte',
 'macrophage',
 'mature NK T cell',
 'monocyte',
 'myeloid dendritic cell',
 'myofibroblast cell',
 'naive thymus-derived CD4-positive, alpha-beta T cell',
 'naive thymus-derived CD8-positive, alpha-beta T cell',
 'natural killer cell',
 'non-classical monocyte',
 'pericyte cell',
 'plasma cell',
 'plasmacytoid dendritic cell',
 'pulmonary interstitial fibroblast',
 'smooth muscle cell of the pulmonary artery',
 'stromal cell',
 'type II pneumocyte',
