In [1]:
import os

from IPython.display import display
import pandas as pd
import scanpy as sc

### Utility Functions

<hr>

In [2]:
def read_and_describe(fn, **kargs):
    df = pd.read_csv(fn, **kargs)
    print(df.shape)
    display(df.head())
    return df


### Orthologs Mapping

<hr>

In [3]:
orthologs_mapping = read_and_describe(
    "/home/olga/data_lg/data_sm_copy/immune-evolution/rawdata/orthologs-combined/ncbi_mgi_ensembl__mouse-lemur_human_mouse__orthologs__gene_names__one2one___ensembl_v101__mRhiFer1_v1__human__mouse-lemur__mouse__orthologs.csv",
    #index_col=0
)

orthologs_mapping = orthologs_mapping.drop_duplicates()
print(orthologs_mapping.shape)
orthologs_mapping.head()
#bam_channel_metadata = read_and_describe('/home/olga/data_lg/data_sm_copy/batlas/Bat_Bams/metadata.csv')

(23602, 4)


Unnamed: 0,mouse_lemur__gene_name,human__gene_name,mouse__gene_name,bat__gene_name
0,ARNT,ARNT,Arnt,ARNT
1,ARNT,ARNT,Arnt,ARNT
2,FKTN,FKTN,Fktn,FKTN
3,FKTN,FKTN,Fktn,FKTN
4,FEM1C,FEM1C,Fem1c,FEM1C


(12610, 4)


Unnamed: 0,mouse_lemur__gene_name,human__gene_name,mouse__gene_name,bat__gene_name
0,ARNT,ARNT,Arnt,ARNT
2,FKTN,FKTN,Fktn,FKTN
4,FEM1C,FEM1C,Fem1c,FEM1C
5,ROGDI,ROGDI,Rogdi,ROGDI
6,IDUA,IDUA,Idua,IDUA


### Read in anndata for all species:

<hr>

In [4]:
mouse_folder = "/home/olga/data_lg/data_sm_copy/czb-tabula-muris-senis/Data-objects"
lemur_folder = '/home/olga/data_lg/data_sm_copy/tabula-microcebus/data-objects/10x'
human_folder = '/mnt/ibm_sm/olga/immune-evolution/rawdata/human/h5ads/'
bat_folder = '/home/olga/data_lg/data_sm_copy/batlas'

mouse_h5ad = 'tabula-muris-senis-droplet-official-raw-obj--no-duplicate-barcodes-per-seq-run.h5ad'
lemur_h5ad = 'tabula-microcebus--10x--counts--min-51-genes--min-101-counts--trnas-summed--no-duplicate-barcodes--pairs-disambiguated--no-na.h5ad'
human_h5ad = 'Pilot1_Pilot2_filtered_ensembl_id_Aug5.ss2_norm.ann.compartment.allgenes.h5ad'
bat_h5ad = 'baca-droplet-raw.h5ad'

def load_adata(species, folder, h5ad):
    adata = sc.read(os.path.join(folder, h5ad))
    print("-"*15, species, "-"*15)
    print(adata)
    return adata


mouse_adata = load_adata("mouse", mouse_folder, mouse_h5ad)
lemur_adata = load_adata("lemur", lemur_folder, lemur_h5ad)
human_adata = load_adata("human", human_folder, human_h5ad)
bat_adata = load_adata("bat", bat_folder, bat_h5ad)

--------------- mouse ---------------
AnnData object with n_obs × n_vars = 238915 × 20138
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'species', 'species_latin', 'channel', 'channel_tissue', 'sequencing_run', 'cell_barcode', 'n_barcodes'
    var: 'n_cells'
--------------- lemur ---------------
AnnData object with n_obs × n_vars = 232823 × 31509
    obs: 'batch', 'channel', 'individual', 'method', 'original_channel', 'channel_no_10x', 'sequencing_run', 'free_annotation', 'tissue', 'cell_ontology_class', 'cell_ontology_id', 'compartment', 'compartment_id', 'n_genes', 'n_counts', 'cell_barcode', 'possibly_contaminated_barcode'
    var: 'gene_ids'
Only considering the two last: ['.allgenes', '.h5ad'].
Only considering the two last: ['.allgenes', '.h5ad'].
--------------- human ---------------
AnnData object with n_obs × n_vars = 207016 × 58870
    obs: '10X_run

### Utility Code for creating one2one subsets between all species

<hr>

In [5]:
def create_one2one_subset(adata,  species, orthologs_mapping):
    # filter out all genes not konwn to be ortholog between species
    gene_col = f'{species}__gene_name'
    gene_mask = adata.var.index.isin(
        orthologs_mapping[gene_col].values
    )
    
    one2one = adata[:, gene_mask]
    one2one_new_var = one2one.var.merge(
        orthologs_mapping, 
        left_index=True, 
        right_on=gene_col
    )
    
    # set index to human index for easier merging later
    one2one_var_human_index = one2one_new_var.set_index(
        "human__gene_name"
    )
    one2one.var = one2one_var_human_index
    
    return one2one
    

In [6]:
def subset_one2one_bat(adata, bat_adata):
    # subset to only orthologs that are also found in bat
    # (bat dataset has fewest number of orthologs present)
    
    gene_mask = adata.var.index.isin(
        bat_adata.var.index
    )
    return adata[:, gene_mask]
    

In [7]:
def subset_only_shared_genes(adata, bat_genes):
    # subset to only genes that are shared in bat
    gene_mask = adata.var.index.isin(
        bat_genes
    )
    return adata[:, gene_mask]

### Create one2one orthologs subset

In [8]:
bat_one2one = create_one2one_subset(
    bat_adata,  "bat", orthologs_mapping
)
bat_one2onea

  if not is_categorical(df_full[k]):


AnnData object with n_obs × n_vars = 24201 × 6951
    obs: 'animalID', 'batch', 'sex', 'tissue', 'method', 'n_genes', 'n_counts'
    var: 'n_cells', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

In [11]:
mouse_one2one = create_one2one_subset(
    mouse_adata,  "mouse", orthologs_mapping
)
mouse_one2one

AnnData object with n_obs × n_vars = 238915 × 11297
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'species', 'species_latin', 'channel', 'channel_tissue', 'sequencing_run', 'cell_barcode', 'n_barcodes'
    var: 'n_cells', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

In [12]:
lemur_one2one = create_one2one_subset(
    lemur_adata,  "mouse_lemur", orthologs_mapping
)
lemur_one2one

AnnData object with n_obs × n_vars = 232823 × 12288
    obs: 'batch', 'channel', 'individual', 'method', 'original_channel', 'channel_no_10x', 'sequencing_run', 'free_annotation', 'tissue', 'cell_ontology_class', 'cell_ontology_id', 'compartment', 'compartment_id', 'n_genes', 'n_counts', 'cell_barcode', 'possibly_contaminated_barcode'
    var: 'gene_ids', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

In [13]:
human_one2one = create_one2one_subset(
    human_adata,  "human", orthologs_mapping
)
human_one2one

AnnData object with n_obs × n_vars = 207016 × 12472
    obs: '10X_run', 'smartseq2_barcode', 'smartseq2_library_plate', 'smartseq2_sorting_direction', 'smartseq2_sort_order', 'smartseq2_cDNA_Plate', 'cell_id', 'method', 'donor', '10X_replicate', 'sequencer', 'sex', 'tissue', 'smartseq2_well_seq', 'notes', 'smartseq2_sorted_population', '10X_sample', 'anatomical_position', '10X_barcode', 'n_counts', 'n_genes', 'manual_cell_ontology_class', 'manual_annotation', 'doublet_filter', 'compartment', 'batch', 'compartment_pred', 'leiden', 'cluster_id', 'Propagated Annotation', 'Entropy All Methods', 'Entropy kNN Reps', 'Entropy scANVI Reps', 'Entropy SVM Reps', 'Potential Doublets', 'Onclass', 'singlecellNet', 'kNN', 'SVM', 'scANVI', 'compartment_pred_svm', 'manual_cell_ontology_classcollapsed', 'Propagated Annotationcollapsed', 'Onclasscollapsed', 'singlecellNetcollapsed', 'kNNcollapsed', 'SVMcollapsed', 'scANVIcollapsed', 'miscompartmentalized', 'log_n_counts', 'log_n_genes'
    var: 'feature

In [14]:
mouse_subset_to_bat = subset_one2one_bat(mouse_one2one, bat_one2one)
mouse_subset_to_bat

View of AnnData object with n_obs × n_vars = 238915 × 6382
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'species', 'species_latin', 'channel', 'channel_tissue', 'sequencing_run', 'cell_barcode', 'n_barcodes'
    var: 'n_cells', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

In [15]:
lemur_subset_to_bat = subset_one2one_bat(lemur_one2one, bat_one2one)
lemur_subset_to_bat

View of AnnData object with n_obs × n_vars = 232823 × 6948
    obs: 'batch', 'channel', 'individual', 'method', 'original_channel', 'channel_no_10x', 'sequencing_run', 'free_annotation', 'tissue', 'cell_ontology_class', 'cell_ontology_id', 'compartment', 'compartment_id', 'n_genes', 'n_counts', 'cell_barcode', 'possibly_contaminated_barcode'
    var: 'gene_ids', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

In [16]:
human_subset_to_bat = subset_one2one_bat(human_one2one, bat_one2one)
human_subset_to_bat

View of AnnData object with n_obs × n_vars = 207016 × 6932
    obs: '10X_run', 'smartseq2_barcode', 'smartseq2_library_plate', 'smartseq2_sorting_direction', 'smartseq2_sort_order', 'smartseq2_cDNA_Plate', 'cell_id', 'method', 'donor', '10X_replicate', 'sequencer', 'sex', 'tissue', 'smartseq2_well_seq', 'notes', 'smartseq2_sorted_population', '10X_sample', 'anatomical_position', '10X_barcode', 'n_counts', 'n_genes', 'manual_cell_ontology_class', 'manual_annotation', 'doublet_filter', 'compartment', 'batch', 'compartment_pred', 'leiden', 'cluster_id', 'Propagated Annotation', 'Entropy All Methods', 'Entropy kNN Reps', 'Entropy scANVI Reps', 'Entropy SVM Reps', 'Potential Doublets', 'Onclass', 'singlecellNet', 'kNN', 'SVM', 'scANVI', 'compartment_pred_svm', 'manual_cell_ontology_classcollapsed', 'Propagated Annotationcollapsed', 'Onclasscollapsed', 'singlecellNetcollapsed', 'kNNcollapsed', 'SVMcollapsed', 'scANVIcollapsed', 'miscompartmentalized', 'log_n_counts', 'log_n_genes'
    var: '

### common index between all species

<hr>

In [18]:
common_index = human_subset_to_bat.var.index.intersection(
    lemur_subset_to_bat.var.index
).intersection(
    mouse_subset_to_bat.var.index
).intersection(
    bat_one2one.var.index
)
len(common_index)

6363

### Subsetting only to genes in shared index

<hr>

In [19]:
human_subset_to_bat_w_shared_index = subset_only_shared_genes(
    human_subset_to_bat, common_index
)
human_subset_to_bat_w_shared_index

View of AnnData object with n_obs × n_vars = 207016 × 6363
    obs: '10X_run', 'smartseq2_barcode', 'smartseq2_library_plate', 'smartseq2_sorting_direction', 'smartseq2_sort_order', 'smartseq2_cDNA_Plate', 'cell_id', 'method', 'donor', '10X_replicate', 'sequencer', 'sex', 'tissue', 'smartseq2_well_seq', 'notes', 'smartseq2_sorted_population', '10X_sample', 'anatomical_position', '10X_barcode', 'n_counts', 'n_genes', 'manual_cell_ontology_class', 'manual_annotation', 'doublet_filter', 'compartment', 'batch', 'compartment_pred', 'leiden', 'cluster_id', 'Propagated Annotation', 'Entropy All Methods', 'Entropy kNN Reps', 'Entropy scANVI Reps', 'Entropy SVM Reps', 'Potential Doublets', 'Onclass', 'singlecellNet', 'kNN', 'SVM', 'scANVI', 'compartment_pred_svm', 'manual_cell_ontology_classcollapsed', 'Propagated Annotationcollapsed', 'Onclasscollapsed', 'singlecellNetcollapsed', 'kNNcollapsed', 'SVMcollapsed', 'scANVIcollapsed', 'miscompartmentalized', 'log_n_counts', 'log_n_genes'
    var: '

In [20]:
mouse_subset_to_bat_w_shared_index = subset_only_shared_genes(
    mouse_subset_to_bat, common_index
)
mouse_subset_to_bat_w_shared_index

View of AnnData object with n_obs × n_vars = 238915 × 6363
    obs: 'age', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'species', 'species_latin', 'channel', 'channel_tissue', 'sequencing_run', 'cell_barcode', 'n_barcodes'
    var: 'n_cells', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

In [21]:
lemur_subset_to_bat_w_shared_index = subset_only_shared_genes(
    lemur_subset_to_bat, common_index
)
lemur_subset_to_bat_w_shared_index

View of AnnData object with n_obs × n_vars = 232823 × 6363
    obs: 'batch', 'channel', 'individual', 'method', 'original_channel', 'channel_no_10x', 'sequencing_run', 'free_annotation', 'tissue', 'cell_ontology_class', 'cell_ontology_id', 'compartment', 'compartment_id', 'n_genes', 'n_counts', 'cell_barcode', 'possibly_contaminated_barcode'
    var: 'gene_ids', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

In [22]:
bat_subset_to_bat_w_shared_index = subset_only_shared_genes(
    bat_one2one, common_index
)
bat_subset_to_bat_w_shared_index

View of AnnData object with n_obs × n_vars = 24201 × 6363
    obs: 'animalID', 'batch', 'sex', 'tissue', 'method', 'n_genes', 'n_counts'
    var: 'n_cells', 'mouse_lemur__gene_name', 'mouse__gene_name', 'bat__gene_name'

### Write one2one shared index adata objects to file

<hr>

In [23]:
filename_mapping = {
    "bat": bat_subset_to_bat_w_shared_index,
    "mouse": mouse_subset_to_bat_w_shared_index,
    "human": human_subset_to_bat_w_shared_index,
    "lemur": lemur_subset_to_bat_w_shared_index,
}

for species, adata in filename_mapping.items():
    adata.write(
        f"/mnt/ibm_sm/olga/immune-evolution/rawdata/{species}/h5ads/{species}_subset_w_shared_one2one_orthologs_human_lemur_mouse_bat.h5ad"
    )

  if is_string_dtype(df[key]) and not is_categorical(df[key])


### Combine species adata

<hr>

In [24]:
combined_adata = human_subset_to_bat_w_shared_index.concatenate(
    lemur_subset_to_bat_w_shared_index, 
    mouse_subset_to_bat_w_shared_index,
    bat_subset_to_bat_w_shared_index,
    batch_key='species_batch', 
    batch_categories=['human', 'lemur', 'mouse', 'bat']
)



In [25]:
combined_adata

AnnData object with n_obs × n_vars = 702955 × 6363
    obs: '10X_run', 'smartseq2_barcode', 'smartseq2_library_plate', 'smartseq2_sorting_direction', 'smartseq2_sort_order', 'smartseq2_cDNA_Plate', 'cell_id', 'method', 'donor', '10X_replicate', 'sequencer', 'sex', 'tissue', 'smartseq2_well_seq', 'notes', 'smartseq2_sorted_population', '10X_sample', 'anatomical_position', '10X_barcode', 'n_counts', 'n_genes', 'manual_cell_ontology_class', 'manual_annotation', 'doublet_filter', 'compartment', 'batch', 'compartment_pred', 'leiden', 'cluster_id', 'Propagated Annotation', 'Entropy All Methods', 'Entropy kNN Reps', 'Entropy scANVI Reps', 'Entropy SVM Reps', 'Potential Doublets', 'Onclass', 'singlecellNet', 'kNN', 'SVM', 'scANVI', 'compartment_pred_svm', 'manual_cell_ontology_classcollapsed', 'Propagated Annotationcollapsed', 'Onclasscollapsed', 'singlecellNetcollapsed', 'kNNcollapsed', 'SVMcollapsed', 'scANVIcollapsed', 'miscompartmentalized', 'log_n_counts', 'log_n_genes', 'channel', 'indiv

In [26]:
combined_adata.obs.columns

Index(['10X_run', 'smartseq2_barcode', 'smartseq2_library_plate',
       'smartseq2_sorting_direction', 'smartseq2_sort_order',
       'smartseq2_cDNA_Plate', 'cell_id', 'method', 'donor', '10X_replicate',
       'sequencer', 'sex', 'tissue', 'smartseq2_well_seq', 'notes',
       'smartseq2_sorted_population', '10X_sample', 'anatomical_position',
       '10X_barcode', 'n_counts', 'n_genes', 'manual_cell_ontology_class',
       'manual_annotation', 'doublet_filter', 'compartment', 'batch',
       'compartment_pred', 'leiden', 'cluster_id', 'Propagated Annotation',
       'Entropy All Methods', 'Entropy kNN Reps', 'Entropy scANVI Reps',
       'Entropy SVM Reps', 'Potential Doublets', 'Onclass', 'singlecellNet',
       'kNN', 'SVM', 'scANVI', 'compartment_pred_svm',
       'manual_cell_ontology_classcollapsed', 'Propagated Annotationcollapsed',
       'Onclasscollapsed', 'singlecellNetcollapsed', 'kNNcollapsed',
       'SVMcollapsed', 'scANVIcollapsed', 'miscompartmentalized',
       'lo

In [27]:
combined_adata.obs.species_batch.value_counts()

mouse    238915
lemur    232823
human    207016
bat       24201
Name: species_batch, dtype: int64

### clean combined adata

<hr>

In [29]:
def create_common_individual_id(row):
    species = row.species_batch
    if species == "human":
        return f"human_{row.donor}"
    elif species in ("lemur", "mouse lemur"):
        return f"lemur_{row.individual}"
    elif species == "mouse":
        return f"mouse_{row['mouse.id']}"
    elif species == "bat":
        return f"bat_{row.animalID}"
    

combined_adata.obs["common_individual_id"] = combined_adata.obs.apply(
    create_common_individual_id,
    axis=1
)
combined_adata.obs["common_individual_id"].value_counts()

human_TSP2       159308
lemur_Antoine     96842
lemur_Stumpy      83751
human_TSP1        47708
lemur_Martine     31776
mouse_30-M-2      24369
lemur_Bernard     20454
mouse_21-F-55     18359
mouse_18-M-52     17441
mouse_21-F-54     16285
mouse_1-M-63      15653
mouse_3-F-56      14787
bat_3             12781
mouse_30-M-5      12484
mouse_24-M-58     11883
mouse_24-M-59     11838
mouse_18-F-50     11808
bat_2             11420
mouse_18-F-51      9571
mouse_1-M-62       9500
mouse_24-M-60      9497
mouse_30-M-3       9286
mouse_3-F-57       8233
mouse_30-M-4       7768
mouse_3-M-8        6189
mouse_3-M-7/8      5235
mouse_3-M-9        4897
mouse_3-M-5/6      4880
mouse_18-M-53      4837
mouse_24-M-61      3665
mouse_3-M-8/9       450
Name: common_individual_id, dtype: int64

In [30]:
# drop smartseq data columns

cols_to_drop = [
    'smartseq2_barcode', 
    'smartseq2_library_plate', 
    'smartseq2_sorting_direction', 
    'smartseq2_sort_order', 
    'smartseq2_cDNA_Plate',
    'smartseq2_sort_order', 
    'smartseq2_cDNA_Plate'
]

combined_adata_obs = combined_adata.obs.drop(cols_to_drop, axis=1)
print(combined_adata_obs.shape)
combined_adata_obs.head()

(702955, 69)


Unnamed: 0,10X_run,cell_id,method,donor,10X_replicate,sequencer,sex,tissue,smartseq2_well_seq,notes,...,mouse.id,subtissue,tissue_free_annotation,species,species_latin,channel_tissue,n_barcodes,animalID,species_batch,common_individual_id
AAACCCAAGTTGTCGT_TSP1_blood_2-human,TSP1_blood_2,AAACCCAAGTTGTCGT_TSP1_blood_2,10X,TSP1,2,novaseq,female,Blood,,,...,,,,,,,,,human,human_TSP1
AAACGAAAGCCGTTAT_TSP1_blood_2-human,TSP1_blood_2,AAACGAAAGCCGTTAT_TSP1_blood_2,10X,TSP1,2,novaseq,female,Blood,,,...,,,,,,,,,human,human_TSP1
AAAGGATAGTGTTGTC_TSP1_blood_2-human,TSP1_blood_2,AAAGGATAGTGTTGTC_TSP1_blood_2,10X,TSP1,2,novaseq,female,Blood,,,...,,,,,,,,,human,human_TSP1
AAAGGGCAGGAACTAT_TSP1_blood_2-human,TSP1_blood_2,AAAGGGCAGGAACTAT_TSP1_blood_2,10X,TSP1,2,novaseq,female,Blood,,,...,,,,,,,,,human,human_TSP1
AAAGGGCCAACCAACT_TSP1_blood_2-human,TSP1_blood_2,AAAGGGCCAACCAACT_TSP1_blood_2,10X,TSP1,2,novaseq,female,Blood,,,...,,,,,,,,,human,human_TSP1


In [None]:
# need to unify for cell ontology, and compartment
# need to lookup cell ontology class to use?
def unify_cell_ontology_class(row):
    species = row.species_batch
    if species == "human":
        return 
    elif species in ("lemur", "mouse lemur"):
        return row.cell_ontology_class
    elif species == "mouse":
        return 
    elif species == "bat":
        return 


combined_adata_obs.columns