# Imports

In [121]:
%load_ext autoreload
%autoreload 2

import glob
import os
import re
import pandas as pd
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 50

import seaborn as sns
import sourmash
import scanpy as sc
from tqdm import tqdm
import kmermaid_utils
import sig_utils

from sourmash_search_utils import read_and_process_sourmash_search_results

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Constants (ksizes, scaled)

In [68]:
MOAR_KSIZES = list(range(21, 91, 3))

In [69]:
SCALED = 10

In [70]:
ls -lha

total 5.0K
drwxr-xr-x 10 olga czb 4.0K Mar 27 13:48 [0m[01;34m.[0m/
drwxr-xr-x  4 olga czb 4.0K Feb 28 09:53 [01;34m..[0m/
drwxr-xr-x 11 olga czb 4.0K Mar 27 13:38 [01;34m0--train-mouse1[0m/
drwxr-xr-x  4 olga czb 4.0K Mar  5 14:16 [01;34m1--test-mouse2[0m/
drwxr-xr-x  5 olga czb 4.0K Feb 28 16:48 [01;34m2--search-results[0m/
drwxr-xr-x  5 olga czb 4.0K Mar 27 13:48 [01;34m3-0--search-results--min-5-percent-shared-kmers[0m/
drwxr-xr-x  5 olga czb 4.0K Mar  5 14:16 [01;34m3-0--search-results--remove_common[0m/
drwxr-xr-x  5 olga czb 4.0K Mar 27 13:48 [01;34m3-1--search-results--min-10-percent-shared-kmers[0m/
drwxr-xr-x  5 olga czb 4.0K Mar  5 14:16 [01;34m3-1--search-results--raw_merged[0m/
drwxr-xr-x  2 olga czb 4.0K Mar  5 14:48 [01;34m4--aggregated-results[0m/


In [98]:
mouse2mouse_dir = '/home/olga/data_sm/immune-evolution/kmer-signatures/0--mouse2mouse/1--mouse2mouse/'
!ls -lha $mouse2mouse_dir

total 5.0K
drwxr-xr-x 10 olga czb 4.0K Mar 27 13:48 .
drwxr-xr-x  4 olga czb 4.0K Feb 28 09:53 ..
drwxr-xr-x 11 olga czb 4.0K Mar 27 13:38 0--train-mouse1
drwxr-xr-x  4 olga czb 4.0K Mar  5 14:16 1--test-mouse2
drwxr-xr-x  5 olga czb 4.0K Feb 28 16:48 2--search-results
drwxr-xr-x  5 olga czb 4.0K Mar 27 13:48 3-0--search-results--min-5-percent-shared-kmers
drwxr-xr-x  5 olga czb 4.0K Mar  5 14:16 3-0--search-results--remove_common
drwxr-xr-x  5 olga czb 4.0K Mar 27 13:48 3-1--search-results--min-10-percent-shared-kmers
drwxr-xr-x  5 olga czb 4.0K Mar  5 14:16 3-1--search-results--raw_merged
drwxr-xr-x  2 olga czb 4.0K Mar  5 14:48 4--aggregated-results


In [71]:
sig_outdir_base = "/home/olga/data_sm/immune-evolution/kmer-signatures"

mouse2mouse_dir = os.path.join(sig_outdir_base, "0--mouse2mouse", "1--mouse2mouse")


aggregated_results_dir = os.path.join(mouse2mouse_dir, "4--aggregated-results")
! mkdir -p $aggregated_results_dir

## Def describe

In [72]:
def describe(df, random=False):
    print(df.shape)
    print("--- First 5 entries ---")
    display(df.head())
    if random:
        print('--- Random subset ---')
        display(df.sample(5))

### Remove rogue TQDM instances

In [73]:
def remove_rogue_tqdm():
    import tqdm
    try:
        tqdm._instances.clear()
    except AttributeError:
        pass

## Read one2one 5had

In [74]:
one2one = sc.read(
#     "/home/phoenix/data_sm/code/immune-evolution/h5ads/human-lemur-mouse-bat__lung_only_copy.h5ad"
    "/home/olga/data_sm/immune-evolution/h5ads/human-lemur-mouse-bat/human-lemur-mouse-bat__lung_only.h5ad"
)
one2one.obs = one2one.obs.reset_index().set_index('cell_id')
# one2one = one2one[one2one.obs.species_batch == 'mouse']

one2one.obs.head()

Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,sex,species,species_batch,species_latin,tissue,narrow_group,broad_group,compartment_group,compartment_narrow,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
P2_1__AAACCTGAGAAACCAT,P2_1_AAACCTGAGAAACCAT-human-hlm,46y,AAACCTGAGAAACCAT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AAATGCCAGATGAGAG,P2_1_AAATGCCAGATGAGAG-human-hlm,46y,AAATGCCAGATGAGAG,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGATCCCT,P2_1_AACACGTTCGATCCCT-human-hlm,46y,AACACGTTCGATCCCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGCACTCT,P2_1_AACACGTTCGCACTCT-human-hlm,46y,AACACGTTCGCACTCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACCATGCAGCTCGCA,P2_1_AACCATGCAGCTCGCA-human-hlm,46y,AACCATGCAGCTCGCA,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2


# Read and transform Sourmash search results

## Functions / tools to add cell type ontologies

### Column renamer

In [75]:
cols = ["cell_ontology_class", "narrow_group", "broad_group", "compartment_group"]

mouse1_renamer = {col: f"mouse1_{col}"  for col in cols}
mouse2_renamer = {col: f"mouse2_{col}" for col in cols}

In [76]:
def add_ontologies(search_results_df, adata, mouse_cell_id_col, renamer):

    search_results_with_ontologies = search_results_df.join(
        adata.obs[cols], 
        on=mouse_cell_id_col,
        how='inner'
    )
    # Remove null ksizes
#     search_results_with_ontologies = search_results_with_ontologies.loc[search_results_with_ontologies.ksize.notnull()]
    search_results_with_ontologies['ksize'] = search_results_with_ontologies['ksize'].astype(int)
    search_results_with_ontologies['sketch_value'] = search_results_with_ontologies['sketch_value'].astype(int)
    search_results_with_ontologies.rename(columns=renamer, inplace=True)
    search_results_with_ontologies.reset_index(inplace=True, drop=True)
#     search_results_with_ontologies.dropna(subset=renamer.values(), inplace=True)

#     print(search_results_with_ontologies.shape)
    return search_results_with_ontologies

## Read output CSVs

In [78]:
celltype_cols = ["cell_ontology_class", "narrow_group", "broad_group", "compartment_group"]
matching_cols = [f'{x}_matches' for x in celltype_cols]
celltype_to_matching_cols = dict(zip(celltype_cols, matching_cols))

In [117]:
1+1

2

In [123]:
dfs = []

globber = os.path.join(mouse2mouse_dir, '3-*', '*', '*.csv')

# Hardcoded magic number from the total number of lines in the file of all comands submitted to parallel
# (immune-evolution)
#  ✘  Tue 15 Dec - 18:55  ~/data_sm/immune-evolution/pipeline-results/mouse/kmermaid/lung--mouse--remove-ribo/sbt_indices_handmade_remove_ribosomal_all_cells_per_mouse 
#  olga@lrrr  wc -l  /home/olga/data_sm/immune-evolution/pipeline-results/mouse/kmermaid/lung--mouse--remove-ribo/search_mouse2mouse__remove_ribosomal_dissociation__all_cells__remove_common_hashes/sourmash_search_commands.txt
# 97416 /home/olga/data_sm/immune-evolution/pipeline-results/mouse/kmermaid/lung--mouse--remove-ribo/search_mouse2mouse__remove_ribosomal_dissociation__all_cells__remove_common_hashes/sourmash_search_commands.tx

# Manually obtained from the file sent to gnu parallel
total = sum(1 for _ in glob.iglob(globber))



def add_is_matching_col(
    df, 
    celltype_to_matching_cols=celltype_to_matching_cols, 
    mouse1_renamer=mouse1_renamer, 
    mouse2_renamer=mouse2_renamer
):
    ## Add matching col of whether the category matches
    for celltype_col, matching_col in celltype_to_matching_cols.items():
        mouse1_col = mouse1_renamer[celltype_col]
        mouse2_col = mouse2_renamer[celltype_col]
        df[matching_col] = df[mouse1_col] == df[mouse2_col]
    return df


for csv in tqdm(glob.iglob(globber), total=total):
#     df = read_and_process_sourmash_search_results(
#         csv,
#         adata=one2one,
#         column_renamer=mouse2_renamer,
#         query_species='mouse',
#         shared_celltypes=SHARED_CELLTYPES,
#     )

#     df["database_type"] = os.path.dirname(csv).split("/")[-2].split("--")[-1]

#     dfs.append(df)
    
    sketch_id = os.path.basename(os.path.dirname(csv))
    basename = os.path.basename(csv).split('.csv')[0]
    cell_id = basename.split('.')[0]


    df = pd.read_csv(csv)
    if 'containment' in csv:
        df["search_method"] = "containment"
    else:
        df["search_method"] = "similarity_abundance"
    df["database_type"] = os.path.dirname(csv).split("/")[-2].split("--")[-1] 
    
    df['sketch_id'] = sketch_id
    sig_info = df['sketch_id'].str.extractall(sig_utils.SKETCH_INFO_PATTERN)
    sig_info = sig_info.droplevel(-1)
    sig_info['ksize'] = sig_info['ksize'].astype(int)
    sig_info['sketch_value'] = sig_info['sketch_value'].astype(int)
    df['mouse2_cell_id'] = cell_id
    df_metadata = pd.concat([df, sig_info], axis=1)
    df_metadata = df_metadata.drop(['filename', 'md5'], axis=1)
    df_metadata = df_metadata.rename(columns={'name': 'mouse1_broad_group'})
#     df_metadata['mouse1_channel'] = df_metadata.mouse1_cell_id.str.split('__').str[0]
    df_metadata['mouse2_channel'] = df_metadata.mouse2_cell_id.str.split('__').str[0]
#     df_metadata['mouse1_eq_mouse2'] = df_metadata.mouse2_channel == df_metadata.mouse1_channel
    
    # Add cell_ontology_class, narrow_group, broad_group, compartment_group for each mouse
    df_metadata_mouse2_ontologies = add_ontologies(df_metadata, one2one, 'mouse2_cell_id', mouse2_renamer)
    
    # Add whether mouse1, mouse2 cell type match
    #     df_metadata_mouse2_ontologies = add_is_matching_col(df_metadata_mouse2_ontologies)
    
    dfs.append(df_metadata_mouse2_ontologies)
#     break
search_results = pd.concat(dfs, ignore_index=True)
print(search_results.shape)
search_results.head()

100%|██████████| 65940/65940 [54:50<00:00, 20.04it/s]  


(656190, 16)


Unnamed: 0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mouse2_cell_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,mouse2_channel,mouse2_cell_ontology_class,mouse2_narrow_group,mouse2_broad_group,mouse2_compartment_group
0,0.002426,Monocyte,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid
1,0.002267,B cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid
2,0.001553,Capillary,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid
3,0.001389,T cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid
4,0.001267,Dendritic,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid


In [125]:
search_results['database_method'] = search_results.database_type.astype(str) + ': ' + search_results.search_method.astype(str)

In [126]:
search_results.database_method.value_counts()

min-10-percent-shared-kmers: containment                                 65940
min-10-percent-shared-kmers: similarity_abundance                        65940
min-10-percent-shared-kmers-remove-common-kmers: containment             65940
min-5-percent-shared-kmers: containment                                  65940
min-5-percent-shared-kmers: similarity_abundance                         65940
min-10-percent-shared-kmers-remove-common-kmers: similarity_abundance    65940
min-5-percent-shared-kmers-remove-common-kmers: containment              65940
min-5-percent-shared-kmers-remove-common-kmers: similarity_abundance     65940
raw_merged: similarity_abundance                                         64335
remove_common: similarity_abundance                                      64335
Name: database_method, dtype: int64

In [127]:
search_results

Unnamed: 0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mouse2_cell_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,mouse2_channel,mouse2_cell_ontology_class,mouse2_narrow_group,mouse2_broad_group,mouse2_compartment_group,database_method
0,0.002426,Monocyte,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
1,0.002267,B cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
2,0.001553,Capillary,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
3,0.001389,T cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
4,0.001267,Dendritic,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656185,0.323133,Smooth Muscle and Myofibroblast,similarity_abundance,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,MACA_18m_F_LUNG_51__GACGTTAGTCCTCTTG,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,mature NK T cell,Natural Killer T cell,Natural Killer T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance
656186,0.254790,Macrophage,similarity_abundance,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,MACA_18m_F_LUNG_51__GACGTTAGTCCTCTTG,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,mature NK T cell,Natural Killer T cell,Natural Killer T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance
656187,0.247480,Capillary,similarity_abundance,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,MACA_18m_F_LUNG_51__GACGTTAGTCCTCTTG,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,mature NK T cell,Natural Killer T cell,Natural Killer T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance
656188,0.231250,Fibroblast,similarity_abundance,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,MACA_18m_F_LUNG_51__GACGTTAGTCCTCTTG,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,mature NK T cell,Natural Killer T cell,Natural Killer T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance


In [128]:
describe(search_results)

(656190, 17)
--- First 5 entries ---


Unnamed: 0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mouse2_cell_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,mouse2_channel,mouse2_cell_ontology_class,mouse2_narrow_group,mouse2_broad_group,mouse2_compartment_group,database_method
0,0.002426,Monocyte,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
1,0.002267,B cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
2,0.001553,Capillary,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
3,0.001389,T cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance
4,0.001267,Dendritic,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance


### Make sure ksize 21, 24 in results

In [129]:
sorted(list(set(search_results.ksize)))

[21, 30, 51]

In [130]:
len(set(search_results.sketch_id))

3

In [131]:
search_results.sketch_id.value_counts()

alphabet-DNA__ksize-21__scaled-10        219226
alphabet-protein__ksize-30__scaled-10    218576
alphabet-dayhoff__ksize-51__scaled-10    218388
Name: sketch_id, dtype: int64

In [132]:
search_results.database_type.value_counts()

min-5-percent-shared-kmers-remove-common-kmers     131880
min-10-percent-shared-kmers-remove-common-kmers    131880
min-10-percent-shared-kmers                        131880
min-5-percent-shared-kmers                         131880
remove_common                                       64335
raw_merged                                          64335
Name: database_type, dtype: int64

In [133]:
set(search_results.alphabet)

{'DNA', 'dayhoff', 'protein'}

## Add human readable column of cell type lookup

In [134]:


mouse2_channel__to__human_readable = {
    'MACA_18m_M_LUNG_52': 'self:self',
    'MACA_18m_M_LUNG_53': 'male:male',
    'MACA_18m_F_LUNG_51': 'male:female'
}


search_results['mouse2_human_readable'] = search_results.mouse2_channel.map(mouse2_channel__to__human_readable)
search_results['broad_group_matches'] = search_results['mouse1_broad_group'] == search_results['mouse2_broad_group']
describe(search_results)

(656190, 19)
--- First 5 entries ---


Unnamed: 0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mouse2_cell_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,mouse2_channel,mouse2_cell_ontology_class,mouse2_narrow_group,mouse2_broad_group,mouse2_compartment_group,database_method,mouse2_human_readable,broad_group_matches
0,0.002426,Monocyte,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False
1,0.002267,B cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,True
2,0.001553,Capillary,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False
3,0.001389,T cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False
4,0.001267,Dendritic,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False


## Add some convenience columns for plotting

In [135]:
%%time
broad_group_matches_str = {True: "same celltype", False: 'different celltype'}

search_results['broad_group_matches_string'] = search_results['broad_group_matches'].map(
    broad_group_matches_str
)
search_results['alpha_ksize'] = search_results.apply(
    lambda x: '{alphabet}, k:{ksize}'.format(**x), 
    axis=1
)
search_results['alpha_match'] = search_results.apply(
    lambda x: '{alphabet}, {broad_group_matches_string}'.format(**x), 
    axis=1
)

search_results['mouse2_match'] = search_results.apply(
    lambda x: '{mouse2_human_readable}, {broad_group_matches_string}'.format(**x), 
    axis=1
)
describe(search_results)

(656190, 23)
--- First 5 entries ---


Unnamed: 0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mouse2_cell_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,mouse2_channel,mouse2_cell_ontology_class,mouse2_narrow_group,mouse2_broad_group,mouse2_compartment_group,database_method,mouse2_human_readable,broad_group_matches,broad_group_matches_string,alpha_ksize,alpha_match,mouse2_match
0,0.002426,Monocyte,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype"
1,0.002267,B cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,True,same celltype,"dayhoff, k:51","dayhoff, same celltype","male:female, same celltype"
2,0.001553,Capillary,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype"
3,0.001389,T cell,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype"
4,0.001267,Dendritic,similarity_abundance,remove_common,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__TGCACCTAGGGTATCG,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,B cell,B cell,B cell,lymphoid,remove_common: similarity_abundance,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype"


CPU times: user 2min 48s, sys: 1.31 s, total: 2min 50s
Wall time: 2min 50s


## Constants (`classification_groupby`, `celltype_col`, `query_organism`, `scoring_groupby`)

In [147]:
classification_groupby = ["mouse2_cell_id", "database_type", "search_method", "alphabet", "ksize"]

celltype_col = "broad_group"
sbt_organism = "mouse1"
query_organism = "mouse2"

predicted_celltype_col = f"{sbt_organism}_{celltype_col}"
ground_truth_celltype_col = f"{query_organism}_{celltype_col}"

scoring_groupby = [
    "mouse2_human_readable",
    "database_type",
    "search_method",
    "alphabet",
    "ksize",
]

In [153]:

t_cell = 'MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA'
search_results.query('mouse2_cell_id == @t_cell').groupby(classification_groupby).apply(lambda x: x.nlargest(1, 'similarity'))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mouse2_cell_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,mouse2_channel,mouse2_cell_ontology_class,mouse2_narrow_group,mouse2_broad_group,mouse2_compartment_group,database_method,mouse2_human_readable,broad_group_matches,broad_group_matches_string,alpha_ksize,alpha_match,mouse2_match,groundtruth_celltype,predicted_celltype
mouse2_cell_id,database_type,search_method,alphabet,ksize,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers,containment,DNA,21,159125,0.296154,Monocyte,containment,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,DNA,21,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"DNA, k:21","DNA, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers,containment,dayhoff,51,137145,0.27151,Monocyte,containment,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers,containment,protein,30,181105,0.296568,Monocyte,containment,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"protein, k:30","protein, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers,similarity_abundance,DNA,21,619100,0.507136,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,DNA,21,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance,male:female,True,same celltype,"DNA, k:21","DNA, same celltype","male:female, same celltype",T cell,T cell
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers,similarity_abundance,dayhoff,51,597120,0.537653,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance,male:female,True,same celltype,"dayhoff, k:51","dayhoff, same celltype","male:female, same celltype",T cell,T cell
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers,similarity_abundance,protein,30,641080,0.560248,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance,male:female,True,same celltype,"protein, k:30","protein, same celltype","male:female, same celltype",T cell,T cell
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers-remove-common-kmers,containment,DNA,21,355340,0.182216,Monocyte,containment,min-10-percent-shared-kmers-remove-common-kmers,alphabet-DNA__ksize-21__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,DNA,21,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers-remove-common-kmers: containment,male:female,False,different celltype,"DNA, k:21","DNA, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers-remove-common-kmers,containment,dayhoff,51,333360,0.165634,Monocyte,containment,min-10-percent-shared-kmers-remove-common-kmers,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers-remove-common-kmers: containment,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers-remove-common-kmers,containment,protein,30,377320,0.171718,Monocyte,containment,min-10-percent-shared-kmers-remove-common-kmers,alphabet-protein__ksize-30__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers-remove-common-kmers: containment,male:female,False,different celltype,"protein, k:30","protein, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,min-10-percent-shared-kmers-remove-common-kmers,similarity_abundance,DNA,21,487220,0.083241,T cell,similarity_abundance,min-10-percent-shared-kmers-remove-common-kmers,alphabet-DNA__ksize-21__scaled-10,MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,alphabet,DNA,21,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers-remove-common-kmers: similarity_abundance,male:female,True,same celltype,"DNA, k:21","DNA, same celltype","male:female, same celltype",T cell,T cell


## Add unifying column of predicted and ground truth celltypes

In [154]:
search_results['groundtruth_celltype'] = search_results[ground_truth_celltype_col]
search_results['predicted_celltype'] = search_results[predicted_celltype_col]

# write results to parquet file

In [155]:
%%time

parquet_fn = "sourmash-search-results.parquet"
parquet = os.path.join(aggregated_results_dir, parquet_fn)
search_results.to_parquet(parquet)

CPU times: user 1.53 s, sys: 188 ms, total: 1.72 s
Wall time: 1.72 s


# Predict cell type with top hit

## Get top N

In [156]:
%%time

predicted_cells_top_hit = search_results.groupby(
    classification_groupby, as_index=False, group_keys=False
).apply(lambda x: x.nlargest(1, 'similarity'))

predicted_cells_top_hit = predicted_cells_top_hit.set_index(f"{query_organism}_cell_id")

print(predicted_cells_top_hit.shape)
predicted_cells_top_hit.head()

(65940, 24)
CPU times: user 5min 2s, sys: 5.64 s, total: 5min 8s
Wall time: 5min 6s


Unnamed: 0_level_0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,mouse2_channel,mouse2_cell_ontology_class,mouse2_narrow_group,mouse2_broad_group,mouse2_compartment_group,database_method,mouse2_human_readable,broad_group_matches,broad_group_matches_string,alpha_ksize,alpha_match,mouse2_match,groundtruth_celltype,predicted_celltype
mouse2_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.296154,Monocyte,containment,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"DNA, k:21","DNA, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.27151,Monocyte,containment,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.296568,Monocyte,containment,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"protein, k:30","protein, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.507136,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance,male:female,True,same celltype,"DNA, k:21","DNA, same celltype","male:female, same celltype",T cell,T cell
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.537653,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51,"CD4-positive, alpha-beta T cell",CD4+ T,T cell,lymphoid,min-10-percent-shared-kmers: similarity_abundance,male:female,True,same celltype,"dayhoff, k:51","dayhoff, same celltype","male:female, same celltype",T cell,T cell


In [157]:
celpredicted_cells_top_hit.loc['']

NameError: name 'celpredicted_cells_top_hit' is not defined

### Write top N to file

In [None]:
%%time

parquet_fn = "sourmash-search-results--top-hit.parquet"
parquet = os.path.join(aggregated_results_dir, parquet_fn)
predicted_cells_top_hit.to_parquet(parquet)

In [None]:
parquet

In [None]:
predicted_cells_top_hit.database_type.value_counts()