# Imports

In [59]:
%load_ext autoreload
%autoreload 2

import glob
import os
import re
import pandas as pd
pd.options.display.max_colwidth = 500
pd.options.display.max_columns = 50

import seaborn as sns
import sourmash
import scanpy as sc
from tqdm import tqdm

# Artisanal, small-batch modules
import kmermaid_utils
import sig_utils

from sourmash_search_utils import read_and_process_sourmash_search_results

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Constants (signature and aggregated-results directories)

In [60]:
# Root directory holding every k-mer signature output of this project.
sig_outdir_base = '/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures'

# Sub-tree for the human test set.
human_test_dir = os.path.join(sig_outdir_base, "2--test-human")

# Destination for the aggregated parquet files written further down.
aggregated_results_dir = os.path.join(human_test_dir, '4--aggregated-results')
# Create the directory (and any missing parents) from Python instead of shelling out
# to `mkdir -p`: no unquoted shell interpolation, and it works on any platform.
os.makedirs(aggregated_results_dir, exist_ok=True)

## Def describe

In [61]:
def describe(df, random=False, n=5):
    """Print the shape of ``df`` and display its first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    random : bool, default False
        When true, additionally display a random sample of rows.
    n : int, default 5
        Number of rows shown in the head and (at most) in the random sample.

    Notes
    -----
    Relies on ``display`` being available, as it is inside an IPython/Jupyter
    session.
    """
    print(df.shape)
    print(f"--- First {n} entries ---")
    display(df.head(n))
    if random:
        print('--- Random subset ---')
        # DataFrame.sample raises ValueError when asked for more rows than the
        # frame holds, so clamp the sample size for small frames.
        display(df.sample(min(n, len(df))))

### Remove rogue TQDM instances

In [70]:
def remove_rogue_tqdm():
    """Forget every progress bar that ``tqdm`` still tracks as live.

    The registry of live bars is the ``_instances`` attribute of the
    ``tqdm.tqdm`` *class*, not of the ``tqdm`` module. Looking it up on the
    module always raises ``AttributeError``, which previously made this
    function a silent no-op.
    """
    # Function-scope import under a distinct name: the notebook-level name
    # `tqdm` is already bound (to the class), so nothing global is rebound here.
    from tqdm import tqdm as tqdm_cls

    # The attribute may be missing when no bar has been created yet; in that
    # case there is nothing to clear (best-effort, as before).
    live_bars = getattr(tqdm_cls, "_instances", None)
    if live_bars is not None:
        live_bars.clear()

## Read one2one h5ad

In [63]:
# Load the annotated data object with scanpy (`sc`). Judging by the file name this is
# the lung-only human/lemur/mouse/bat dataset -- TODO confirm against the data provenance.
one2one = sc.read(
#     "/home/phoenix/data_lg/data_sm_copy/code/immune-evolution/h5ads/human-lemur-mouse-bat__lung_only_copy.h5ad"
    "/home/olga/data_lg/data_sm_copy/immune-evolution/h5ads/human-lemur-mouse-bat/human-lemur-mouse-bat__lung_only.h5ad"
)
# Re-key the per-cell metadata by `cell_id`. reset_index() first turns the previous
# index into an ordinary column (it shows up as `index` in the table below).
one2one.obs = one2one.obs.reset_index().set_index('cell_id')

# Preview the first rows of the re-indexed metadata.
one2one.obs.head()

Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,sex,species,species_batch,species_latin,tissue,narrow_group,broad_group,compartment_group,compartment_narrow,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
P2_1__AAACCTGAGAAACCAT,P2_1_AAACCTGAGAAACCAT-human-hlm,46y,AAACCTGAGAAACCAT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AAATGCCAGATGAGAG,P2_1_AAATGCCAGATGAGAG-human-hlm,46y,AAATGCCAGATGAGAG,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGATCCCT,P2_1_AACACGTTCGATCCCT-human-hlm,46y,AACACGTTCGATCCCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGCACTCT,P2_1_AACACGTTCGCACTCT-human-hlm,46y,AACACGTTCGCACTCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACCATGCAGCTCGCA,P2_1_AACCATGCAGCTCGCA-human-hlm,46y,AACCATGCAGCTCGCA,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,male,Human,human,Homo sapiens,Lung,Capillary Aerocyte,Capillary,endothelial,endothelial: Capillary Aerocyte,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2


# Analyze Sourmash search

## Functions / tools to add cell type ontologies

### Column renamer

In [64]:
# Annotation columns pulled from `adata.obs` when attaching cell-type labels.
cols = ["cell_ontology_class", "narrow_group", "broad_group", "compartment_group"]

# One renamer per species: each maps a bare annotation column name to the same
# name carrying a "<species>_" prefix.
mouse_renamer, human_renamer = (
    {col: f"{species}_{col}" for col in cols} for species in ("mouse", "human")
)

In [65]:
def add_ontologies(search_results_df, adata, mouse_cell_id_col, renamer):
    """Attach cell-type annotation columns from ``adata.obs`` to search results.

    Parameters
    ----------
    search_results_df : pandas.DataFrame
        Search results; must contain ``mouse_cell_id_col``, ``ksize`` and
        ``sketch_value`` columns.
    adata : object with an ``obs`` DataFrame
        ``adata.obs`` must contain the module-level ``cols`` columns and be
        indexed by the ids found in ``mouse_cell_id_col``.
    mouse_cell_id_col : str
        Column of ``search_results_df`` matched against the index of
        ``adata.obs``.
    renamer : dict
        Column renaming applied after the join.

    Returns
    -------
    pandas.DataFrame
        Inner-joined rows only, with ``ksize`` and ``sketch_value`` cast to
        ``int``, columns renamed, and a fresh 0..n-1 index.
    """
    # Inner join: rows whose id is absent from adata.obs are dropped.
    annotated = search_results_df.join(
        adata.obs[cols], on=mouse_cell_id_col, how='inner'
    )
    for integer_col in ('ksize', 'sketch_value'):
        annotated[integer_col] = annotated[integer_col].astype(int)
    return annotated.rename(columns=renamer).reset_index(drop=True)

## Read output CSVs

In [66]:
# Annotation levels, and for each one the name of its "<level>_matches" column.
celltype_cols = ["cell_ontology_class", "narrow_group", "broad_group", "compartment_group"]
celltype_to_matching_cols = {celltype: f'{celltype}_matches' for celltype in celltype_cols}
matching_cols = list(celltype_to_matching_cols.values())

In [68]:
# Cell-type labels handed to the search-result reader as `shared_celltypes`.
# NOTE(review): these look like `broad_group` values present in both species -- confirm.
SHARED_CELLTYPES = [
    'Alveolar Epithelial Type 2',
    'B cell',
    'Capillary',
    'Dendritic',
    'Fibroblast',
    'Macrophage',
    'Monocyte',
    'Natural Killer T cell',
    'Smooth Muscle and Myofibroblast',
    'T cell',
]
SHARED_CELLTYPES

['Alveolar Epithelial Type 2',
 'B cell',
 'Capillary',
 'Dendritic',
 'Fibroblast',
 'Macrophage',
 'Monocyte',
 'Natural Killer T cell',
 'Smooth Muscle and Myofibroblast',
 'T cell']

### Example csv path

In [22]:
# Long-list one example search-result CSV to confirm the expected
# <database dir>/<sketch dir>/<cell>.csv layout exists on disk.
ll /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--remove_common/alphabet-DNA__ksize-21__scaled-10/P3_7__TCCACACAGTGACTCT.csv

-rw-r--r-- 1 olga 1800 Feb 28 20:43 /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--remove_common/alphabet-DNA__ksize-21__scaled-10/P3_7__TCCACACAGTGACTCT.csv


In [85]:
# List the human test directory; the `3-*` sub-directories are the ones matched by the CSV glob below.
ls -lha $human_test_dir

total 6.0K
drwxr-xr-x 12 olga czb 4.0K Mar 27 13:09 [0m[01;34m.[0m/
drwxr-xr-x  7 olga czb 4.0K Mar  5 12:02 [01;34m..[0m/
drwxr-xr-x  4 olga czb 4.0K Mar  3 14:58 [01;34m0--protein-coding-fastas[0m/
drwxr-xr-x  5 olga czb 4.0K Feb 28 11:12 [01;34m1--single-cell-sigs[0m/
drwxr-xr-x  5 olga czb 4.0K Feb 28 17:02 [01;34m2--mouse-search-results[0m/
drwxr-xr-x  5 olga czb 4.0K Mar 26 19:54 [01;34m3-0--mouse-search-results--min-5-percent-shared-kmers[0m/
drwxr-xr-x  5 olga czb 4.0K Mar 27 13:09 [01;34m3-0--mouse-search-results--min-5-percent-shared-kmers-remove-common-kmers[0m[K/
drwxr-xr-x  5 olga czb 4.0K Feb 28 17:17 [01;34m3-0--mouse-search-results--remove_common[0m/
drwxr-xr-x  5 olga czb 4.0K Mar 26 19:54 [01;34m3-1--mouse-search-results--min-10-percent-shared-kmers[0m/
drwxr-xr-x  5 olga czb 4.0K Mar 27 13:09 [01;34m3-1--mouse-search-results--min-10-percent-shared-kmers-remove-common-kmers[0m[K/
drwxr-xr-x  5 olga czb 4.0K Feb 28 17:17 [01;34m3-1--mouse-search

### Actually read the csvs

In [99]:
# Scratch expression with no effect on the analysis (presumably a kernel
# liveness check before the long-running read below); safe to delete.
1+1

2

In [None]:
query_species = "human"

# Annotation columns to attach, and their query-species-prefixed names.
cols = ["cell_ontology_class", "narrow_group", "broad_group", "compartment_group"]
query_renamer = {col: f"{query_species}_{col}" for col in cols}

# Matches <human_test_dir>/3-*/<sub-directory>/<file>.csv
globber = os.path.join(human_test_dir, "3-*", "*", "*.csv")

# Walk the filesystem once (it was previously walked a second time just to count
# files for the progress bar) and sort so the row order of `search_results` is
# reproducible between runs.
csv_paths = sorted(glob.glob(globber))
total = len(csv_paths)
if total == 0:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No search-result CSVs matched {globber!r}")

dfs = []
for csv in tqdm(csv_paths, total=total):
    df = read_and_process_sourmash_search_results(
        csv,
        adata=one2one,
        column_renamer=query_renamer,
        query_species=query_species,
        shared_celltypes=SHARED_CELLTYPES,
    )

    # The database type is the text after the last "--" in the name of the
    # CSV's grandparent directory, e.g.
    # "3-0--mouse-search-results--remove_common" -> "remove_common".
    database_dirname = os.path.basename(os.path.dirname(os.path.dirname(csv)))
    df["database_type"] = database_dirname.split("--")[-1]

    dfs.append(df)

# Single concat at the end rather than growing a frame inside the loop.
search_results = pd.concat(dfs, ignore_index=True)
print(search_results.shape)
search_results.head()

 56%|█████▋    | 239467/424610 [4:58:10<4:00:13, 12.85it/s] 

In [102]:
# Number of result rows contributed by each search method.
search_results["search_method"].value_counts()

similarity_abundance    2535405
containment             1698231
Name: search_method, dtype: int64

In [87]:
# Count result rows per (sketch_id, alphabet, ksize) combination and flatten the
# grouped counts into a tidy frame with an `n_results` column.
n_results_per_sketch_id = (
    search_results
    .groupby(['sketch_id', 'alphabet', 'ksize'])
    .size()
    .rename('n_results')
    .reset_index()
)
describe(n_results_per_sketch_id)

(3, 4)
--- First 5 entries ---


Unnamed: 0,sketch_id,alphabet,ksize,n_results
0,alphabet-DNA__ksize-21__scaled-10,DNA,21,848262
1,alphabet-dayhoff__ksize-51__scaled-10,dayhoff,51,842911
2,alphabet-protein__ksize-30__scaled-10,protein,30,844232


In [88]:
# Up to five sketch configurations with the most result rows, largest first.
n_results_per_sketch_id.nlargest(n=5, columns='n_results')

Unnamed: 0,sketch_id,alphabet,ksize,n_results
0,alphabet-DNA__ksize-21__scaled-10,DNA,21,848262
2,alphabet-protein__ksize-30__scaled-10,protein,30,844232
1,alphabet-dayhoff__ksize-51__scaled-10,dayhoff,51,842911


In [89]:
# Row with the most results. `idxmax` returns the index *label* of the maximum,
# which is what `.loc` expects; `argmax` returns a *position* and only happens to
# agree with `.loc` while the frame has a default 0..n-1 index.
n_results_per_sketch_id.loc[n_results_per_sketch_id["n_results"].idxmax()]

sketch_id    alphabet-DNA__ksize-21__scaled-10
alphabet                                   DNA
ksize                                       21
n_results                               848262
Name: 0, dtype: object

## Constants (`classification_groupby`, `celltype_col`, `query_organism`, `scoring_groupby`)

In [104]:


# Which annotation level is scored, and which organism plays which role.
query_organism = "human"
sbt_organism = "mouse"  # organism of the searched database ("sbt" presumably = Sequence Bloom Tree)
celltype_col = "broad_group"

# Label columns: the query organism supplies ground truth, the database organism the prediction.
ground_truth_celltype_col = "_".join([query_organism, celltype_col])
predicted_celltype_col = "_".join([sbt_organism, celltype_col])

# One classification is made per query cell within each search configuration.
classification_groupby = [
    query_organism + "_cell_id",
    "search_method",
    "database_type",
    "alphabet",
    "ksize",
]

# Grouping keys for scoring: the search configuration without the cell id.
scoring_groupby = ["database_type", "search_method", "alphabet", "ksize"]

## Add unifying column of predicted and ground truth celltypes

In [105]:
# Copy the organism-specific label columns to organism-agnostic names so downstream
# code does not need to know which organism is the query and which is the database.
# NOTE: this adds the two columns to `search_results` in place.
search_results['groundtruth_celltype'] = search_results[ground_truth_celltype_col]
search_results['predicted_celltype'] = search_results[predicted_celltype_col]

## Write results to parquet file

In [106]:
%%time

# Persist the full concatenated search results (e.g. so the multi-hour CSV read
# above does not have to be repeated).
parquet_fn = "sourmash-search-results.parquet"
parquet = os.path.join(aggregated_results_dir, parquet_fn)
search_results.to_parquet(parquet)

# Bare expression so the notebook shows the path that was written.
parquet

CPU times: user 6.41 s, sys: 884 ms, total: 7.29 s
Wall time: 7.47 s


'/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/2--test-human/4--aggregated-results/sourmash-search-results.parquet'

# Predict cell type with top hit

## Get the top hit (N = 1) per query cell and search configuration

In [107]:
%%time

predicted_cells_top_hit = search_results.groupby(
    classification_groupby, as_index=False, group_keys=False
).apply(lambda x: x.nlargest(1, 'similarity'))

predicted_cells_top_hit = predicted_cells_top_hit.set_index(f"{query_organism}_cell_id")

print(predicted_cells_top_hit.shape)
predicted_cells_top_hit.head()

(424610, 17)
CPU times: user 30min 26s, sys: 1min 6s, total: 31min 32s
Wall time: 31min 16s


Unnamed: 0_level_0,similarity,mouse_broad_group,search_method,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,human_channel,human_cell_ontology_class,human_narrow_group,human_broad_group,human_compartment_group,database_type,groundtruth_celltype,predicted_celltype
human_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
P3_2__AAACCTGGTTAGAACA,0.047525,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.246955,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.231591,Dendritic,containment,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.016143,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers-remove-common-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.102805,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers-remove-common-kmers,Macrophage,Dendritic


## Add unifying column of predicted and ground truth celltypes

In [108]:
# (Re)derive the organism-agnostic label columns on the top-hit frame.
# The top-hit table displayed before this cell already lists both columns, so
# when the notebook is run top to bottom this just rewrites them with the same
# values; it matters only if the frame was built without them.
predicted_cells_top_hit['groundtruth_celltype'] = predicted_cells_top_hit[ground_truth_celltype_col]
predicted_cells_top_hit['predicted_celltype'] = predicted_cells_top_hit[predicted_celltype_col]

### Write top hit to file

In [109]:
%%time

# Persist the top-hit table next to the full results.
# NOTE: this rebinds `parquet_fn` and `parquet`, which until now pointed at the
# full-results file.
parquet_fn = "sourmash-search-results--top-hit.parquet"
parquet = os.path.join(aggregated_results_dir, parquet_fn)
predicted_cells_top_hit.to_parquet(parquet)

CPU times: user 1.72 s, sys: 83.7 ms, total: 1.8 s
Wall time: 1.9 s


In [110]:
# Display the path of the most recently written parquet file.
parquet 

'/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/2--test-human/4--aggregated-results/sourmash-search-results--top-hit.parquet'

In [111]:
# Shape and first rows of the top-hit table (no random sample).
describe(predicted_cells_top_hit, random=False)

(424610, 17)
--- First 5 entries ---


Unnamed: 0_level_0,similarity,mouse_broad_group,search_method,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,human_channel,human_cell_ontology_class,human_narrow_group,human_broad_group,human_compartment_group,database_type,groundtruth_celltype,predicted_celltype
human_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
P3_2__AAACCTGGTTAGAACA,0.047525,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.246955,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.231591,Dendritic,containment,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.016143,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers-remove-common-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.102805,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers-remove-common-kmers,Macrophage,Dendritic
