In [15]:
%load_ext autoreload
%autoreload 2

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

# Artisanal, small-batch, handmade modules
import classification_metrics
from nb_utils import describe

from celltype_utils import SHARED_CELLTYPES, BROAD_TO_COMPARTMENT, broad_to_compartment
from path_constants import top_hit_paths, FIGURE_FOLDER, sig_outdir_base
from plot_constants import get_cmap_color

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Assign figure folder

In [5]:
# Sub-folder of FIGURE_FOLDER for this notebook's figures.
figure_folder = os.path.join(
    FIGURE_FOLDER,
    "kmermaid_classification_metrics",
)

# Read in classification data

In [6]:
# Load the per-species top-hit tables, keep only the rows whose ground-truth
# AND predicted cell types are both in SHARED_CELLTYPES, and stack everything
# into a single `predicted_cells` frame tagged with a `species` column.
dfs = []
for species, parquet in top_hit_paths.items():
    try:
        df = pd.read_parquet(parquet)
    except FileNotFoundError:
        # Best effort: a species without a parquet is reported and skipped.
        print(f"Could not find parquet for {species}")
        continue

    print(f"Species: {species}")
    describe(df)

    # Build the filtered table in one chain instead of mutating the frame that
    # was just read. The index is renamed to a species-independent label so
    # the concatenated frame has one consistent index name.
    df = (
        df.assign(species=species)
        .query(
            "groundtruth_celltype in @SHARED_CELLTYPES "
            "and predicted_celltype in @SHARED_CELLTYPES"
        )
        .rename_axis("query_cell_id")
    )
    dfs.append(df)

# pd.concat([]) fails with an unhelpful "No objects to concatenate"; say what
# actually went wrong when every parquet was missing.
if not dfs:
    raise ValueError(
        "None of the top-hit parquet files could be read: "
        f"{list(top_hit_paths.values())}"
    )
predicted_cells = pd.concat(dfs, ignore_index=False)

# Rationale for preferring the "remove-common-kmers" databases: without
# removing common k-mers, no AT2 cells are predicted.
# NOTE: this tuple is only declared here; no database filtering is applied to
# `predicted_cells` in this cell.
databases_to_choose = (
    "min-5-percent-shared-kmers-remove-common-kmers",
)

# concat only keeps a categorical dtype when every input shares identical
# categories (otherwise the column becomes object), so cast explicitly before
# using the `.cat` accessor. The cast is a no-op for an already-categorical
# column.
predicted_cells["groundtruth_celltype"] = (
    predicted_cells["groundtruth_celltype"]
    .astype("category")
    .cat.remove_unused_categories()
)
describe(predicted_cells)

Species: mouse
(65940, 24)
--- First 5 entries ---


Unnamed: 0_level_0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,...,mouse2_compartment_group,database_method,mouse2_human_readable,broad_group_matches,broad_group_matches_string,alpha_ksize,alpha_match,mouse2_match,groundtruth_celltype,predicted_celltype
mouse2_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.296154,Monocyte,containment,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,...,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"DNA, k:21","DNA, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.27151,Monocyte,containment,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,...,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"dayhoff, k:51","dayhoff, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.296568,Monocyte,containment,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,...,lymphoid,min-10-percent-shared-kmers: containment,male:female,False,different celltype,"protein, k:30","protein, different celltype","male:female, different celltype",T cell,Monocyte
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.507136,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,...,lymphoid,min-10-percent-shared-kmers: similarity_abundance,male:female,True,same celltype,"DNA, k:21","DNA, same celltype","male:female, same celltype",T cell,T cell
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.537653,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,...,lymphoid,min-10-percent-shared-kmers: similarity_abundance,male:female,True,same celltype,"dayhoff, k:51","dayhoff, same celltype","male:female, same celltype",T cell,T cell


Species: lemur
(147236, 17)
--- First 5 entries ---


Unnamed: 0_level_0,similarity,mouse_broad_group,search_method,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,lemur_channel,lemur_cell_ontology_class,lemur_narrow_group,lemur_broad_group,lemur_compartment_group,database_type,groundtruth_celltype,predicted_celltype
lemur_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ANTOINE_LUNG_CD31__AAACCTGAGAAACGAG,0.049508,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,ANTOINE_LUNG_CD31,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
ANTOINE_LUNG_CD31__AAACCTGAGAAACGAG,0.114944,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,ANTOINE_LUNG_CD31,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
ANTOINE_LUNG_CD31__AAACCTGAGAAACGAG,0.110182,Dendritic,containment,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,ANTOINE_LUNG_CD31,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
ANTOINE_LUNG_CD31__AAACCTGAGAAACGAG,0.081077,Macrophage,similarity_abundance,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,ANTOINE_LUNG_CD31,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Macrophage
ANTOINE_LUNG_CD31__AAACCTGAGAAACGAG,0.09006,Macrophage,similarity_abundance,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,ANTOINE_LUNG_CD31,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Macrophage


Species: bat
(216998, 17)
--- First 5 entries ---


Unnamed: 0_level_0,similarity,mouse_broad_group,search_method,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,bat_channel,bat_cell_ontology_class,bat_narrow_group,bat_broad_group,bat_compartment_group,database_type,groundtruth_celltype,predicted_celltype
bat_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
bat2-LU__AAACCTGAGGTACTCT,0.021308,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,bat2-LU,,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
bat2-LU__AAACCTGAGGTACTCT,0.056749,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,bat2-LU,,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
bat2-LU__AAACCTGAGGTACTCT,0.064116,Dendritic,containment,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,bat2-LU,,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
bat2-LU__AAACCTGAGGTACTCT,0.060978,Macrophage,similarity_abundance,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,bat2-LU,,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Macrophage
bat2-LU__AAACCTGAGGTACTCT,0.03015,Dendritic,similarity_abundance,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,bat2-LU,,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic


Species: human
(424610, 17)
--- First 5 entries ---


Unnamed: 0_level_0,similarity,mouse_broad_group,search_method,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,human_channel,human_cell_ontology_class,human_narrow_group,human_broad_group,human_compartment_group,database_type,groundtruth_celltype,predicted_celltype
human_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
P3_2__AAACCTGGTTAGAACA,0.047525,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.246955,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.231591,Dendritic,containment,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.016143,Dendritic,containment,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers-remove-common-kmers,Macrophage,Dendritic
P3_2__AAACCTGGTTAGAACA,0.102805,Dendritic,containment,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,P3_2,macrophage,Macrophage,Macrophage,myeloid,min-10-percent-shared-kmers-remove-common-kmers,Macrophage,Dendritic


(854784, 41)
--- First 5 entries ---


Unnamed: 0_level_0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,...,bat_channel,bat_cell_ontology_class,bat_narrow_group,bat_broad_group,bat_compartment_group,human_channel,human_cell_ontology_class,human_narrow_group,human_broad_group,human_compartment_group
query_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.296154,Monocyte,containment,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,...,,,,,,,,,,
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.27151,Monocyte,containment,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,...,,,,,,,,,,
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.296568,Monocyte,containment,min-10-percent-shared-kmers,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,...,,,,,,,,,,
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.507136,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,...,,,,,,,,,,
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.537653,T cell,similarity_abundance,min-10-percent-shared-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,...,,,,,,,,,,


In [7]:
# Number of prediction rows contributed by each species.
predicted_cells["species"].value_counts()

human    424610
bat      216998
lemur    147236
mouse     65940
Name: species, dtype: int64

### Ignore mouse-to-mouse predictions whose query cells come from the same mouse as the test mouse (`MACA_18m_M_LUNG_52`)

In [8]:
# Flag rows that are mouse predictions AND whose query cell id (the index)
# starts with the MACA_18m_M_LUNG_52 prefix; print how many rows are flagged.
is_mouse_row = predicted_cells["species"] == "mouse"
has_self_prefix = predicted_cells.index.str.startswith("MACA_18m_M_LUNG_52")
self2self_rows = is_mouse_row & has_self_prefix
print(self2self_rows.sum())

35160


In [9]:
# Drop the flagged self-to-self rows. Take an explicit copy so that columns
# assigned to `predicted_cells` afterwards are written to an independent frame
# rather than to a slice of the unfiltered one (avoids pandas'
# SettingWithCopyWarning).
predicted_cells = predicted_cells.loc[~self2self_rows].copy()

## Add predicted compartment

In [10]:
# Derive a "<label>_compartment" column from each "<label>_celltype" column by
# mapping it through `broad_to_compartment` (predicted first, then groundtruth).
for label_kind in ("predicted", "groundtruth"):
    celltype_column = predicted_cells[f"{label_kind}_celltype"]
    predicted_cells[f"{label_kind}_compartment"] = celltype_column.map(
        broad_to_compartment
    )

In [11]:
# predicted_cells.to_parq

## Add `database_method` column for containment vs similarity

In [12]:
# Label each row as "<database_type>: <search_method>".
database_labels = predicted_cells["database_type"].astype(str)
method_labels = predicted_cells["search_method"].astype(str)
predicted_cells["database_method"] = database_labels + ": " + method_labels

In [13]:
# Keep only the rows searched against the min-5-percent database with common
# k-mers removed, using the similarity_abundance search method.
uses_min_5_percent_db = (
    predicted_cells["database_type"]
    == "min-5-percent-shared-kmers-remove-common-kmers"
)
uses_similarity_abundance = predicted_cells["search_method"] == "similarity_abundance"
predicted_cells_min_5_percent = predicted_cells.loc[
    uses_min_5_percent_db & uses_similarity_abundance
]
describe(predicted_cells_min_5_percent)

(84753, 43)
--- First 5 entries ---


Unnamed: 0_level_0,similarity,mouse1_broad_group,search_method,database_type,sketch_id,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value,...,bat_narrow_group,bat_broad_group,bat_compartment_group,human_channel,human_cell_ontology_class,human_narrow_group,human_broad_group,human_compartment_group,predicted_compartment,groundtruth_compartment
query_cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.086712,T cell,similarity_abundance,min-5-percent-shared-kmers-remove-common-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,...,,,,,,,,,lymphoid,lymphoid
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.119992,T cell,similarity_abundance,min-5-percent-shared-kmers-remove-common-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,...,,,,,,,,,lymphoid,lymphoid
MACA_18m_F_LUNG_51__AAACCTGCAGCCTATA,0.114376,T cell,similarity_abundance,min-5-percent-shared-kmers-remove-common-kmers,alphabet-protein__ksize-30__scaled-10,alphabet,protein,30,scaled,10,...,,,,,,,,,lymphoid,lymphoid
MACA_18m_F_LUNG_51__AAACCTGGTGTGGTTT,0.181781,B cell,similarity_abundance,min-5-percent-shared-kmers-remove-common-kmers,alphabet-DNA__ksize-21__scaled-10,alphabet,DNA,21,scaled,10,...,,,,,,,,,lymphoid,lymphoid
MACA_18m_F_LUNG_51__AAACCTGGTGTGGTTT,0.170558,B cell,similarity_abundance,min-5-percent-shared-kmers-remove-common-kmers,alphabet-dayhoff__ksize-51__scaled-10,alphabet,dayhoff,51,scaled,10,...,,,,,,,,,lymphoid,lymphoid


## Write selected database type and search method to file

In [16]:
# Persist the selected subset as a parquet file under `sig_outdir_base`.
aggregated_parquet = os.path.join(sig_outdir_base, "aggregated-predicted-cells.parquet")
predicted_cells_min_5_percent.to_parquet(aggregated_parquet)

In [25]:
# NOTE(review): scratch expression unrelated to the analysis; presumably a
# leftover kernel check that can be deleted.
1 + 1

2

In [28]:
# Sanity check: tally the search methods present in the selected subset.
predicted_cells_min_5_percent["search_method"].value_counts()

similarity_abundance    84753
Name: search_method, dtype: int64