# Goal for how to organize the signatures

Copy all ribosomal-removed signatures to these subfolders


`DNA`, `protein`, and `dayhoff` are used as shorthand below. In practice, each would be the full sketch id, e.g. `alphabet-DNA__ksize-21__scaled-10`.

```
mouse2mouse/
    self2self-bootstrapped/
        0--train-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        1--test-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...                
        2--train-mouse1-merged-celltype-sigs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        3--train-mouse1-celltype-remove-common-kmers/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        4--train-mouse1-celltype-sbt-dbs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        5--search-results/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
    mouse1-mouse2/
        0--train-mouse1-single-cells/
            DNA/
            protein/
            dayhoff/
        1--test-mouse2-single-cells/
            DNA/
            protein/
            dayhoff/
        2--train-mouse1-merged-celltype-sigs/
            DNA/
            protein/
            dayhoff/
        3--train-mouse1-celltype-remove-common-kmers/
            DNA/
            protein/
            dayhoff/
        4--train-mouse1-celltype-sbt-dbs/
            DNA/
            protein/
            dayhoff/
        5--search-results/
            DNA/
            protein/
            dayhoff/
train-mouse/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--merged-celltype-sigs/
        DNA/
        protein/
        dayhoff/
    3--celltype-remove-common-kmers/
        DNA/
        protein/
        dayhoff/
    4--celltype-sbt-dbs/
        DNA/
        protein/
        dayhoff/
test-human/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-lemur/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-bat/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
```

# Imports

In [None]:
import glob
import os
import shutil

import pandas as pd
import scanpy as sc
import sig_utils
from joblib import Parallel, delayed
from tqdm import tqdm

import create_sourmash_command_utils as csc_utils

import sig_reorg

pd.options.display.max_rows = 500

%load_ext autoreload
%autoreload 2

## Def Describe

In [None]:
%%file pandas_utils.py

def describe(df, random=False, n=5):
    """Print the shape of ``df`` and display a preview of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarize.
    random : bool, optional
        When true, also display a random sample of rows.
    n : int, optional
        Number of rows shown in the head preview and in the random sample.
        The random sample is capped at ``len(df)`` so that small tables do
        not raise.

    Returns
    -------
    None
    """
    # This function lives in a module written with %%file and then imported,
    # so bring `display` into scope explicitly rather than relying on the name
    # being available implicitly; fall back to `print` when IPython is absent.
    try:
        from IPython.display import display
    except ImportError:
        display = print

    print(df.shape)
    print(f"--- First {n} entries ---")
    display(df.head(n))
    if random:
        print('--- Random subset ---')
        # DataFrame.sample raises if asked for more rows than exist
        display(df.sample(min(n, len(df))))

In [None]:
from pandas_utils import describe

## Read one2one h5ad

In [None]:
ls -lha /home/olga/data_sm/immune-evolution/h5ads/

In [5]:
ad = sc.read('/home/olga/data_sm/immune-evolution/h5ads/human-mouse-bat__one2one_orthologs_raw.h5ad')
ad.obs.tissue.value_counts()

Lung    63575
Name: tissue, dtype: int64

In [6]:
ad.obs.groupby(['species', 'individual'], observed=True).size()

species  individual
Human    TSP1          11628
         TSP2          19489
Mouse    18-F-50         320
         18-F-51        1138
         18-M-52        1573
         18-M-53        1256
         21-F-54        1312
         21-F-55        1526
         30-M-2         7728
         30-M-3         1565
         30-M-4          920
         30-M-5          496
         1-M-62         1043
         1-M-63         1384
         3-F-56          348
         3-F-57          979
         3-M-5/6         578
         3-M-7/8        1636
nan      nan            8656
dtype: int64

In [7]:
ad.obs.head()

Unnamed: 0,original_cell_id,individual,tissue,subtissue,cell_barcode,channel,sex,n_counts,n_genes,cell_ontology_class,...,clustertype,cluster,celltype,celltype_tiss,clustercellnumber,newcelltype,clustertonumber,tissue_lower,sex_abbrev,cell_id
TSP1_lung_2__AAACCCACAACTCGTA-0,AAACCCACAACTCGTA_TSP1_lung_2-5-human,TSP1,Lung,,AAACCCACAACTCGTA,TSP1_lung_2,female,31791.0,1470.0,basophil,...,,,,,,,,,,
TSP1_lung_2__AAACCCAGTCCGACGT-0,AAACCCAGTCCGACGT_TSP1_lung_2-5-human,TSP1,Lung,,AAACCCAGTCCGACGT,TSP1_lung_2,female,19200.0,3756.0,macrophage,...,,,,,,,,,,
TSP1_lung_2__AAACCCAGTTCAGTAC-0,AAACCCAGTTCAGTAC_TSP1_lung_2-5-human,TSP1,Lung,,AAACCCAGTTCAGTAC,TSP1_lung_2,female,20952.0,3885.0,macrophage,...,,,,,,,,,,
TSP1_lung_2__AAACGAAGTATCGATC-0,AAACGAAGTATCGATC_TSP1_lung_2-5-human,TSP1,Lung,,AAACGAAGTATCGATC,TSP1_lung_2,female,20300.0,4043.0,macrophage,...,,,,,,,,,,
TSP1_lung_2__AAACGAATCATTCATC-0,AAACGAATCATTCATC_TSP1_lung_2-5-human,TSP1,Lung,,AAACGAATCATTCATC,TSP1_lung_2,female,20878.0,4284.0,macrophage,...,,,,,,,,,,


In [8]:
# observed=True reports only species/tissue combinations that actually occur,
# consistent with the other groupby summaries in this notebook.
ad.obs.groupby(['species', 'tissue'], observed=True).size()

species  tissue
Human    Lung      31117
Mouse    Lung      23802
nan      Lung       8656
dtype: int64

In [9]:
ls -lha /home/olga/data_sm/immune-evolution/h5ads/human-mouse/

total 15G
drwxr-xr-x 2 olga    czb 4.0K Mar 30 08:57 [0m[01;34m.[0m/
drwxrwxr-x 6 phoenix czb 4.0K Mar 31 09:03 [01;34m..[0m/
-rw-r--r-- 1 olga    czb 444M Jan 11 15:05 human-mouse-one2one-orthologs--bladder.h5ad
-rw-r--r-- 1 olga    czb 5.1G Jan 11 15:04 human-mouse-one2one-orthologs.h5ad
-rw-r--r-- 1 olga    czb 791M Jan 11 15:04 human-mouse-one2one-orthologs--lung.h5ad
-rw-r--r-- 1 olga    czb 209M Jan 11 15:04 human-mouse-one2one-orthologs--muscle.h5ad
-rw-r--r-- 1 olga    czb   44 Mar 30 08:57 README.md
-rw-r--r-- 1 olga    czb 3.1G Jan 11 14:58 tabula-muris-senis--human-mouse-one2one-ortholog-subset.h5ad
-rw-r--r-- 1 olga    czb 4.9G Jan 11 14:50 tabula-sapiens--human-mouse-one2one-ortholog-subset.h5ad


In [10]:
! echo "This is Tabula Sapiens + Tabula Muris Senis" >> /home/olga/data_sm/immune-evolution/h5ads/human-mouse/README.md

In [11]:
human_mouse = sc.read('/home/olga/data_sm/immune-evolution/h5ads/human-mouse/human-mouse-one2one-orthologs--lung.h5ad')
human_mouse

AnnData object with n_obs × n_vars = 54919 × 14245
    obs: 'original_cell_id', 'individual', 'tissue', 'subtissue', 'cell_barcode', 'channel', 'sex', 'n_counts', 'n_genes', 'cell_ontology_class', 'species', 'species_latin', 'narrow_group', 'broad_group', 'compartment_group', 'age', 'cell_ontology_id', 'free_annotation', 'sequencing_run', 'species_batch'
    var: 'ensembl_id-human', 'len-human', 'mouse_gene_symbol-mouse', 'n_cells-mouse'

In [12]:
human_mouse.obs.species_batch.value_counts()

human    31117
mouse    23802
Name: species_batch, dtype: int64

In [13]:
human_mouse.obs.head()

Unnamed: 0_level_0,original_cell_id,individual,tissue,subtissue,cell_barcode,channel,sex,n_counts,n_genes,cell_ontology_class,species,species_latin,narrow_group,broad_group,compartment_group,age,cell_ontology_id,free_annotation,sequencing_run,species_batch
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
TSP1_lung_2__AAACCCACAACTCGTA,AAACCCACAACTCGTA_TSP1_lung_2-5-human,TSP1,Lung,,AAACCCACAACTCGTA,TSP1_lung_2,female,31791.0,1470.0,basophil,Human,Homo sapiens,,,,,,,,human
TSP1_lung_2__AAACCCAGTCCGACGT,AAACCCAGTCCGACGT_TSP1_lung_2-5-human,TSP1,Lung,,AAACCCAGTCCGACGT,TSP1_lung_2,female,19200.0,3756.0,macrophage,Human,Homo sapiens,macrophage,macrophage,myeloid,,,,,human
TSP1_lung_2__AAACCCAGTTCAGTAC,AAACCCAGTTCAGTAC_TSP1_lung_2-5-human,TSP1,Lung,,AAACCCAGTTCAGTAC,TSP1_lung_2,female,20952.0,3885.0,macrophage,Human,Homo sapiens,macrophage,macrophage,myeloid,,,,,human
TSP1_lung_2__AAACGAAGTATCGATC,AAACGAAGTATCGATC_TSP1_lung_2-5-human,TSP1,Lung,,AAACGAAGTATCGATC,TSP1_lung_2,female,20300.0,4043.0,macrophage,Human,Homo sapiens,macrophage,macrophage,myeloid,,,,,human
TSP1_lung_2__AAACGAATCATTCATC,AAACGAATCATTCATC_TSP1_lung_2-5-human,TSP1,Lung,,AAACGAATCATTCATC,TSP1_lung_2,female,20878.0,4284.0,macrophage,Human,Homo sapiens,macrophage,macrophage,myeloid,,,,,human


In [14]:
# Lung-only AnnData combining human, lemur, mouse and bat cells
lung_h5ad_path = "/home/olga/data_sm/immune-evolution/h5ads/human-lemur-mouse-bat/human-lemur-mouse-bat__lung_only.h5ad"
adata = sc.read(lung_h5ad_path)

# Re-key the per-cell metadata by `cell_id`; the previous index is kept as a column
obs_by_cell_id = adata.obs.reset_index().set_index('cell_id')
adata.obs = obs_by_cell_id

print(adata)
adata.obs.head()

AnnData object with n_obs × n_vars = 126745 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'


Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P2_1__AAACCTGAGAAACCAT,P2_1_AAACCTGAGAAACCAT-human-hlm,46y,AAACCTGAGAAACCAT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AAATGCCAGATGAGAG,P2_1_AAATGCCAGATGAGAG-human-hlm,46y,AAATGCCAGATGAGAG,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGATCCCT,P2_1_AACACGTTCGATCCCT-human-hlm,46y,AACACGTTCGATCCCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGCACTCT,P2_1_AACACGTTCGCACTCT-human-hlm,46y,AACACGTTCGCACTCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACCATGCAGCTCGCA,P2_1_AACCATGCAGCTCGCA-human-hlm,46y,AACCATGCAGCTCGCA,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2


In [15]:
adata.obs.species.value_counts()

Human          57353
Mouse lemur    36934
Mouse          23802
Bat             8656
Name: species, dtype: int64

In [16]:
adata.obs.query('species == "Mouse"').groupby(["age", "channel"], observed=True).size()

age  channel           
18m  MACA_18m_F_LUNG_50     320
     MACA_18m_F_LUNG_51    1138
     MACA_18m_M_LUNG_52    1573
     MACA_18m_M_LUNG_53    1256
21m  MACA_21m_F_LUNG_54    1312
     MACA_21m_F_LUNG_55    1526
30m  10X_P1_6              7728
     10X_P1_14             1565
     10X_P2_5               920
     10X_P3_0               496
1m   10X_P5_13             1043
     10X_P5_14             1384
3m   10X_P7_8               348
     10X_P7_9               979
     10X_P8_12              578
     10X_P8_13             1636
dtype: int64

## Set shared celltypes

In [17]:
# Labels matched against `obs.broad_group` to subset the cells
SHARED_CELLTYPES = [
    "Alveolar Epithelial Type 2",
    "B cell",
    "Capillary",
    "Dendritic",
    "Fibroblast",
    "Macrophage",
    "Monocyte",
    "Natural Killer T cell",
    "Smooth Muscle and Myofibroblast",
    "T cell",
]
len(SHARED_CELLTYPES)

10

In [18]:
adata_shared = adata[adata.obs.broad_group.isin(SHARED_CELLTYPES)]
adata_shared

  if not is_categorical(df_full[k]):


View of AnnData object with n_obs × n_vars = 89795 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'

In [34]:
# Human cells among the shared cell types; the obs index holds the cell ids
human_shared_obs = adata_shared.obs.query("species == 'Human'")
CELL_IDS_TO_USE = set(human_shared_obs.index)
len(CELL_IDS_TO_USE)

41675

In [35]:
# adata_shared.obs.query('species == "Mouse"').groupby(
#     ["channel", "broad_group"], observed=True
# ).size()

In [36]:
adata_shared.obs.query('species == "Mouse"').groupby(
    ['age', "channel"],
    observed=True,
).broad_group.nunique()

age  channel           
18m  MACA_18m_F_LUNG_50     9
     MACA_18m_F_LUNG_51    10
     MACA_18m_M_LUNG_52    10
     MACA_18m_M_LUNG_53    10
21m  MACA_21m_F_LUNG_54    10
     MACA_21m_F_LUNG_55    10
30m  10X_P1_6               8
     10X_P1_14              9
     10X_P2_5               9
     10X_P3_0               9
1m   10X_P5_13             10
     10X_P5_14              9
3m   10X_P7_8              10
     10X_P7_9              10
     10X_P8_12              8
     10X_P8_13              8
Name: broad_group, dtype: int64

## Set signature outdir

In [37]:
ls -lha $sig_outdir_base

total 11K
drwxr-xr-x  7 olga czb 4.0K Mar  5 12:02 [0m[01;34m.[0m/
drwxrwxr-x  8 olga czb 8.0K Mar 16 10:36 [01;34m..[0m/
drwxr-xr-x  4 olga czb 4.0K Feb 28 09:53 [01;34m0--mouse2mouse[0m/
drwxr-xr-x 20 olga czb 4.0K Apr 16 09:10 [01;34m1--train-mouse[0m/
drwxr-xr-x 17 olga czb 4.0K Apr 16 09:10 [01;34m2--test-human[0m/
drwxr-xr-x 21 olga czb 4.0K Apr 16 09:10 [01;34m3--test-bat[0m/
drwxr-xr-x 20 olga czb 4.0K Apr 16 17:56 [01;34m4--test-lemur[0m/


In [38]:
# Remove the `mouse/single-cell` folder under the signature output root.
# Done in Python rather than with a shell `rm -rf` so that an undefined
# `sig_outdir_base` raises NameError here instead of possibly reaching the
# shell unexpanded and resolving to a path rooted at `/`.
shutil.rmtree(os.path.join(sig_outdir_base, "mouse", "single-cell"), ignore_errors=True)

## sketch ids of interest

In [39]:
# (alphabet, k-mer size) pairs of interest
alpha_ksizes = (
    ("DNA", 21),
    ("protein", 30),
    ("dayhoff", 51),
)

# One sketch id per pair, all with scaled=10
sketch_ids = []
for alpha, ksize in alpha_ksizes:
    sketch_id = sig_utils.make_sketch_id(
        alpha=alpha, ksize=ksize, style='scaled', value=10
    )
    sketch_ids.append(sketch_id)
sketch_ids

['alphabet-DNA__ksize-21__scaled-10',
 'alphabet-protein__ksize-30__scaled-10',
 'alphabet-dayhoff__ksize-51__scaled-10']

In [25]:
nucleotide_sketch_ids = sketch_ids[:1]
nucleotide_sketch_ids

['alphabet-DNA__ksize-21__scaled-10']

In [26]:
peptide_sketch_ids = sketch_ids[1:]
peptide_sketch_ids

['alphabet-protein__ksize-30__scaled-10',
 'alphabet-dayhoff__ksize-51__scaled-10']

In [27]:
import sig_reorg

# Human

## Single cell fastas

In [28]:
! ls -lha $sig_outdir_base

total 267M
drwxr-xr-x 7 olga czb  12K Apr 16 17:58  .
drwxr-xr-x 7 olga czb 4.0K Apr 16 08:15  ..
-rw-r--r-- 1 olga czb  18K Jan 11 10:02  000_initial_analyses_with_sketches.ipynb
-rw-r--r-- 1 olga czb  49K Jan 11 10:02  001_umap_initial_sketches.ipynb
-rw-r--r-- 1 olga czb  134 Jan 11 10:02  001_umap_initial_sketches-requirements.txt
-rw-r--r-- 1 olga czb  92K Feb 24 11:08  002_get_one2one_orthologs.ipynb
-rw-r--r-- 1 olga czb 117K Jan 11 10:02  003_subset_h5ad_one2one_orthologs.ipynb
-rw-r--r-- 1 olga czb 292K Jan 11 10:02  004_interferon_kmer_expression.ipynb
-rw-r--r-- 1 olga czb  71K Jan 11 10:02  005_lemur_fastqs_fastas_sigs.ipynb
-rw-r--r-- 1 olga czb  75K Jan 11 10:02  006_mismatched_ksize_molecule_signatures.ipynb
-rw-r--r-- 1 olga czb  90K Jan 11 10:02  007_bat_annotation_unification.ipynb
-rw-r--r-- 1 olga czb 1.5M Jan 11 10:02  009_nk_cell_receptor_sigs.ipynb
-rw-r--r-- 1 olga czb  34K Jan 11 10:02  010_orthogroup_subset_mammalia.ipynb
-rw-r--r-- 1 olga czb  36K Jan 11 10:0

In [29]:
human_kmermaid_base = "/home/olga/data_sm/tabula-microcebus/analyses/kmermaid/tenx-tgz--hlca-v2--normal-lung-remove-riborna"

In [30]:
# Root directory of the reorganized k-mer signatures
sig_outdir_base = '/home/olga/data_sm/immune-evolution/kmer-signatures'

human_test_dir = os.path.join(sig_outdir_base, "2--test-human")

# Numbered output folders for the human test set
human_test_0_fastas = os.path.join(human_test_dir, "0--fastas")
human_test_1_single_cells = os.path.join(human_test_dir, "1--single-cell-sigs")
# Input fastas come from the kmermaid "translate" output folder
human_fasta_input_dir = os.path.join(human_kmermaid_base, "translate")

results_dir = os.path.join(human_test_dir, "2--mouse-search-results")
dirs = (human_test_0_fastas, human_test_1_single_cells, results_dir)

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test
for outdir in dirs:
    os.makedirs(outdir, exist_ok=True)


## Copy fastas

In [65]:
# Copy fastas for the selected human cells (CELL_IDS_TO_USE) into the human
# test fasta folder; the progress output reports a peptide pass and a
# nucleotide pass.
# NOTE(review): the positional order (output dir first, input dir second) is
# inferred from the variable names — confirm against sig_reorg.copy_fastas.
sig_reorg.copy_fastas(
    human_test_0_fastas,
    human_fasta_input_dir,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False,
)

0it [00:00, ?it/s]

Copying peptides


54305it [03:21, 269.80it/s]
0it [00:00, ?it/s]

Copying nucleotides


54305it [03:21, 269.13it/s]


## Single cell signatures

In [30]:

# Source sketch folders under the human kmermaid output, one per sequence type
human_nucleotide_sketch_dir = os.path.join(
    human_kmermaid_base,
    "sketches_nucleotide_handmade_merged_remove_ribosomal_dissociation_allcells",
)
human_peptide_sketch_dir = os.path.join(
    human_kmermaid_base, "sketches_peptide_handmade_merged_remove_ribosomal_dissociation_allcells"
)

In [31]:
# Copy the nucleotide and peptide sketches for the selected human cells into
# the single-cell signature folder; the progress output shows one pass per
# sketch id (DNA, protein, dayhoff).
# NOTE(review): `pre_sketch_id_outdir` presumably gets one subfolder per
# sketch id — confirm against sig_reorg.copy_nucleotide_peptide_sketches.
sig_reorg.copy_nucleotide_peptide_sketches(
    peptide_sketch_dir=human_peptide_sketch_dir,
    nucleotide_sketch_dir=human_nucleotide_sketch_dir,
    pre_sketch_id_outdir=human_test_1_single_cells,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False
)

219it [00:00, 2189.94it/s]

Copying alphabet-DNA__ksize-21__scaled-10


14153it [03:32, 66.47it/s] 
8it [00:00, 63.92it/s]

Copying alphabet-protein__ksize-30__scaled-10


20059it [01:27, 230.41it/s]
22it [00:00, 212.30it/s]

Copying alphabet-dayhoff__ksize-51__scaled-10


20059it [01:26, 231.48it/s]


# Create queries

In [45]:
human_test_dir

'/home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human'

## Search mouse celltypes, with and without common k-mers removed

In [62]:
moltypes = "DNA", "protein", "dayhoff"

alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_4_celltype_sbts = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/4--celltype-sbt-dbts"
train_3_merged_celltype_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers"
train_2_merged_celltype = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs"

# Try both with removed common hashes and original
celltype_dirs = dict(
    remove_common=train_3_merged_celltype_remove_common,
    raw_merged=train_2_merged_celltype,
)

txts = []
for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
    this_celltype_dir_results = os.path.join(
        human_test_dir, f"3-{i}--mouse-search-results--{name}"
    )
    if not os.path.exists(this_celltype_dir_results):
        os.makedirs(this_celltype_dir_results)
    for alpha, ksize in alpha_ksizes:
        txt = csc_utils.make_sourmash_search_commands(
            this_celltype_dir_results,
            query_sigs_dir=human_test_1_single_cells,
            sbt_base_dir=celltype_dir,
            k_sizes=[ksize],
            scaled_sizes=[
                10,
            ],
            sbt_template_basename=None,
            query_sig_files=False,
            containment=False,
            moltype=alpha,
            threshold=1e-100,
            n_jobs=96,
            force=True,
            num_results=None,
            add_ksize_to_sig_dir=False,
            traverse_directory=True,
        )
        txts.append(txt)
txts_to_cat = " ".join(txts)
catted_txt = os.path.join(results_dir, "sourmash_search_commands_concatenated.txt")
! cat $txts_to_cat > $catted_txt

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")

parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--remove_common/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--remove_common/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--remove_common/sourmash_search_commands_dayhoff.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-search-results--raw_merged/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-search-results--raw_merged/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/

## Search mouse celltypes with k-mers shared by 5% or 10% of cells

In [68]:
moltypes = "DNA", "protein", "dayhoff"

alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_4_celltype_sbts = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/4--celltype-sbt-dbts"
train_3_sigs_min_10_percent = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs--min-kmer-count--10-percent"
train_3_sigs_min_5_percent = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs--min-kmer-count--5-percent"
# train_2_merged_celltype = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs"

# Try both with removed common hashes and original
celltype_dirs = {
    'min-5-percent-shared-kmers': train_3_sigs_min_5_percent,
    'min-10-percent-shared-kmers': train_3_sigs_min_10_percent,
}

txts = []
for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
    this_celltype_dir_results = os.path.join(
        human_test_dir, f"3-{i}--mouse-search-results--{name}"
    )
    if not os.path.exists(this_celltype_dir_results):
        os.makedirs(this_celltype_dir_results)
    for alpha, ksize in alpha_ksizes:
        txt = csc_utils.make_sourmash_search_commands(
            this_celltype_dir_results,
            query_sigs_dir=human_test_1_single_cells,
            sbt_base_dir=celltype_dir,
            k_sizes=[ksize],
            scaled_sizes=[
                10,
            ],
            sbt_template_basename=None,
            query_sig_files=False,
            containment=False,
            moltype=alpha,
            threshold=1e-100,
            n_jobs=96,
            force=True,
            num_results=None,
            add_ksize_to_sig_dir=False,
            traverse_directory=True,
        )
        txts.append(txt)
txts_to_cat = " ".join(txts)
catted_txt = os.path.join(results_dir, "sourmash_search_commands_concatenated.txt")
! cat $txts_to_cat > $catted_txt

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")

parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--min-5-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--min-5-percent-shared-kmers/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--min-5-percent-shared-kmers/sourmash_search_commands_dayhoff.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-search-results--min-10-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-search-results--min-10-percent-shared-kmers/sourmash_search_commands_protein.txt
parallel --progress --eta --job

## Search mouse celltypes with k-mers shared by 5% or 10% of cells, but remove k-mers shared by 80% of celltypes

In [70]:
moltypes = "DNA", "protein", "dayhoff"

alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_4_celltype_sbts = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/4--celltype-sbt-dbts"
train_3_sigs_min_5_percent_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--5-percent"
train_3_sigs_min_10_percent_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--10-percent"

# Try both with removed common hashes and original
celltype_dirs = {
    'min-5-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_5_percent_remove_common,
    'min-10-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_10_percent_remove_common,
}

txts = []
for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
    this_celltype_dir_results = os.path.join(
        human_test_dir, f"3-{i}--mouse-search-results--{name}"
    )
    if not os.path.exists(this_celltype_dir_results):
        os.makedirs(this_celltype_dir_results)
    for alpha, ksize in alpha_ksizes:
        txt = csc_utils.make_sourmash_search_commands(
            this_celltype_dir_results,
            query_sigs_dir=human_test_1_single_cells,
            sbt_base_dir=celltype_dir,
            k_sizes=[ksize],
            scaled_sizes=[
                10,
            ],
            sbt_template_basename=None,
            query_sig_files=False,
            containment=False,
            moltype=alpha,
            threshold=1e-100,
            n_jobs=96,
            force=True,
            num_results=None,
            add_ksize_to_sig_dir=False,
            traverse_directory=True,
        )
        txts.append(txt)
txts_to_cat = " ".join(txts)
catted_txt = os.path.join(results_dir, "sourmash_search_commands_concatenated.txt")
! cat $txts_to_cat > $catted_txt

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")

parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--min-5-percent-shared-kmers-remove-common-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--min-5-percent-shared-kmers-remove-common-kmers/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-search-results--min-5-percent-shared-kmers-remove-common-kmers/sourmash_search_commands_dayhoff.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-search-results--min-10-percent-shared-kmers-remove-common-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-search-results--min-10-percent-s

## CONTAINMENT - Search mouse celltypes with k-mers shared by 5% or 10% of cells, with and without removing k-mers shared by 80% of celltypes

In [71]:
moltypes = "DNA", "protein", "dayhoff"

alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_4_celltype_sbts = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/4--celltype-sbt-dbts"
train_3_sigs_min_5_percent_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--5-percent"
train_3_sigs_min_10_percent_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--10-percent"

# Try both with removed common hashes and original
celltype_dirs = {
        'min-5-percent-shared-kmers': train_3_sigs_min_5_percent,
    'min-10-percent-shared-kmers': train_3_sigs_min_10_percent,
    'min-5-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_5_percent_remove_common,
    'min-10-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_10_percent_remove_common,
}

txts = []
for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
    this_celltype_dir_results = os.path.join(
        human_test_dir, f"3-{i}--mouse-containment-search-results--{name}"
    )
    if not os.path.exists(this_celltype_dir_results):
        os.makedirs(this_celltype_dir_results)
    for alpha, ksize in alpha_ksizes:
        txt = csc_utils.make_sourmash_search_commands(
            this_celltype_dir_results,
            query_sigs_dir=human_test_1_single_cells,
            sbt_base_dir=celltype_dir,
            k_sizes=[ksize],
            scaled_sizes=[
                10,
            ],
            sbt_template_basename=None,
            query_sig_files=False,
            containment=True,
            moltype=alpha,
            threshold=1e-100,
            n_jobs=96,
            force=True,
            num_results=None,
            add_ksize_to_sig_dir=False,
            traverse_directory=True,
        )
        txts.append(txt)
txts_to_cat = " ".join(txts)
catted_txt = os.path.join(results_dir, "sourmash_containment_search_commands_concatenated.txt")
! cat $txts_to_cat > $catted_txt

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")

parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_dayhoff.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-containment-search-results--min-10-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human/3-1--mouse-containment-search-results--min-10-percent-shared-kmers/sourmash