# Goal for how to organize the signatures

Copy all ribosomal-removed signatures to these subfolders


`DNA`, `protein`, and `dayhoff` are used as shorthand below. The real directories use the full sketch id, e.g. `alphabet-DNA__ksize-21__scaled-10`.

```
mouse2mouse/
    self2self-bootstrapped/
        0--train-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        1--test-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...                
        2--train-mouse1-merged-celltype-sigs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        3--train-mouse1-celltype-remove-common-kmers/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        4--train-mouse1-celltype-sbt-dbs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        5--search-results/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
    mouse1-mouse2/
        0--train-mouse1-single-cells/
            DNA/
            protein/
            dayhoff/
        1--test-mouse2-single-cells/
            DNA/
            protein/
            dayhoff/
        2--train-mouse1-merged-celltype-sigs/
            DNA/
            protein/
            dayhoff/
        3--train-mouse1-celltype-remove-common-kmers/
            DNA/
            protein/
            dayhoff/
        4--train-mouse1-celltype-sbt-dbs/
            DNA/
            protein/
            dayhoff/
        5--search-results/
            DNA/
            protein/
            dayhoff/
train-mouse/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--merged-celltype-sigs/
        DNA/
        protein/
        dayhoff/
    3--celltype-remove-common-kmers/
        DNA/
        protein/
        dayhoff/
    4--celltype-sbt-dbs/
        DNA/
        protein/
        dayhoff/
test-human/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-lemur/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-bat/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
```

# Imports

In [2]:
import glob
import os
import shutil

import pandas as pd
import scanpy as sc
import sig_utils
from joblib import Parallel, delayed
from tqdm import tqdm

import create_sourmash_command_utils as csc_utils

import sig_reorg

pd.options.display.max_rows = 500

%load_ext autoreload
%autoreload 2

## Define `describe`

In [3]:
%%file pandas_utils.py

def describe(df, random=False):
    """Print the shape of ``df`` and display its first rows.

    This module does not import ``display``; the name must be resolvable at
    call time as a builtin, as it is inside an IPython/Jupyter session.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarize (any object exposing ``shape``, ``head()``,
        ``sample()`` and ``len()`` works).
    random : bool, default False
        If True, additionally display a random sample of up to 5 rows.

    Returns
    -------
    None
    """
    print(df.shape)
    print("--- First 5 entries ---")
    display(df.head())
    if random:
        print('--- Random subset ---')
        # Cap the sample size at the number of rows: ``df.sample(5)`` raises
        # ValueError when the frame has fewer than 5 rows.
        display(df.sample(min(5, len(df))))

Overwriting pandas_utils.py


In [4]:
from pandas_utils import describe

## Read one2one h5ad

In [5]:
# Load the combined human / mouse lemur / mouse / bat lung AnnData object.
adata = sc.read(
#     "/home/phoenix/data_lg/data_sm_copy/code/immune-evolution/h5ads/human-lemur-mouse-bat__lung_only_copy.h5ad"
    "/home/olga/data_lg/data_sm_copy/immune-evolution/h5ads/human-lemur-mouse-bat/human-lemur-mouse-bat__lung_only.h5ad"
)
# Re-key the per-cell metadata by `cell_id`; reset_index() keeps the previous
# index around as a regular column.
adata.obs = adata.obs.reset_index().set_index('cell_id')
print(adata)
adata.obs.head()

AnnData object with n_obs × n_vars = 126745 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'


Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P2_1__AAACCTGAGAAACCAT,P2_1_AAACCTGAGAAACCAT-human-hlm,46y,AAACCTGAGAAACCAT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AAATGCCAGATGAGAG,P2_1_AAATGCCAGATGAGAG-human-hlm,46y,AAATGCCAGATGAGAG,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGATCCCT,P2_1_AACACGTTCGATCCCT-human-hlm,46y,AACACGTTCGATCCCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGCACTCT,P2_1_AACACGTTCGCACTCT-human-hlm,46y,AACACGTTCGCACTCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACCATGCAGCTCGCA,P2_1_AACCATGCAGCTCGCA-human-hlm,46y,AACCATGCAGCTCGCA,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2


In [6]:
adata.obs.species.value_counts()

Human          57353
Mouse lemur    36934
Mouse          23802
Bat             8656
Name: species, dtype: int64

In [7]:
adata.obs.query('species == "Mouse"').groupby(["age", "channel"], observed=True).size()

age  channel           
18m  MACA_18m_F_LUNG_50     320
     MACA_18m_F_LUNG_51    1138
     MACA_18m_M_LUNG_52    1573
     MACA_18m_M_LUNG_53    1256
21m  MACA_21m_F_LUNG_54    1312
     MACA_21m_F_LUNG_55    1526
30m  10X_P1_6              7728
     10X_P1_14             1565
     10X_P2_5               920
     10X_P3_0               496
1m   10X_P5_13             1043
     10X_P5_14             1384
3m   10X_P7_8               348
     10X_P7_9               979
     10X_P8_12              578
     10X_P8_13             1636
dtype: int64

## Set shared celltypes

In [8]:
# Values of `obs.broad_group` to keep for the downstream comparison.
# NOTE(review): presumably the broad groups annotated in every species — confirm.
SHARED_CELLTYPES = ['Alveolar Epithelial Type 2',
 'B cell',
 'Capillary',
 'Dendritic',
 'Fibroblast',
 'Macrophage',
 'Monocyte',
 'Natural Killer T cell',
 'Smooth Muscle and Myofibroblast',
 'T cell']
# NOTE(review): bare expression in the middle of a cell — its value is not displayed.
len(SHARED_CELLTYPES)

# Restrict the AnnData object to cells whose broad_group is in SHARED_CELLTYPES.
adata_shared = adata[adata.obs.broad_group.isin(SHARED_CELLTYPES)]
adata_shared

  if not is_categorical(df_full[k]):


View of AnnData object with n_obs × n_vars = 89795 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'

In [9]:
# Cell ids (the `cell_id` index of obs) of the mouse lemur cells that belong to
# a shared cell type; passed as `select_cell_ids` to the sig_reorg copy helpers below.
CELL_IDS_TO_USE = set(adata_shared.obs.query('species == "Mouse lemur"').index)
len(CELL_IDS_TO_USE)

19088

In [10]:
sorted(adata_shared.obs.query('species == "Mouse lemur"').channel.unique())

['Antoine_Lung_CD31_10X',
 'Antoine_Lung_EPCAM_10X',
 'Antoine_Lung_P3_10X',
 'Bernard_Lung_10X',
 'Martine_Lung_1_10X',
 'Martine_Lung_2_10X',
 'Stumpy_Lung_10X']

In [11]:
adata_shared.obs.query('species == "Mouse lemur" and channel == "Antoine_Lung_P3_10X"').head()

Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANTOINE_LUNG_P3__AAACCTGAGCTTCGCG,Antoine_Lung_P3_10X_AAACCTGAGCTTCGCG-lemur-hlm,12y,AAACCTGAGCTTCGCG,capillary endothelial cell,CL:0002144,Antoine_Lung_P3_10X,capillary cells (type 2),Antoine,,180917_A00111_0211_AHGKCVDMXX,...,ANTOINE_LUNG_P3,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary,endothelial: lemur,endothelial: Capillary: lemur,lemur_Antoine
ANTOINE_LUNG_P3__AAACCTGAGTCAATAG,Antoine_Lung_P3_10X_AAACCTGAGTCAATAG-lemur-hlm,12y,AAACCTGAGTCAATAG,"CD4-positive, alpha-beta T cell",CL:0000624,Antoine_Lung_P3_10X,t cells (CD4+low CD8- IL7R+),Antoine,,180917_A00111_0211_AHGKCVDMXX,...,ANTOINE_LUNG_P3,,,,hlm,lymphoid: T cell,lymphoid: T cell: CD4+ T,lymphoid: lemur,lymphoid: CD4+ T: lemur,lemur_Antoine
ANTOINE_LUNG_P3__AAACCTGCACCGATAT,Antoine_Lung_P3_10X_AAACCTGCACCGATAT-lemur-hlm,12y,AAACCTGCACCGATAT,T cell,CL:0000084,Antoine_Lung_P3_10X,t cells (CD4+ CD8+),Antoine,,180917_A00111_0211_AHGKCVDMXX,...,ANTOINE_LUNG_P3,,,,hlm,lymphoid: T cell,lymphoid: T cell: CD4+ CD8+ T,lymphoid: lemur,lymphoid: CD4+ CD8+ T: lemur,lemur_Antoine
ANTOINE_LUNG_P3__AAACCTGCAGGTGCCT,Antoine_Lung_P3_10X_AAACCTGCAGGTGCCT-lemur-hlm,12y,AAACCTGCAGGTGCCT,T cell,CL:0000084,Antoine_Lung_P3_10X,t cells (CD4+ CD8+),Antoine,,180917_A00111_0211_AHGKCVDMXX,...,ANTOINE_LUNG_P3,,,,hlm,lymphoid: T cell,lymphoid: T cell: CD4+ CD8+ T,lymphoid: lemur,lymphoid: CD4+ CD8+ T: lemur,lemur_Antoine
ANTOINE_LUNG_P3__AAACCTGCATGTAGTC,Antoine_Lung_P3_10X_AAACCTGCATGTAGTC-lemur-hlm,12y,AAACCTGCATGTAGTC,fibroblast,CL:0000057,Antoine_Lung_P3_10X,fibroblasts (alveolar),Antoine,,180917_A00111_0211_AHGKCVDMXX,...,ANTOINE_LUNG_P3,,,,hlm,stromal: Fibroblast,stromal: Fibroblast: Alveolar Fibroblast,stromal: lemur,stromal: Alveolar Fibroblast: lemur,lemur_Antoine


# Lemur

## Single cell fastas

In [14]:
lemur_kmermaid_base = "/home/olga/data_lg/data_sm_copy/tabula-microcebus/analyses/kmermaid/tenx-tgz--alldata-v2-from-bams--remove-ribo"

In [15]:
ls -lha $lemur_kmermaid_base

total 29M
drwxr-xr-x 15 olga czb 4.0K Mar  1 15:38 [0m[01;34m.[0m/
drwxrwxrwx 26 olga czb 4.0K Oct 16  2020 [34;42m..[0m/
drwxr-xr-x  5 olga czb 4.0K Oct  9  2020 [01;34m10x-fastqs[0m/
drwxr-xr-x  4 olga czb 4.0K Oct  9  2020 [01;34mfastp[0m/
drwxr-xr-x  2 olga czb 4.0K Oct 18  2020 [01;34mpipeline_info[0m/
drwxr-xr-x  2 olga czb 4.0K Oct 12  2020 [01;34mprotein_index[0m/
drwxr-xr-x  7 olga czb 4.0K Mar  1 15:38 [01;34msearch_lemur_in_mouse_no_ribosome_no_containment_no_dissociation__remove_common_hashes_from_mouse_sbt__refseq_removed_ribo[0m[K/
drwxr-xr-x  2 olga czb 8.0M Apr 20 10:19 [01;34msketches_nucleotide_handmade[0m/
drwxr-xr-x  3 olga czb 4.0K Apr 25 17:46 [01;34msketches_nucleotide_handmade_merged[0m/
drwxr-xr-x  3 olga czb 4.0K Apr 29 13:15 [01;34msketches_nucleotide_handmade_merged_remove_ribosomal_dissociation_allcells[0m[K/
drwxr-xr-x  2 olga czb 4.0M Feb 28 19:56 [01;34msketches_peptide_handmade[0m/
drwxr-xr-x  6 olga czb 4.0K Apr 26 11:24 [01;3

In [16]:

# Root of the reorganized k-mer signature tree.
sig_outdir_base = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures"

# All lemur test-set outputs live under this directory.
test_dir = os.path.join(sig_outdir_base, "4--test-lemur")

test_0_fastas = os.path.join(test_dir, "0--fastas")
# ! rm -rf $test_0_fastas
test_1_single_cells = os.path.join(test_dir, "1--single-cell-sigs")
# ! rm -rf $test_1_single_cells
# Per-cell fastas are read from the "translate" subfolder of the kmermaid output.
fasta_input_dir = os.path.join(lemur_kmermaid_base, "translate")

results_dir = os.path.join(test_dir, "2--mouse-search-results")
dirs = (test_0_fastas, test_1_single_cells, results_dir)

for outdir in dirs:
    # exist_ok=True makes re-running this cell a no-op and avoids the
    # check-then-create race of testing os.path.exists() first.
    os.makedirs(outdir, exist_ok=True)

In [17]:
ls -lha $fasta_input_dir/ |tail

-rw-r--r--  1 olga czb   99K Oct 12  2020 ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__TTTGTCATCTTCGGTC__coding_reads_nucleotides.fasta
-rw-r--r--  1 olga czb   77K Oct 12  2020 ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__TTTGTCATCTTCGGTC__coding_reads_peptides.fasta
-rw-r--r--  1 olga czb  1.5M Oct 12  2020 ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__TTTGTCATCTTCGGTC__coding_scores.csv
-rw-r--r--  1 olga czb  1.7K Oct 12  2020 ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__TTTGTCATCTTCGGTC__coding_summary.json
-rw-r--r--  1 olga czb  410K Oct 12  2020 ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__TTTGTCATCTTCGGTC__noncoding_reads_nucleotides.fasta
-rw-r--r--  1 olga czb  105K Oct 12  2020 ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__TTTGTCATCTTTACGT__coding_reads_nucleotides.fasta
-rw-r--r--  1 olga czb   81K Oct 12  2020 ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__TTTGTCATC


## Copy fastas

In [17]:
def clean_fasta_name(basename, strings_to_remove=("__aligned", '__possorted_genome_bam', '_possorted_genome_bam', '__unaligned')):
    """Reduce a per-cell fasta basename to its cell id.

    Every substring in ``strings_to_remove`` is deleted from ``basename``, in
    the order given. Everything from the first ``_coding_reads`` onward is
    then dropped, and leading/trailing underscores are stripped.

    Parameters
    ----------
    basename : str
        Fasta file basename, e.g.
        ``CHANNEL__possorted_genome_bam__aligned__aligned__BARCODE__coding_reads_peptides.fasta``.
    strings_to_remove : iterable of str
        Substrings to delete. May be empty, in which case only the
        ``_coding_reads`` suffix handling is applied.

    Returns
    -------
    str
        The cleaned name, e.g. ``CHANNEL__BARCODE``.
    """
    # Start from the original name so an empty `strings_to_remove` is valid
    # (previously this left the working value as None and crashed on .split).
    new_name = basename
    for to_remove in strings_to_remove:
        new_name = new_name.replace(to_remove, '')

    return new_name.split('_coding_reads')[0].strip("_")

In [18]:
# Copy the per-cell fastas from `fasta_input_dir` into `test_0_fastas`,
# restricted to the cells in CELL_IDS_TO_USE.
# NOTE(review): presumably `cell_id_fun` maps each fasta basename to a cell id
# that is then matched against `select_cell_ids` — confirm in sig_reorg.
sig_reorg.copy_fastas(
    test_0_fastas,
    fasta_input_dir,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False,
    cell_id_fun=clean_fasta_name,
)

0it [00:00, ?it/s]

Copying peptides


19896it [00:01, 17227.27it/s]
0it [00:00, ?it/s]

Copying nucleotides


19896it [00:01, 15871.65it/s]


In [21]:
!ls -lha $test_0_fastas/nucleotides | head

total 4.9G
drwxr-xr-x 2 olga czb 256K Apr 16 17:47 .
drwxr-xr-x 4 olga czb 4.0K Apr 16 17:43 ..
-rw-r--r-- 1 olga czb 4.9M Apr 16 17:45 ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__AAACGGGGTAAGTGTA__coding_reads_nucleotides.fasta
-rw-r--r-- 1 olga czb 3.2M Apr 16 17:46 ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__AAACGGGGTTTCCACC__coding_reads_nucleotides.fasta
-rw-r--r-- 1 olga czb 5.1M Apr 16 17:45 ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__AAAGCAAAGACCTTTG__coding_reads_nucleotides.fasta
-rw-r--r-- 1 olga czb 3.1M Apr 16 17:45 ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__AAAGCAAAGCTGCCCA__coding_reads_nucleotides.fasta
-rw-r--r-- 1 olga czb 6.7M Apr 16 17:44 ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__AAAGCAACATGTCCTC__coding_reads_nucleotides.fasta
-rw-r--r-- 1 olga czb 3.2M Apr 16 17:45 ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__AAAGCAATCGTTTAGG__coding_reads_nucleotides.fasta
-rw-r--r-- 1 olga czb 4.2M A

In [22]:
!ls -lha $test_0_fastas/nucleotides | wc -l

1967


In [19]:
# Spot check: print the first 10 copied fasta paths that contain "P3".
n = 0

for filename in glob.iglob(os.path.join(test_0_fastas, '*', '*.fasta')):
    if "P3" in filename:
        n += 1
        print(filename)
        if n >= 10:
            # Enough examples printed; no need to walk the rest of the directory.
            break

/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/0--fastas/nucleotides/ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__AATCGGTCACGGCTAC__coding_reads_nucleotides.fasta
/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/0--fastas/nucleotides/ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__CATGGCGCAGATCCAT__coding_reads_nucleotides.fasta
/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/0--fastas/nucleotides/ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__GTTCATTTCCTTGACC__coding_reads_nucleotides.fasta
/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/0--fastas/nucleotides/ANTOINE_LUNG_P3__possorted_genome_bam__unaligned__unaligned__CCCAATCTCATCGATG__coding_reads_nucleotides.fasta
/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/0--fastas/nucleotides/ANTOINE_LUNG_P3__possorted_genome_bam__aligned__aligned__TGCTAC

## Single cell signatures

In [18]:

# Source directories of the per-cell sketches inside the lemur kmermaid output.
peptide_sketch_dir = os.path.join(
    lemur_kmermaid_base,
    "sketches_peptide_handmade_merged_remove_ribosomal_dissociation_allcells",
)

nucleotide_sketch_dir = os.path.join(
    lemur_kmermaid_base,
    "sketches_nucleotide_handmade_merged_remove_ribosomal_dissociation_allcells",
)

# Copy the sketches for the cells in CELL_IDS_TO_USE into `test_1_single_cells`
# (dryrun=False, so files are actually written).
sig_reorg.copy_nucleotide_peptide_sketches(
    peptide_sketch_dir=peptide_sketch_dir,
    nucleotide_sketch_dir=nucleotide_sketch_dir,
    pre_sketch_id_outdir=test_1_single_cells,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False,
    # Remove all reference to "_POS" in the channel to match downstream work
    cell_id_fun=lambda x: x.replace('_POS', '')
)

Copying alphabet-DNA__ksize-21__scaled-10


5695it [03:30, 27.03it/s]  
20it [00:00, 196.09it/s]

Copying alphabet-protein__ksize-30__scaled-10


5695it [00:13, 422.00it/s]
0it [00:00, ?it/s]

Copying alphabet-dayhoff__ksize-51__scaled-10


5695it [00:13, 421.96it/s]


In [52]:
!ls -lha $test_1_single_cells

total 1.6M
drwxr-xr-x  5 olga czb 4.0K Apr 29 13:55 .
drwxr-xr-x 22 olga czb 4.0K Apr 21 09:49 ..
drwxr-xr-x  2 olga czb 512K Apr 29 13:57 alphabet-dayhoff__ksize-51__scaled-10
drwxr-xr-x  2 olga czb 512K Apr 29 13:51 alphabet-DNA__ksize-21__scaled-10
drwxr-xr-x  2 olga czb 512K Apr 29 13:55 alphabet-protein__ksize-30__scaled-10
-rw-r--r--  1 olga czb    0 Apr 16 18:10 sig_describe_commands.txt


In [53]:
# ! rm -rf $test_1_single_cells/alphabet*

In [57]:
! ls -lha $test_1_single_cells/alphabet-dayhoff__ksize-51__scaled-10 | head

total 913M
drwxr-xr-x 2 olga czb 512K Apr 29 13:57 .
drwxr-xr-x 5 olga czb 4.0K Apr 29 13:55 ..
-rw-r--r-- 1 olga czb 162K Apr 29 13:55 ANTOINE_LUNG_CD31__AAACCTGAGAAACGAG.sig
-rw-r--r-- 1 olga czb 179K Apr 29 13:55 ANTOINE_LUNG_CD31__AAACCTGAGGCAGTCA.sig
-rw-r--r-- 1 olga czb 216K Apr 29 13:57 ANTOINE_LUNG_CD31__AAACCTGAGGGTGTGT.sig
-rw-r--r-- 1 olga czb 216K Apr 29 13:56 ANTOINE_LUNG_CD31__AAACCTGGTATATGGA.sig
-rw-r--r-- 1 olga czb 219K Apr 29 13:56 ANTOINE_LUNG_CD31__AAACCTGTCCGTCATC.sig
-rw-r--r-- 1 olga czb  95K Apr 29 13:57 ANTOINE_LUNG_CD31__AAACCTGTCGAGAGCA.sig
-rw-r--r-- 1 olga czb 184K Apr 29 13:55 ANTOINE_LUNG_CD31__AAACCTGTCGGAGCAA.sig
ls: write error: Broken pipe


In [56]:
! ls -lha $test_1_single_cells/alphabet-DNA__ksize-21__scaled-10 | head

total 2.5G
drwxr-xr-x 2 olga czb  512K Apr 29 13:51 .
drwxr-xr-x 5 olga czb  4.0K Apr 29 13:55 ..
-rw-r--r-- 1 olga czb  579K Apr 29 13:49 ANTOINE_LUNG_CD31__AAACCTGAGAAACGAG.sig
-rw-r--r-- 1 olga czb  661K Apr 29 13:49 ANTOINE_LUNG_CD31__AAACCTGAGGCAGTCA.sig
-rw-r--r-- 1 olga czb  768K Apr 29 13:51 ANTOINE_LUNG_CD31__AAACCTGAGGGTGTGT.sig
-rw-r--r-- 1 olga czb  772K Apr 29 13:50 ANTOINE_LUNG_CD31__AAACCTGGTATATGGA.sig
-rw-r--r-- 1 olga czb  773K Apr 29 13:50 ANTOINE_LUNG_CD31__AAACCTGTCCGTCATC.sig
-rw-r--r-- 1 olga czb  352K Apr 29 13:51 ANTOINE_LUNG_CD31__AAACCTGTCGAGAGCA.sig
-rw-r--r-- 1 olga czb  656K Apr 29 13:50 ANTOINE_LUNG_CD31__AAACCTGTCGGAGCAA.sig
ls: write error: Broken pipe


In [58]:
adata_shared.obs.head()

Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P2_1__AAACCTGAGAAACCAT,P2_1_AAACCTGAGAAACCAT-human-hlm,46y,AAACCTGAGAAACCAT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AAATGCCAGATGAGAG,P2_1_AAATGCCAGATGAGAG-human-hlm,46y,AAATGCCAGATGAGAG,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGATCCCT,P2_1_AACACGTTCGATCCCT-human-hlm,46y,AACACGTTCGATCCCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGCACTCT,P2_1_AACACGTTCGCACTCT-human-hlm,46y,AACACGTTCGCACTCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACCATGCAGCTCGCA,P2_1_AACCATGCAGCTCGCA-human-hlm,46y,AACCATGCAGCTCGCA,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2


In [59]:
# Per-cell metadata for the mouse lemur cells only, then the number of cells
# per broad cell type.
lemur_obs = adata_shared.obs.query('species == "Mouse lemur"')
lemur_obs.broad_group.value_counts()

Macrophage                         6921
T cell                             4105
Monocyte                           2644
Capillary                          2136
Fibroblast                         1271
Alveolar Epithelial Type 2          804
Natural Killer T cell               446
B cell                              358
Dendritic                           316
Smooth Muscle and Myofibroblast      87
Name: broad_group, dtype: int64

In [60]:
lemur_obs.groupby(['broad_group', 'channel'], observed=True).size()

broad_group                      channel               
Macrophage                       Antoine_Lung_CD31_10X     3055
                                 Antoine_Lung_EPCAM_10X     911
                                 Antoine_Lung_P3_10X         20
                                 Bernard_Lung_10X          1198
                                 Martine_Lung_1_10X         224
                                 Martine_Lung_2_10X         205
                                 Stumpy_Lung_10X           1308
Alveolar Epithelial Type 2       Antoine_Lung_CD31_10X      329
                                 Antoine_Lung_EPCAM_10X     197
                                 Antoine_Lung_P3_10X         22
                                 Bernard_Lung_10X            89
                                 Stumpy_Lung_10X            167
T cell                           Antoine_Lung_CD31_10X      880
                                 Antoine_Lung_EPCAM_10X      13
                                 Antoine_Lung_P3

In [61]:
! ls -lha $peptide_sketch_dir/alphabet-protein__ksize-30__scaled-10/ANTOINE_LUNG_CD31_POS__AAAGTAGTCCGGGTGT* | head

zsh:1: no matches found: /home/olga/data_lg/data_sm_copy/tabula-microcebus/analyses/kmermaid/tenx-tgz--alldata-v2-from-bams--remove-ribo/sketches_peptide_handmade_merged_remove_ribosomal_dissociation_allcells/alphabet-protein__ksize-30__scaled-10/ANTOINE_LUNG_CD31_POS__AAAGTAGTCCGGGTGT*


In [62]:
lemur_obs.query('broad_group == "Smooth Muscle and Myofibroblast"')

Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ANTOINE_LUNG_CD31__AAAGTAGTCCGGGTGT,Antoine_Lung_CD31_10X_AAAGTAGTCCGGGTGT-lemur-hlm,12y,AAAGTAGTCCGGGTGT,stromal cell,CL:0000499,Antoine_Lung_CD31_10X,smooth muscle cells (airway) and myofibroblasts,Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Airw...,stromal: lemur,stromal: Airway Smooth Muscle and Myofibroblas...,lemur_Antoine
ANTOINE_LUNG_CD31__ACGATGTTCTGCGTAA,Antoine_Lung_CD31_10X_ACGATGTTCTGCGTAA-lemur-hlm,12y,ACGATGTTCTGCGTAA,smooth muscle cell,CL:0000192,Antoine_Lung_CD31_10X,smooth muscle cells (vascular),Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Vasc...,stromal: lemur,stromal: Vascular Smooth Muscle: lemur,lemur_Antoine
ANTOINE_LUNG_CD31__ACTTGTTCACAGAGGT,Antoine_Lung_CD31_10X_ACTTGTTCACAGAGGT-lemur-hlm,12y,ACTTGTTCACAGAGGT,stromal cell,CL:0000499,Antoine_Lung_CD31_10X,smooth muscle cells (airway) and myofibroblasts,Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Airw...,stromal: lemur,stromal: Airway Smooth Muscle and Myofibroblas...,lemur_Antoine
ANTOINE_LUNG_CD31__AGAATAGTCTGAGGGA,Antoine_Lung_CD31_10X_AGAATAGTCTGAGGGA-lemur-hlm,12y,AGAATAGTCTGAGGGA,smooth muscle cell,CL:0000192,Antoine_Lung_CD31_10X,smooth muscle cells (vascular),Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Vasc...,stromal: lemur,stromal: Vascular Smooth Muscle: lemur,lemur_Antoine
ANTOINE_LUNG_CD31__AGCGTCGTCAGGCAAG,Antoine_Lung_CD31_10X_AGCGTCGTCAGGCAAG-lemur-hlm,12y,AGCGTCGTCAGGCAAG,smooth muscle cell,CL:0000192,Antoine_Lung_CD31_10X,smooth muscle cells (vascular),Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Vasc...,stromal: lemur,stromal: Vascular Smooth Muscle: lemur,lemur_Antoine
ANTOINE_LUNG_CD31__ATCATGGTCCAGTAGT,Antoine_Lung_CD31_10X_ATCATGGTCCAGTAGT-lemur-hlm,12y,ATCATGGTCCAGTAGT,smooth muscle cell,CL:0000192,Antoine_Lung_CD31_10X,smooth muscle cells (vascular),Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Vasc...,stromal: lemur,stromal: Vascular Smooth Muscle: lemur,lemur_Antoine
ANTOINE_LUNG_CD31__ATGTGTGGTTTGTGTG,Antoine_Lung_CD31_10X_ATGTGTGGTTTGTGTG-lemur-hlm,12y,ATGTGTGGTTTGTGTG,stromal cell,CL:0000499,Antoine_Lung_CD31_10X,smooth muscle cells (airway) and myofibroblasts,Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Airw...,stromal: lemur,stromal: Airway Smooth Muscle and Myofibroblas...,lemur_Antoine
ANTOINE_LUNG_CD31__CAGCGACGTTCCTCCA,Antoine_Lung_CD31_10X_CAGCGACGTTCCTCCA-lemur-hlm,12y,CAGCGACGTTCCTCCA,smooth muscle cell,CL:0000192,Antoine_Lung_CD31_10X,smooth muscle cells (vascular),Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Vasc...,stromal: lemur,stromal: Vascular Smooth Muscle: lemur,lemur_Antoine
ANTOINE_LUNG_CD31__CATATTCCAGACAAAT,Antoine_Lung_CD31_10X_CATATTCCAGACAAAT-lemur-hlm,12y,CATATTCCAGACAAAT,stromal cell,CL:0000499,Antoine_Lung_CD31_10X,smooth muscle cells (airway) and myofibroblasts,Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Airw...,stromal: lemur,stromal: Airway Smooth Muscle and Myofibroblas...,lemur_Antoine
ANTOINE_LUNG_CD31__CATCAGAAGGGATCTG,Antoine_Lung_CD31_10X_CATCAGAAGGGATCTG-lemur-hlm,12y,CATCAGAAGGGATCTG,stromal cell,CL:0000499,Antoine_Lung_CD31_10X,smooth muscle cells (airway) and myofibroblasts,Antoine,,180917_A00111_0212_BHGMF5DMXX,...,ANTOINE_LUNG_CD31,,,,hlm,stromal: Smooth Muscle and Myofibroblast,stromal: Smooth Muscle and Myofibroblast: Airw...,stromal: lemur,stromal: Airway Smooth Muscle and Myofibroblas...,lemur_Antoine


In [63]:
peptide_sketch_dir

'/home/olga/data_lg/data_sm_copy/tabula-microcebus/analyses/kmermaid/tenx-tgz--alldata-v2-from-bams--remove-ribo/sketches_peptide_handmade_merged_remove_ribosomal_dissociation_allcells'

In [64]:
! ls -lha $peptide_sketch_dir/alphabet-protein__ksize-30__scaled-10/ | head

total 1.2G
drwxr-xr-x 2 olga czb 512K Apr 26 22:22 .
drwxr-xr-x 6 olga czb 4.0K Apr 26 16:52 ..
-rw-r--r-- 1 olga czb 207K Apr 26 18:12 ANTOINE_LUNG_CD31_POS__AAACCTGAGAAACGAG.sig
-rw-r--r-- 1 olga czb 235K Apr 26 18:30 ANTOINE_LUNG_CD31_POS__AAACCTGAGGCAGTCA.sig
-rw-r--r-- 1 olga czb 279K Apr 26 20:50 ANTOINE_LUNG_CD31_POS__AAACCTGAGGGTGTGT.sig
-rw-r--r-- 1 olga czb 278K Apr 26 20:03 ANTOINE_LUNG_CD31_POS__AAACCTGGTATATGGA.sig
-rw-r--r-- 1 olga czb 281K Apr 26 19:35 ANTOINE_LUNG_CD31_POS__AAACCTGTCCGTCATC.sig
-rw-r--r-- 1 olga czb 125K Apr 26 20:50 ANTOINE_LUNG_CD31_POS__AAACCTGTCGAGAGCA.sig
-rw-r--r-- 1 olga czb 238K Apr 26 18:41 ANTOINE_LUNG_CD31_POS__AAACCTGTCGGAGCAA.sig
ls: write error: Broken pipe


In [65]:
! ls -1 $peptide_sketch_dir/alphabet-protein__ksize-30__scaled-10/ | wc -l

5695


In [66]:
! ls -lha $nucleotide_sketch_dir/alphabet-DNA__ksize-21__scaled-10/ | head

total 3.1G
drwxr-xr-x 2 olga czb  512K Apr 29 13:55 .
drwxr-xr-x 3 olga czb  4.0K Apr 29 13:15 ..
-rw-r--r-- 1 olga czb  579K Apr 29 13:28 ANTOINE_LUNG_CD31_POS__AAACCTGAGAAACGAG.sig
-rw-r--r-- 1 olga czb  661K Apr 29 13:30 ANTOINE_LUNG_CD31_POS__AAACCTGAGGCAGTCA.sig
-rw-r--r-- 1 olga czb  768K Apr 29 13:45 ANTOINE_LUNG_CD31_POS__AAACCTGAGGGTGTGT.sig
-rw-r--r-- 1 olga czb  772K Apr 29 13:40 ANTOINE_LUNG_CD31_POS__AAACCTGGTATATGGA.sig
-rw-r--r-- 1 olga czb  773K Apr 29 13:37 ANTOINE_LUNG_CD31_POS__AAACCTGTCCGTCATC.sig
-rw-r--r-- 1 olga czb  352K Apr 29 13:45 ANTOINE_LUNG_CD31_POS__AAACCTGTCGAGAGCA.sig
-rw-r--r-- 1 olga czb  656K Apr 29 13:31 ANTOINE_LUNG_CD31_POS__AAACCTGTCGGAGCAA.sig
ls: write error: Broken pipe


In [67]:
# List which sketch-id subfolders exist under the nucleotide sketch directory
! ls -lha $nucleotide_sketch_dir

total 513K
drwxr-xr-x  3 olga czb 4.0K Apr 29 13:15 .
drwxr-xr-x 15 olga czb 4.0K Mar  1 15:38 ..
drwxr-xr-x  2 olga czb 512K Apr 29 13:55 alphabet-DNA__ksize-21__scaled-10


In [68]:
# Count the entries in the DNA sketch folder, for comparison with the protein sketch count
! ls -1 $nucleotide_sketch_dir/alphabet-DNA__ksize-21__scaled-10/ | wc -l

5695


In [69]:
# Distinct filename prefixes (the text before the first '__') among the DNA signature files
{
    os.path.basename(sig_path).split('__')[0]
    for sig_path in glob.iglob(
        os.path.join(nucleotide_sketch_dir, 'alphabet-DNA__ksize-21__scaled-10', '*.sig')
    )
}

{'ANTOINE_LUNG_CD31_POS', 'ANTOINE_LUNG_EPCAM_POS', 'ANTOINE_LUNG_P3'}

In [70]:
# CELL_IDS_TO_USE

In [71]:
# Print the first test signature whose path contains "P3", then stop searching
sig_pattern = os.path.join(test_1_single_cells, '*', '*.sig')
first_p3_sig = next((path for path in glob.iglob(sig_pattern) if "P3" in path), None)
if first_p3_sig is not None:
    print(first_p3_sig)

/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/1--single-cell-sigs/alphabet-dayhoff__ksize-51__scaled-10/ANTOINE_LUNG_P3__TATCAGGGTTCGTTGA.sig


# Create queries

In [72]:
moltypes = "DNA", "protein", "dayhoff"

# (alphabet, k-mer size) pairs; one command file is generated per pair
alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_4_celltype_sbts = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/4--celltype-sbt-dbts"
train_3_merged_celltype_remove_common = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers"
train_2_merged_celltype = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs"

# Try both with removed common hashes and original
celltype_dirs = dict(
    remove_common=train_3_merged_celltype_remove_common,
    raw_merged=train_2_merged_celltype,
)

# Values returned by make_sourmash_search_commands; they are used below as
# paths of the per-alphabet command files to concatenate
txts = []
for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
    this_celltype_dir_results = os.path.join(
        test_dir, f"3-{i}--mouse-search-results--{name}"
    )
    # exist_ok replaces the check-then-create pattern, so re-running the cell is safe
    os.makedirs(this_celltype_dir_results, exist_ok=True)
    for alpha, ksize in alpha_ksizes:
        txt = csc_utils.make_sourmash_search_commands(
            this_celltype_dir_results,
            query_sigs_dir=test_1_single_cells,
            sbt_base_dir=celltype_dir,
            k_sizes=[ksize],
            scaled_sizes=[
                10,
            ],
            cell_ids=[],
            sbt_template_basename=None,
            query_sig_files=False,
            containment=True,
            moltype=alpha,
            threshold=1e-100,
            n_jobs=96,
            force=True,
            num_results=None,
            add_ksize_to_sig_dir=False,
            traverse_directory=True,
        )
        txts.append(txt)

# Kept so the space-joined list of command files stays available in the notebook namespace
txts_to_cat = " ".join(txts)
catted_txt = os.path.join(results_dir, "sourmash_search_commands_concatenated.txt")

# Concatenate in Python instead of `! cat $txts_to_cat > $catted_txt`: the shell
# form splits paths on whitespace, reads stdin when the list is empty, and only
# prints (rather than raises) when an input file is missing.
with open(catted_txt, "wb") as catted_handle:
    for commands_path in txts:
        with open(commands_path, "rb") as commands_handle:
            catted_handle.write(commands_handle.read())

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")

NameError: name 'human_test_dir' is not defined

## Search mouse celltypes using k-mers shared by at least 5% or 10% of cells, with and without removing k-mers shared by 80% of celltypes

In [73]:
moltypes = "DNA", "protein", "dayhoff"

# (alphabet, k-mer size) pairs; one command file is generated per pair
alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_3_sigs_min_10_percent = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs--min-kmer-count--10-percent"
train_3_sigs_min_5_percent = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs--min-kmer-count--5-percent"
train_3_sigs_min_5_percent_remove_common = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--5-percent"
train_3_sigs_min_10_percent_remove_common = "/home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--10-percent"

# Try both with removed common hashes and original
celltype_dirs = {
    'min-5-percent-shared-kmers': train_3_sigs_min_5_percent,
    'min-10-percent-shared-kmers': train_3_sigs_min_10_percent,
    'min-5-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_5_percent_remove_common,
    'min-10-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_10_percent_remove_common,
}

# Values returned by make_sourmash_search_commands; they are used below as
# paths of the per-alphabet command files to concatenate
txts = []

# Generate commands twice per database: once with containment=True, once with containment=False
containments = True, False
for containment in containments:
    for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
        # The two containment settings reuse the same index i, so the folder
        # name itself has to distinguish them
        if containment:
            suffix = f"3-{i}--mouse-containment-search-results--{name}"
        else:
            suffix = f"3-{i}--mouse-search-results--{name}"
        this_celltype_dir_results = os.path.join(test_dir, suffix)
        # exist_ok replaces the check-then-create pattern, so re-running the cell is safe
        os.makedirs(this_celltype_dir_results, exist_ok=True)
        for alpha, ksize in alpha_ksizes:
            txt = csc_utils.make_sourmash_search_commands(
                this_celltype_dir_results,
                query_sigs_dir=test_1_single_cells,
                sbt_base_dir=celltype_dir,
                k_sizes=[ksize],
                scaled_sizes=[
                    10,
                ],
                sbt_template_basename=None,
                query_sig_files=False,
                containment=containment,
                moltype=alpha,
                threshold=1e-100,
                n_jobs=96,
                force=False,
                num_results=None,
                add_ksize_to_sig_dir=False,
                traverse_directory=True,
            )
            txts.append(txt)

# Kept so the space-joined list of command files stays available in the notebook namespace
txts_to_cat = " ".join(txts)
# NOTE(review): this is the same output filename the earlier "Create queries"
# cell writes, so running both cells overwrites the first result; confirm that is intended.
catted_txt = os.path.join(results_dir, "sourmash_search_commands_concatenated.txt")

# Concatenate in Python instead of `! cat $txts_to_cat > $catted_txt`: the shell
# form splits paths on whitespace, reads stdin when the list is empty, and only
# prints (rather than raises) when an input file is missing.
n_command_lines = 0
with open(catted_txt, "wb") as catted_handle:
    for commands_path in txts:
        with open(commands_path, "rb") as commands_handle:
            commands_bytes = commands_handle.read()
        catted_handle.write(commands_bytes)
        # Count newline characters, matching what `wc -l` reports
        n_command_lines += commands_bytes.count(b"\n")

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")
# Same "<line count> <path>" summary that `! wc -l $catted_txt` printed
print(f"{n_command_lines} {catted_txt}")

parallel --progress --eta --jobs 96 < /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_dayhoff.txt
parallel --progress --eta --jobs 96 < /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/3-1--mouse-containment-search-results--min-10-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_lg/data_sm_copy/immune-evolution/kmer-signatures/4--test-lemur/3-1--mouse

In [32]:
# NOTE(review): scratch cell whose execution count (In [32]) is out of order with the
# cells above; presumably a kernel liveness check — consider deleting it.
1+1

2