# Goal for how to organize the signatures

Copy all ribosomal-removed signatures to these subfolders


Using `DNA`, `protein`, and `dayhoff` as shorthand here. In practice, each of these directories is named with the full sketch id, e.g. `alphabet-DNA__ksize-21__scaled-10`.

```
mouse2mouse/
    self2self-bootstrapped/
        0--train-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        1--test-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...                
        2--train-mouse1-merged-celltype-sigs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        3--train-mouse1-celltype-remove-common-kmers/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        4--train-mouse1-celltype-sbt-dbs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        5--search-results/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
    mouse1-mouse2/
        0--train-mouse1-single-cells/
            DNA/
            protein/
            dayhoff/
        1--test-mouse2-single-cells/
            DNA/
            protein/
            dayhoff/
        2--train-mouse1-merged-celltype-sigs/
            DNA/
            protein/
            dayhoff/
        3--train-mouse1-celltype-remove-common-kmers/
            DNA/
            protein/
            dayhoff/
        4--train-mouse1-celltype-sbt-dbs/
            DNA/
            protein/
            dayhoff/
        5--search-results/
            DNA/
            protein/
            dayhoff/
train-mouse/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--merged-celltype-sigs/
        DNA/
        protein/
        dayhoff/
    3--celltype-remove-common-kmers/
        DNA/
        protein/
        dayhoff/
    4--celltype-sbt-dbs/
        DNA/
        protein/
        dayhoff/
test-human/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-lemur/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-bat/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
```

# Imports

In [1]:
import glob
import os
import shutil

import pandas as pd
import scanpy as sc
import sig_utils
from joblib import Parallel, delayed
from tqdm import tqdm

import create_sourmash_command_utils as csc_utils

import sig_reorg

pd.options.display.max_rows = 500

%load_ext autoreload
%autoreload 2

## Def Describe

In [2]:
%%file pandas_utils.py

def describe(df, random=False):
    """Print the shape of ``df`` and display its first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarize.
    random : bool, default False
        When true, additionally display a random subset of at most 5 rows.

    Notes
    -----
    ``display`` is not imported in this module; it is looked up at call time,
    so this helper presumably relies on the ``display`` builtin that an
    IPython/Jupyter session provides -- confirm before using it elsewhere.
    """
    print(df.shape)
    print("--- First 5 entries ---")
    display(df.head())
    if random:
        print('--- Random subset ---')
        # DataFrame.sample raises ValueError when asked for more rows than the
        # frame holds (sampling without replacement), so cap the sample size.
        display(df.sample(min(5, len(df))))

Overwriting pandas_utils.py


In [3]:
from pandas_utils import describe

## Read one2one h5ad

In [4]:
# Load the combined human / lemur / mouse / bat lung-only AnnData object.
adata = sc.read(
    # Alternate location of a copy of the same file, kept for reference:
#     "/home/phoenix/data_sm/code/immune-evolution/h5ads/human-lemur-mouse-bat__lung_only_copy.h5ad"
    "/home/olga/data_sm/immune-evolution/h5ads/human-lemur-mouse-bat/human-lemur-mouse-bat__lung_only.h5ad"
)
# Re-key the per-cell metadata by `cell_id`; reset_index() keeps the previous
# index as a regular column (it shows up as 'index' in the obs listing below).
adata.obs = adata.obs.reset_index().set_index('cell_id')
print(adata)
adata.obs.head()

AnnData object with n_obs × n_vars = 126745 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'


Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P2_1__AAACCTGAGAAACCAT,P2_1_AAACCTGAGAAACCAT-human-hlm,46y,AAACCTGAGAAACCAT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AAATGCCAGATGAGAG,P2_1_AAATGCCAGATGAGAG-human-hlm,46y,AAATGCCAGATGAGAG,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGATCCCT,P2_1_AACACGTTCGATCCCT-human-hlm,46y,AACACGTTCGATCCCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGCACTCT,P2_1_AACACGTTCGCACTCT-human-hlm,46y,AACACGTTCGCACTCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACCATGCAGCTCGCA,P2_1_AACCATGCAGCTCGCA-human-hlm,46y,AACCATGCAGCTCGCA,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2


In [5]:
adata.obs.species.value_counts()

Human          57353
Mouse lemur    36934
Mouse          23802
Bat             8656
Name: species, dtype: int64

## Set shared celltypes

In [6]:
# Cell type labels to keep; they are matched against `adata.obs.broad_group`
# when the dataset is subset in the next cell.
SHARED_CELLTYPES = [
    "Alveolar Epithelial Type 2",
    "B cell",
    "Capillary",
    "Dendritic",
    "Fibroblast",
    "Macrophage",
    "Monocyte",
    "Natural Killer T cell",
    "Smooth Muscle and Myofibroblast",
    "T cell",
]
len(SHARED_CELLTYPES)

10

In [7]:
# Keep only the cells whose `broad_group` label is listed in SHARED_CELLTYPES.
adata_shared = adata[adata.obs["broad_group"].isin(SHARED_CELLTYPES)]
adata_shared

  if not is_categorical(df_full[k]):


View of AnnData object with n_obs × n_vars = 89795 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'

In [8]:
# Set of obs index values (the `cell_id`s) of the retained cells; passed as
# `select_cell_ids` to the sig_reorg copy helpers further down.
CELL_IDS_TO_USE = set(adata_shared.obs.index)
len(CELL_IDS_TO_USE)

89795

In [45]:
import sig_reorg

# Bat

## Single cell fastas

In [11]:
# NOTE(review): `sig_outdir_base` is assigned in the cell that follows this
# one, so on a fresh kernel this line runs before the variable exists.
! ls -lha $sig_outdir_base

total 11K
drwxr-xr-x  6 olga czb 4.0K Feb 28 19:43 .
drwxrwxr-x 81 olga czb 8.0K Feb 28 09:02 ..
drwxr-xr-x  4 olga czb 4.0K Feb 28 09:53 0--mouse2mouse
drwxr-xr-x  7 olga czb 4.0K Feb 28 16:57 1--train-mouse
drwxr-xr-x  7 olga czb 4.0K Feb 28 17:17 2--test-human
drwxr-xr-x  5 olga czb 4.0K Feb 28 19:43 3--test-bat


In [12]:
# Root of the kmermaid pipeline results for bat.
bat_kmermaid_base = '/home/olga/data_sm/immune-evolution/pipeline-results/bat/kmermaid/ksize_8'


# Root of the reorganized k-mer signature tree.
sig_outdir_base = "/home/olga/data_sm/immune-evolution/kmer-signatures"

test_dir = os.path.join(sig_outdir_base, "3--test-bat")

test_0_fastas = os.path.join(test_dir, "0--protein-coding-fastas")
test_1_single_cells = os.path.join(test_dir, "1--single-cell-sigs")
# Source directory the fastas are copied from.
fasta_input_dir = os.path.join(bat_kmermaid_base, "translate")

results_dir = os.path.join(test_dir, "2--mouse-search-results")
dirs = (test_0_fastas, test_1_single_cells, results_dir)

# exist_ok=True makes the cell safely re-runnable without a separate
# exists() check, which could race with another process creating the folder.
for outdir in dirs:
    os.makedirs(outdir, exist_ok=True)


## Copy fastas

In [35]:
def clean_cell_id(x):
    """Reduce a file-derived identifier string to a bare cell id.

    The following edits are applied in order:

    1. remove ``_possorted_genome_bam``
    2. remove ``__aligned__aligned``, then collapse ``__aligned__`` to ``__``
    3. remove ``__unaligned__unaligned``, then collapse ``__unaligned__`` to ``__``
    4. drop everything from the first ``__coding`` onwards

    Parameters
    ----------
    x : str
        Raw identifier.

    Returns
    -------
    str
        The cleaned identifier.
    """
    # The method chain must be wrapped in parentheses: continuation lines that
    # begin with ``.replace`` are otherwise a syntax error.
    cleaned = (
        x.replace("_possorted_genome_bam", "")
        .replace("__aligned__aligned", "")
        .replace("__aligned__", "__")
        .replace("__unaligned__unaligned", "")
        .replace("__unaligned__", "__")
        .split("__coding")[0]
    )
    return cleaned

In [41]:
# Copy the fastas from `fasta_input_dir` into `test_0_fastas`, passing the
# selected cell ids and `clean_cell_id` as the id-cleaning callback.
# Slow step: the recorded output below shows roughly 25-30 minutes each for
# the peptide and nucleotide passes.
sig_reorg.copy_fastas(
    test_0_fastas,
    fasta_input_dir,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False,
    cell_id_fun=clean_cell_id
)

0it [00:00, ?it/s]

Copying peptides


20188it [24:36, 13.67it/s]
0it [00:00, ?it/s]

Copying nucleotides


20188it [29:38, 11.35it/s]


In [31]:
test_0_fastas

'/home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/0--protein-coding-fastas'

In [33]:
ls -lha $test_0_fastas/peptides | head

total 1.0K
drwxr-xr-x 2 olga czb 4.0K Mar  5 16:29 [0m[01;34m.[0m/
drwxr-xr-x 4 olga czb 4.0K Mar  5 16:29 [01;34m..[0m/


## Single cell signatures

In [58]:
ls $bat_kmermaid_base

[0m[34;42mfastp[0m/
[34;42mpipeline_info[0m/
[34;42mprotein_index[0m/
[34;42mrun-1[0m/
[34;42msbt_indices_handmade_mouse[0m/
[01;34msbt_indices_handmade_mouse_remove_ribosomal[0m/
[01;34msbt_indices_handmade_mouse__remove_ribosomal_dissociation[0m/
[01;34msbt_indices_handmade_mouse__remove_ribosomal_dissociation__all_mice[0m/
[01;34msbt_indices_handmade_mouse__remove_ribosomal_dissociation__all_mice__per_celltype[0m[K/
[01;34msbt_indices_handmade_mouse__remove_ribosomal_dissociation__all_mice__per_celltype__no_common_hashes[0m[K/
[01;34msbt_indices_handmade_mouse__remove_ribosomal_dissociation__all_mice__per_celltype__no_common_hashes__refseq_removed_ribo[0m[K/
[34;42msearch_bat_in_mouse[0m/
[01;34msearch_bat_in_mouse__kmer_feature_selection[0m/
[01;34msearch_bat_in_mouse__kmer_feature_selection__containment[0m/
[01;34msearch_bat_in_mouse_no_ribosome_no_containment_no_dissociation__per_celltype_remove_common_hashes[0m[K/
[01;34msearch_bat_in_mouse_no

In [63]:

# Source directories of the per-cell peptide and nucleotide sketches, both
# located directly under the bat kmermaid results root.
peptide_sketch_dir = os.path.join(bat_kmermaid_base, "sketches_peptide_handmade_merged_remove_ribosomal_dissociation_allcells")
nucleotide_sketch_dir = os.path.join(bat_kmermaid_base, "sketches_nucleotide_handmade_merged_remove_ribosomal_dissociation_allcells")



In [64]:
# Copy the single-cell sketches into `test_1_single_cells`. The recorded
# output below lists three sketch ids being copied:
# alphabet-DNA__ksize-21__scaled-10, alphabet-protein__ksize-30__scaled-10 and
# alphabet-dayhoff__ksize-51__scaled-10.
sig_reorg.copy_nucleotide_peptide_sketches(
    peptide_sketch_dir=peptide_sketch_dir,
    nucleotide_sketch_dir=nucleotide_sketch_dir,
    pre_sketch_id_outdir=test_1_single_cells,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False
)

1it [00:00,  5.81it/s]

Copying alphabet-DNA__ksize-21__scaled-10


7776it [04:51, 26.71it/s]
1it [00:00,  6.78it/s]

Copying alphabet-protein__ksize-30__scaled-10


8656it [00:03, 2422.93it/s]
23it [00:00, 229.65it/s]

Copying alphabet-dayhoff__ksize-51__scaled-10


8574it [00:03, 2443.47it/s]


# Create queries

In [45]:
# NOTE(review): `human_test_dir` is not assigned in any cell visible in this
# notebook; this looks like leftover kernel state -- confirm or remove.
human_test_dir

'/home/olga/data_sm/immune-evolution/kmer-signatures/2--test-human'

In [26]:
moltypes = "DNA", "protein", "dayhoff"

alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_4_celltype_sbts = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/4--celltype-sbt-dbts"
train_3_merged_celltype_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers"
train_2_merged_celltype = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs"

# Try both with removed common hashes and original
celltype_dirs = dict(
    remove_common=train_3_merged_celltype_remove_common,
    raw_merged=train_2_merged_celltype,
)

txts = []
for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
    this_celltype_dir_results = os.path.join(
        test_dir, f"3-{i}--mouse-search-results--{name}"
    )
    if not os.path.exists(this_celltype_dir_results):
        os.makedirs(this_celltype_dir_results)
    for alpha, ksize in alpha_ksizes:
        txt = csc_utils.make_sourmash_search_commands(
            this_celltype_dir_results,
            query_sigs_dir=test_1_single_cells,
            sbt_base_dir=celltype_dir,
            k_sizes=[ksize],
            scaled_sizes=[
                10,
            ],
            cell_ids=[],
            sbt_template_basename=None,
            query_sig_files=False,
            containment=False,
            moltype=alpha,
            threshold=1e-100,
            n_jobs=96,
            force=True,
            num_results=None,
            add_ksize_to_sig_dir=False,
            traverse_directory=True,
        )
        txts.append(txt)
txts_to_cat = " ".join(txts)
catted_txt = os.path.join(results_dir, "sourmash_search_commands_concatenated.txt")
! cat $txts_to_cat > $catted_txt

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")

parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-search-results--remove_common/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-search-results--remove_common/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-search-results--remove_common/sourmash_search_commands_dayhoff.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-1--mouse-search-results--raw_merged/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-1--mouse-search-results--raw_merged/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-1--mouse-s

In [24]:
! wc -l $catted_txt

30958 /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/2--mouse-search-results/sourmash_search_commands_concatenated.txt


In [25]:
adata.obs.loc['bat3-LU__CAGCCGATCCCTGACT', 'broad_group']

'Capillary'

In [22]:
! head -n 3 $catted_txt

sourmash search --quiet --protein --no-dna  --threshold 1e-100 -k 30 --traverse-directory --containment --output /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-search-results--remove_common/alphabet-protein__ksize-30__scaled-10/bat3-LU__CAGCCGATCCCTGACT.csv /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/1--single-cell-sigs/alphabet-protein__ksize-30__scaled-10/bat3-LU__CAGCCGATCCCTGACT.sig /home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers
sourmash search --quiet --protein --no-dna  --threshold 1e-100 -k 30 --traverse-directory --containment --output /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-search-results--remove_common/alphabet-protein__ksize-30__scaled-10/bat3-LU__ACCGTAAGTTCGCTAA.csv /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/1--single-cell-sigs/alphabet-protein__ksize-30__scaled-10/bat3-LU__ACCGTAAGTTCGCTAA.sig /home/olga/data_sm

In [28]:
1+1

2

In [29]:
! wc -l $catted_txt

30958 /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/2--mouse-search-results/sourmash_search_commands_concatenated.txt


In [39]:
! shuf $catted_txt | head -n 3

sourmash search --quiet --dna --no-protein --no-dayhoff --num-results 3 --threshold 1e-10 -k 21 --containment --output /mnt/ibm_sm/olga/immune-evolution/kmer-signatures/2--test-human/2--mouse-search-results/alphabet-DNA__ksize-21__scaled-10/P3_6__TTCGGTCCAAACCTAC.csv /mnt/ibm_sm/olga/immune-evolution/kmer-signatures/2--test-human/1--single-cell-sigs/alphabet-DNA__ksize-21__scaled-10/P3_6__TTCGGTCCAAACCTAC.sig /home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/4--celltype-sbt-dbts/alphabet-DNA__ksize-21__scaled-10.sbt.zip
sourmash search --quiet --protein --no-dna --num-results 3 --threshold 1e-10 -k 30 --containment --output /mnt/ibm_sm/olga/immune-evolution/kmer-signatures/2--test-human/2--mouse-search-results/alphabet-protein__ksize-30__scaled-10/P3_7__GTAACTGGTTTAGCTG.csv /mnt/ibm_sm/olga/immune-evolution/kmer-signatures/2--test-human/1--single-cell-sigs/alphabet-protein__ksize-30__scaled-10/P3_7__GTAACTGGTTTAGCTG.sig /home/olga/data_sm/immune-evolution/kmer-signatur

## Search mouse celltypes with k-mers shared by at least 5% or 10% of cells, with and without removing k-mers shared by 80% of celltypes

In [43]:
1+1

2

In [65]:
moltypes = "DNA", "protein", "dayhoff"

alpha_ksizes = (("DNA", 21), ("protein", 30), ("dayhoff", 51))

# mouse celltype database directory
train_3_sigs_min_10_percent = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs--min-kmer-count--10-percent"
train_3_sigs_min_5_percent = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs--min-kmer-count--5-percent"
train_3_sigs_min_5_percent_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--5-percent"
train_3_sigs_min_10_percent_remove_common = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers--min-kmer-count--10-percent"

# Try both with removed common hashes and original
celltype_dirs = {
        'min-5-percent-shared-kmers': train_3_sigs_min_5_percent,
    'min-10-percent-shared-kmers': train_3_sigs_min_10_percent,
    'min-5-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_5_percent_remove_common,
    'min-10-percent-shared-kmers-remove-common-kmers': train_3_sigs_min_10_percent_remove_common,
}

txts = []

containments = True, False
for containment in containments:
    for i, (name, celltype_dir) in enumerate(celltype_dirs.items()):
        suffix = f"3-{i}--mouse-containment-search-results--{name}" if containment else f"3-{i}--mouse-search-results--{name}"
        this_celltype_dir_results = os.path.join(
            test_dir, suffix
        )
        if not os.path.exists(this_celltype_dir_results):
            os.makedirs(this_celltype_dir_results)
        for alpha, ksize in alpha_ksizes:
            txt = csc_utils.make_sourmash_search_commands(
                this_celltype_dir_results,
                query_sigs_dir=test_1_single_cells,
                sbt_base_dir=celltype_dir,
                k_sizes=[ksize],
                scaled_sizes=[
                    10,
                ],
                sbt_template_basename=None,
                query_sig_files=False,
                containment=containment,
                moltype=alpha,
                threshold=1e-100,
                n_jobs=96,
                force=False,
                num_results=None,
                add_ksize_to_sig_dir=False,
                traverse_directory=True,
            )
            txts.append(txt)
txts_to_cat = " ".join(txts)
catted_txt = os.path.join(results_dir, "sourmash_search_commands_concatenated.txt")
! cat $txts_to_cat > $catted_txt

print(f"---\nparallel --progress --eta --jobs 96 < {catted_txt}")

parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_protein.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-0--mouse-containment-search-results--min-5-percent-shared-kmers/sourmash_search_commands_dayhoff.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-1--mouse-containment-search-results--min-10-percent-shared-kmers/sourmash_search_commands_DNA.txt
parallel --progress --eta --jobs 96 < /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-1--mouse-containment-search-results--min-10-percent-shared-kmers/sourmash_search_co

In [66]:
!wc -l $catted_txt

62208 /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/2--mouse-search-results/sourmash_search_commands_concatenated.txt


In [62]:
! head $catted_txt

In [57]:
! ls $train_3_sigs_min_5_percent_remove_common/alphabet-DNA__ksize-21__scaled-10

alveolar_epithelial_type_2.sig	macrophage.sig
b_cell.sig			monocyte.sig
capillary.sig			natural_killer_t_cell.sig
dendritic.sig			smooth_muscle_and_myofibroblast.sig
fibroblast.sig			t_cell.sig


In [48]:
ll /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-3--mouse-search-results--min-10-percent-shared-kmers-remove-common-kmers/

total 9977
drwxr-xr-x 2 olga  524288 Mar 29 08:17 [0m[01;34malphabet-dayhoff__ksize-51__scaled-10[0m/
drwxr-xr-x 2 olga    4096 Mar 29 07:03 [01;34malphabet-DNA__ksize-21__scaled-10[0m/
drwxr-xr-x 2 olga  524288 Mar 29 08:15 [01;34malphabet-protein__ksize-30__scaled-10[0m/
-rw-r--r-- 1 olga 4560176 Mar 29 07:04 sourmash_search_commands_dayhoff.txt
-rw-r--r-- 1 olga       0 Mar 29 07:04 sourmash_search_commands_DNA.txt
-rw-r--r-- 1 olga 4603392 Mar 29 07:04 sourmash_search_commands_protein.txt


In [54]:
!ls /home/olga/data_sm/immune-evolution/kmer-signatures/3--test-bat/3-3--mouse-search-results--min-10-percent-shared-kmers-remove-common-kmers/alphabet-DNA__ksize-21__scaled-10/