# Goal for how to organize the signatures

Copy all ribosomal-removed signatures to these subfolders


The tree below uses `DNA`, `protein`, and `dayhoff` as shorthand. The actual folders are named with the full sketch id, e.g. `alphabet-DNA__ksize-21__scaled-10`.

```
mouse2mouse/
    self2self-bootstrapped/
        0--train-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        1--test-mouse1-single-cells/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...                
        2--train-mouse1-merged-celltype-sigs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        3--train-mouse1-celltype-remove-common-kmers/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        4--train-mouse1-celltype-sbt-dbs/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
        5--search-results/
            iteration-00/
                DNA/
                protein/
                dayhoff/
            iteration-01/
                ...
    mouse1-mouse2/
        0--train-mouse1-single-cells/
            DNA/
            protein/
            dayhoff/
        1--test-mouse2-single-cells/
            DNA/
            protein/
            dayhoff/
        2--train-mouse1-merged-celltype-sigs/
            DNA/
            protein/
            dayhoff/
        3--train-mouse1-celltype-remove-common-kmers/
            DNA/
            protein/
            dayhoff/
        4--train-mouse1-celltype-sbt-dbs/
            DNA/
            protein/
            dayhoff/
        5--search-results/
            DNA/
            protein/
            dayhoff/
train-mouse/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--merged-celltype-sigs/
        DNA/
        protein/
        dayhoff/
    3--celltype-remove-common-kmers/
        DNA/
        protein/
        dayhoff/
    4--celltype-sbt-dbs/
        DNA/
        protein/
        dayhoff/
test-human/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        DNA/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-lemur/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
test-bat/
    0--protein-coding-fastas/
        nucleotide/
        peptide/
    1--single-cell-sigs/
        protein/
        dayhoff/
    2--mouse-search-results/
        DNA/
        protein/
        dayhoff/
```

# Imports

In [119]:
import glob
import os
import re
import shutil

import pandas as pd
import scanpy as sc
import sig_utils
from joblib import Parallel, delayed
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from tqdm import tqdm

pd.options.display.max_rows = 500

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Define `describe` helper

In [16]:
%%file pandas_utils.py

def describe(df, random=False):
    """Print the shape of ``df`` and show its first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarize.
    random : bool, default False
        When true, additionally show a random subset of up to 5 rows.

    Returns
    -------
    None
    """
    # This function is written to a plain module (pandas_utils.py), which never
    # imports `display`. Inside IPython/Jupyter the name resolves as a builtin;
    # look it up there explicitly and fall back to `print` so the module also
    # works when imported from a regular Python process.
    import builtins

    show = getattr(builtins, "display", print)

    print(df.shape)
    print("--- First 5 entries ---")
    show(df.head())
    if random:
        print('--- Random subset ---')
        # `df.sample(5)` raises ValueError when the frame has fewer than 5 rows,
        # so cap the sample size at the number of rows available.
        show(df.sample(min(5, len(df))))

Overwriting pandas_utils.py


In [17]:
from pandas_utils import describe

## Read one2one h5ad

In [18]:
# Load the combined AnnData object. The path is absolute and machine-specific;
# the commented-out line is an alternative location kept for reference.
adata = sc.read(
#     "/home/phoenix/data_sm/code/immune-evolution/h5ads/human-lemur-mouse-bat__lung_only_copy.h5ad"
    "/home/olga/data_sm/immune-evolution/h5ads/human-lemur-mouse-bat/human-lemur-mouse-bat__lung_only.h5ad"
)
# Re-key the per-cell metadata by `cell_id`. `reset_index()` keeps the previous
# index as a regular column before `cell_id` becomes the new index.
adata.obs = adata.obs.reset_index().set_index('cell_id')
print(adata)
adata.obs.head()

AnnData object with n_obs × n_vars = 126745 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'


Unnamed: 0_level_0,index,age,cell_barcode,cell_ontology_class,cell_ontology_id,channel,free_annotation,individual,sample,sequencing_run,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P2_1__AAACCTGAGAAACCAT,P2_1_AAACCTGAGAAACCAT-human-hlm,46y,AAACCTGAGAAACCAT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AAATGCCAGATGAGAG,P2_1_AAATGCCAGATGAGAG-human-hlm,46y,AAATGCCAGATGAGAG,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGATCCCT,P2_1_AACACGTTCGATCCCT-human-hlm,46y,AACACGTTCGATCCCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACACGTTCGCACTCT,P2_1_AACACGTTCGCACTCT-human-hlm,46y,AACACGTTCGCACTCT,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2
P2_1__AACCATGCAGCTCGCA,P2_1_AACCATGCAGCTCGCA-human-hlm,46y,AACCATGCAGCTCGCA,capillary endothelial cell,,P2_1,Capillary Aerocyte,patient_2,distal 2,180607_A00111_0155_BHFCWYDMXX,...,P2_1,,,,hlm,endothelial: Capillary,endothelial: Capillary: Capillary Aerocyte,endothelial: human,endothelial: Capillary Aerocyte: human,human_patient_2


In [19]:
adata.obs.species.value_counts()

Human          57353
Mouse lemur    36934
Mouse          23802
Bat             8656
Name: species, dtype: int64

In [20]:
adata.obs.query('species == "Mouse"').groupby(["age", "channel"], observed=True).size()

age  channel           
18m  MACA_18m_F_LUNG_50     320
     MACA_18m_F_LUNG_51    1138
     MACA_18m_M_LUNG_52    1573
     MACA_18m_M_LUNG_53    1256
21m  MACA_21m_F_LUNG_54    1312
     MACA_21m_F_LUNG_55    1526
30m  10X_P1_6              7728
     10X_P1_14             1565
     10X_P2_5               920
     10X_P3_0               496
1m   10X_P5_13             1043
     10X_P5_14             1384
3m   10X_P7_8               348
     10X_P7_9               979
     10X_P8_12              578
     10X_P8_13             1636
dtype: int64

## Set shared celltypes

In [21]:
# Cell type labels treated as shared; compared against `broad_group` when
# subsetting the cells.
SHARED_CELLTYPES = [
    "Alveolar Epithelial Type 2",
    "B cell",
    "Capillary",
    "Dendritic",
    "Fibroblast",
    "Macrophage",
    "Monocyte",
    "Natural Killer T cell",
    "Smooth Muscle and Myofibroblast",
    "T cell",
]
len(SHARED_CELLTYPES)

10

In [22]:
# Keep only the cells whose `broad_group` is one of SHARED_CELLTYPES.
# NOTE: the recorded repr reports a "View of AnnData", i.e. this is not an
# independent copy of `adata`.
adata_shared = adata[adata.obs.broad_group.isin(SHARED_CELLTYPES)]
adata_shared

  if not is_categorical(df_full[k]):


View of AnnData object with n_obs × n_vars = 89795 × 10560
    obs: 'index', 'age', 'cell_barcode', 'cell_ontology_class', 'cell_ontology_id', 'channel', 'free_annotation', 'individual', 'sample', 'sequencing_run', 'sex', 'species', 'species_batch', 'species_latin', 'tissue', 'narrow_group', 'broad_group', 'compartment_group', 'compartment_narrow', 'channel_cleaned', 'batch', 'n_genes', 'n_counts', 'species_batch_v2', 'compartment_broad', 'compartment_broad_narrow', 'compartment_species', 'compartment_narrow_species', 'common_individual_id'
    var: 'bat__gene_name', 'mouse_lemur__gene_name-bat', 'mouse__gene_name-bat', 'mouse_lemur__gene_name_x-hlm', 'mouse__gene_name_x-hlm', 'gene_ids-lemur-hlm', 'n_cells-mouse-hlm', 'mouse_lemur__gene_name_y-hlm', 'mouse__gene_name_y-hlm'

In [23]:
# Set of cell ids (the `obs` index of the shared-celltype subset). It is passed
# as `select_cell_ids` to the `sig_reorg` copy helpers further down.
CELL_IDS_TO_USE = set(adata_shared.obs.index)
len(CELL_IDS_TO_USE)

89795

In [24]:
# adata_shared.obs.query('species == "Mouse"').groupby(
#     ["channel", "broad_group"], observed=True
# ).size()

In [25]:
adata_shared.obs.query('species == "Mouse"').groupby(
    ['age', "channel"],
    observed=True,
).broad_group.nunique()

age  channel           
18m  MACA_18m_F_LUNG_50     9
     MACA_18m_F_LUNG_51    10
     MACA_18m_M_LUNG_52    10
     MACA_18m_M_LUNG_53    10
21m  MACA_21m_F_LUNG_54    10
     MACA_21m_F_LUNG_55    10
30m  10X_P1_6               8
     10X_P1_14              9
     10X_P2_5               9
     10X_P3_0               9
1m   10X_P5_13             10
     10X_P5_14              9
3m   10X_P7_8              10
     10X_P7_9              10
     10X_P8_12              8
     10X_P8_13              8
Name: broad_group, dtype: int64

## Set signature outdir

In [26]:
sig_outdir_base = '/mnt/ibm_sm/olga/immune-evolution/kmer-signatures'

In [105]:
ls -lha $sig_outdir_base

total 11K
drwxr-xr-x 7 olga czb 4.0K Mar  5 12:02 [0m[01;34m.[0m/
drwxrwxr-x 8 olga czb 8.0K Mar 16 10:36 [01;34m..[0m/
drwxr-xr-x 4 olga czb 4.0K Feb 28 09:53 [01;34m0--mouse2mouse[0m/
drwxr-xr-x 7 olga czb 4.0K Feb 28 16:57 [01;34m1--train-mouse[0m/
drwxr-xr-x 8 olga czb 4.0K Mar  3 14:50 [01;34m2--test-human[0m/
drwxr-xr-x 9 olga czb 4.0K Mar  5 17:42 [01;34m3--test-bat[0m/
drwxr-xr-x 8 olga czb 4.0K Mar  5 12:06 [01;34m4--test-lemur[0m/


In [28]:
rm -rf $sig_outdir_base/mouse/single-cell

## Sketch ids of interest

In [29]:
# (alphabet, k-mer size) pairs; each is turned into a sketch id with a fixed
# `scaled` value of 10.
alpha_ksizes = (
    ("DNA", 21),
    ("protein", 30),
    ("dayhoff", 51),
)
sketch_ids = [
    sig_utils.make_sketch_id(alpha=alpha, ksize=ksize, style="scaled", value=10)
    for alpha, ksize in alpha_ksizes
]
sketch_ids

['alphabet-DNA__ksize-21__scaled-10',
 'alphabet-protein__ksize-30__scaled-10',
 'alphabet-dayhoff__ksize-51__scaled-10']

In [30]:
# First entry of `sketch_ids` only. This relies on the DNA sketch id being
# listed first in `alpha_ksizes`.
nucleotide_sketch_ids = sketch_ids[:1]
nucleotide_sketch_ids

['alphabet-DNA__ksize-21__scaled-10']

In [31]:
# Everything after the first entry of `sketch_ids`. This relies on the DNA
# sketch id being listed first in `alpha_ksizes`.
peptide_sketch_ids = sketch_ids[1:]
peptide_sketch_ids

['alphabet-protein__ksize-30__scaled-10',
 'alphabet-dayhoff__ksize-51__scaled-10']

In [45]:
import sig_reorg

## Set output folder names

In [99]:
# Root folder of the mouse training data.
# NOTE(review): this is an absolute /home/olga/data_sm path, while
# `sig_outdir_base` points at /mnt/ibm_sm/olga — presumably the same storage
# reached through a symlink; confirm.
train_dir = "/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse"

# --- Training data directories --- #
# NOTE(review): the folder plan at the top of the notebook calls this stage
# "0--protein-coding-fastas"; the literal is left unchanged so existing paths
# do not move.
train_0_fastas = os.path.join(train_dir, "0--fastas")
train_1_single_cells = os.path.join(
    train_dir, "1--single-cell-sigs"
)
train_2_merged_celltype = os.path.join(
    train_dir, "2--merged-celltype-sigs"
)
train_3_merged_celltype_remove_common = os.path.join(
    train_dir, "3--merged-celltype-remove-common-kmers"
)
# NOTE(review): "dbts" looks like a typo for "dbs" (the folder plan uses
# "4--celltype-sbt-dbs"). A directory with this exact name already exists on
# disk, so the literal is kept; rename both together if desired.
train_4_celltype_sbts = os.path.join(
    train_dir, "4--celltype-sbt-dbts"
)

dirs = (
    train_dir,
    train_0_fastas,
    train_1_single_cells,
    train_2_merged_celltype,
    train_3_merged_celltype_remove_common,
    train_4_celltype_sbts,
)
for outdir in dirs:
    # `exist_ok=True` replaces the separate existence check: no window between
    # checking and creating, and the cell stays safe to re-run.
    os.makedirs(outdir, exist_ok=True)

# Mouse

## Single cell fastas

In [47]:
! ls -lha $sig_outdir_base

total 11K
drwxr-xr-x  6 olga czb 4.0K Feb 28 09:33 .
drwxrwxr-x 81 olga czb 8.0K Feb 28 09:02 ..
drwxr-xr-x  3 olga czb 4.0K Feb 28 09:33 0--mouse2mouse
drwxr-xr-x  3 olga czb 4.0K Feb 28 09:29 1--train-mouse
drwxr-xr-x  2 olga czb 4.0K Feb 28 09:17 mouse
drwxr-xr-x  3 olga czb 4.0K Feb 27 09:59 mouse2mouse


In [54]:
# Mouse training folder, built from `sig_outdir_base`.
# NOTE(review): `train_dir` (defined in the "Set output folder names" cell)
# names the same "1--train-mouse" folder but from a different absolute root —
# confirm both resolve to the same location.
mouse_train_dir = os.path.join(sig_outdir_base, "1--train-mouse")

# Base folder of the mouse kmermaid pipeline results.
mouse_kmermaid_base = '/home/olga/data_sm/immune-evolution/pipeline-results/mouse/kmermaid/lung--mouse--remove-ribo/'

# Base folder of the bat kmermaid pipeline results. Not used in this cell; the
# later cell that builds `mouse_peptide_sketch_dir` reads it.
bat_kmermaid_base = '/home/olga/data_sm/immune-evolution/pipeline-results/bat/kmermaid/ksize_8'

# Destination for the copied fastas and the pipeline folder they are read from.
mouse_train_0_fastas = os.path.join(mouse_train_dir, "0--protein-coding-fastas")
mouse_fasta_input_dir = os.path.join(mouse_kmermaid_base, "translate")

# Copy fastas, restricted to the selected cell ids. `dryrun=False` means the
# copy is actually performed (presumably; semantics live in sig_reorg).
# The recorded run took well over 30 minutes per molecule type and ended in a
# KeyboardInterrupt.
sig_reorg.copy_fastas(
    mouse_train_0_fastas,
    mouse_fasta_input_dir,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False,
)

0it [00:00, ?it/s]

Copying peptides


27200it [30:02, 15.09it/s]
0it [00:00, ?it/s]

Copying nucleotides


7833it [09:36, 11.80it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

24622it [30:55, 13.27it/s]


KeyboardInterrupt: 

## Single cell signatures

In [41]:
# Input folder holding the mouse peptide sketches.
# NOTE(review): this is rooted at `bat_kmermaid_base` (the bat pipeline
# results), whereas the nucleotide sketches below come from
# `mouse_kmermaid_base` — confirm this is intentional.
mouse_peptide_sketch_dir = os.path.join(
    bat_kmermaid_base,
    "sketches_peptide_handmade_mouse_merged_remove_ribosomal_dissociation_all_mouse_cells",
)
# Input folder holding the mouse nucleotide sketches.
mouse_nucleotide_sketch_dir = os.path.join(mouse_kmermaid_base, 'sketches_nucleotide_handmade_per_mouse_remove_ribosomal')

In [44]:
# Copy the nucleotide and peptide sketches of the selected cells. The first two
# arguments are the same folder names used for `train_dir` and
# `train_1_single_cells`; presumably the helper joins them onto
# `sketch_outdir_base` — verify in sig_reorg.
sig_reorg.copy_nucleotide_peptide_sketches(
    "1--train-mouse",
    "1--single-cell-sigs",
    peptide_sketch_dir=mouse_peptide_sketch_dir,
    nucleotide_sketch_dir=mouse_nucleotide_sketch_dir,
    sketch_outdir_base=sig_outdir_base,
    select_cell_ids=CELL_IDS_TO_USE,
    dryrun=False
)

4it [00:00, 28.78it/s]

Copying alphabet-DNA__ksize-21__scaled-10


7109it [02:32, 46.67it/s]
9it [00:00, 85.67it/s]

Copying alphabet-protein__ksize-30__scaled-10


7109it [01:10, 100.94it/s]
7it [00:00, 53.07it/s]

Copying alphabet-dayhoff__ksize-51__scaled-10


7109it [01:01, 114.92it/s]


## Merge signatures on cell types

### Read all copied signatures into dataframe

In [59]:
ls -lha $train_1_single_cells

total 1.6M
drwxr-xr-x 5 olga czb 4.0K Feb 28 09:34 [0m[01;34m.[0m/
drwxr-xr-x 4 olga czb 4.0K Feb 28 09:49 [01;34m..[0m/
drwxr-xr-x 2 olga czb 512K Feb 28 09:35 [01;34malphabet-dayhoff__ksize-51__scaled-10[0m/
drwxr-xr-x 2 olga czb 512K Feb 28 09:32 [01;34malphabet-DNA__ksize-21__scaled-10[0m/
drwxr-xr-x 2 olga czb 512K Feb 28 09:34 [01;34malphabet-protein__ksize-30__scaled-10[0m/


In [65]:

# Collect every copied single-cell signature:
#   <train_1_single_cells>/<sketch_id>/<sig_basename>.sig
train_sig_df = pd.Series(
    glob.glob(os.path.join(train_1_single_cells, "*", "*.sig")),
    name="sig_path",
).to_frame()
if train_sig_df.empty:
    # Fail with a clear message instead of an obscure `.str` accessor error.
    raise FileNotFoundError(
        "No .sig files found under {}".format(train_1_single_cells)
    )
describe(train_sig_df)

# Get sketch id (name of the parent folder) and signature file name.
# Path components are taken with os.path rather than by splitting on the
# directory prefix: `Series.str.split` interprets a multi-character pattern as
# a regular expression, and a prefix that does not match literally would yield
# the wrong number of columns for the positional rename below.
df = pd.DataFrame(
    {
        0: train_sig_df["sig_path"].map(os.path.dirname).map(os.path.basename),
        1: train_sig_df["sig_path"].map(os.path.basename),
    }
)
describe(df)


# Parse the sketch id into its components; extractall adds a `match` index
# level, which is dropped so the rows line up with `train_sig_df` again.
sketch_info_df = df[0].str.extractall(sig_utils.SKETCH_INFO_PATTERN)
sketch_info_df.index = sketch_info_df.index.droplevel(-1)
describe(sketch_info_df)

# Column-wise concat with positional labels (0..7), named right after.
train_sig_df = pd.concat(
    [train_sig_df, df, sketch_info_df], axis=1, ignore_index=True
)
train_sig_df = train_sig_df.rename(
    columns={
        0: "sig_path",
        1: "sketch_id",
        2: "sig_basename",
        3: "alpha_moltype",
        4: "alphabet",
        5: "ksize",
        6: "sketch_style",
        7: "sketch_value",
    }
)
# Cell id is the file name up to the first "."
train_sig_df["cell_id"] = (
    train_sig_df["sig_basename"].str.split(".").str[0]
)

# Sourmash can't handle when ksize is str
train_sig_df['ksize'] = train_sig_df.ksize.astype(int)
describe(train_sig_df)

(17652, 1)
--- First 5 entries ---


Unnamed: 0,sig_path
0,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
1,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
2,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
3,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
4,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...


(17652, 2)
--- First 5 entries ---


Unnamed: 0,0,1
0,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_54__TTCTCAAAGCGTGTCC.sig
1,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_M_LUNG_53__AAACGGGGTTGTCGCG.sig
2,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_55__ATGGGAGAGACAATAC.sig
3,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_55__TTGCGTCTCTATCGCC.sig
4,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__GTATTCTTCCTGCAGG.sig


(17652, 5)
--- First 5 entries ---


Unnamed: 0,mol_or_alpha,alphabet,ksize,sketch_style,sketch_value
0,alphabet,dayhoff,51,scaled,10
1,alphabet,dayhoff,51,scaled,10
2,alphabet,dayhoff,51,scaled,10
3,alphabet,dayhoff,51,scaled,10
4,alphabet,dayhoff,51,scaled,10


(17652, 9)
--- First 5 entries ---


Unnamed: 0,sig_path,sketch_id,sig_basename,alpha_moltype,alphabet,ksize,sketch_style,sketch_value,cell_id
0,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_54__TTCTCAAAGCGTGTCC.sig,alphabet,dayhoff,51,scaled,10,MACA_21m_F_LUNG_54__TTCTCAAAGCGTGTCC
1,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_M_LUNG_53__AAACGGGGTTGTCGCG.sig,alphabet,dayhoff,51,scaled,10,MACA_18m_M_LUNG_53__AAACGGGGTTGTCGCG
2,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_55__ATGGGAGAGACAATAC.sig,alphabet,dayhoff,51,scaled,10,MACA_21m_F_LUNG_55__ATGGGAGAGACAATAC
3,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_55__TTGCGTCTCTATCGCC.sig,alphabet,dayhoff,51,scaled,10,MACA_21m_F_LUNG_55__TTGCGTCTCTATCGCC
4,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__GTATTCTTCCTGCAGG.sig,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51__GTATTCTTCCTGCAGG


### Join with celltype metadata dataframe

In [66]:
# Attach the per-cell metadata (indexed by cell id) to each signature row via
# the `cell_id` column. `DataFrame.join` defaults to a left join, so every
# signature row is kept; rows without a metadata match would get NaN metadata.
train_sig_df_celltypes = train_sig_df.join(adata_shared.obs, on='cell_id')
describe(train_sig_df_celltypes)

(17652, 38)
--- First 5 entries ---


Unnamed: 0,sig_path,sketch_id,sig_basename,alpha_moltype,alphabet,ksize,sketch_style,sketch_value,cell_id,index,...,channel_cleaned,batch,n_genes,n_counts,species_batch_v2,compartment_broad,compartment_broad_narrow,compartment_species,compartment_narrow_species,common_individual_id
0,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_54__TTCTCAAAGCGTGTCC.sig,alphabet,dayhoff,51,scaled,10,MACA_21m_F_LUNG_54__TTCTCAAAGCGTGTCC,TTCTCAAAGCGTGTCC-1-52-0-0-mouse-hlm,...,MACA_21m_F_LUNG_54,,,,hlm,lymphoid: B cell,lymphoid: B cell: B cell,lymphoid: mouse,lymphoid: B cell: mouse,mouse_21-F-54
1,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_M_LUNG_53__AAACGGGGTTGTCGCG.sig,alphabet,dayhoff,51,scaled,10,MACA_18m_M_LUNG_53__AAACGGGGTTGTCGCG,AAACGGGGTTGTCGCG-1-32-0-0-mouse-hlm,...,MACA_18m_M_LUNG_53,,,,hlm,myeloid: Monocyte,myeloid: Monocyte: Monocyte,myeloid: mouse,myeloid: Monocyte: mouse,mouse_18-M-53
2,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_55__ATGGGAGAGACAATAC.sig,alphabet,dayhoff,51,scaled,10,MACA_21m_F_LUNG_55__ATGGGAGAGACAATAC,ATGGGAGAGACAATAC-1-53-0-0-mouse-hlm,...,MACA_21m_F_LUNG_55,,,,hlm,myeloid: Monocyte,myeloid: Monocyte: Monocyte,myeloid: mouse,myeloid: Monocyte: mouse,mouse_21-F-55
3,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_21m_F_LUNG_55__TTGCGTCTCTATCGCC.sig,alphabet,dayhoff,51,scaled,10,MACA_21m_F_LUNG_55__TTGCGTCTCTATCGCC,TTGCGTCTCTATCGCC-1-53-0-0-mouse-hlm,...,MACA_21m_F_LUNG_55,,,,hlm,lymphoid: B cell,lymphoid: B cell: B cell,lymphoid: mouse,lymphoid: B cell: mouse,mouse_21-F-55
4,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...,alphabet-dayhoff__ksize-51__scaled-10,MACA_18m_F_LUNG_51__GTATTCTTCCTGCAGG.sig,alphabet,dayhoff,51,scaled,10,MACA_18m_F_LUNG_51__GTATTCTTCCTGCAGG,GTATTCTTCCTGCAGG-1-12-0-0-mouse-hlm,...,MACA_18m_F_LUNG_51,,,,hlm,myeloid: Monocyte,myeloid: Monocyte: Monocyte,myeloid: mouse,myeloid: Monocyte: mouse,mouse_18-F-51


### Merge signatures on celltypes

In [83]:


# One group per (sketch id, alphabet, ksize, cell type).
group_columns = ["sketch_id", "alphabet", "ksize", "broad_group"]
grouped = train_sig_df_celltypes.groupby(group_columns)
print(len(grouped))

# Materialize the groups once so that the results and their labels are taken
# from the very same iteration. Labelling the results with `grouped.groups`
# relied on the dict's key order matching the iteration order.
group_items = list(grouped)

# Merge the single-cell signatures of each group into one signature per cell
# type; `Parallel` returns results in the order the tasks were submitted.
per_celltype_sigs = Parallel(n_jobs=96)(
    delayed(sig_utils.merge_celltype_sigs)(
        sketch_id,
        ksize,
        moltype,
        broad_group,
        df,
        merged_celltype_outdir_base=train_2_merged_celltype,
        dryrun=False,
        force=True
    )
    for (sketch_id, moltype, ksize, broad_group), df in tqdm(
        group_items, total=len(group_items)
    )
)

# One row per group: the group key columns plus the helper's return value,
# stored as `sig_path`.
per_celltype_sigs_df = pd.DataFrame(
    [
        tuple(key) + (sig_path,)
        for (key, _), sig_path in zip(group_items, per_celltype_sigs)
    ],
    columns=group_columns + ["sig_path"],
)
describe(per_celltype_sigs_df)


  0%|          | 0/30 [00:00<?, ?it/s][A

30



100%|██████████| 30/30 [00:01<00:00, 24.96it/s][A


(30, 5)
--- First 5 entries ---


Unnamed: 0,sketch_id,alphabet,ksize,broad_group,sig_path
0,alphabet-DNA__ksize-21__scaled-10,DNA,21,Alveolar Epithelial Type 2,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
1,alphabet-DNA__ksize-21__scaled-10,DNA,21,B cell,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
2,alphabet-DNA__ksize-21__scaled-10,DNA,21,Capillary,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
3,alphabet-DNA__ksize-21__scaled-10,DNA,21,Dendritic,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...
4,alphabet-DNA__ksize-21__scaled-10,DNA,21,Fibroblast,/mnt/ibm_sm/olga/immune-evolution/kmer-signatu...


### Remove hashes present across 80% of celltypes

In [84]:
per_celltype_sigs_df.dtypes

sketch_id      object
alphabet       object
ksize           int64
broad_group    object
sig_path       object
dtype: object

In [85]:
%%time

# One group per sketch id; note that this rebinds the name `grouped`.
grouped = per_celltype_sigs_df.groupby(["sketch_id", "alphabet", "ksize"])


# Run the common-hash removal once per sketch id with a 0.8 fraction threshold,
# writing into the "3--merged-celltype-remove-common-kmers" folder.
# The recorded run had 3 groups, so only 3 tasks are submitted to the 64
# workers; wall time was about 5.5 hours.
# NOTE(review): the recorded listing of the output folder shows files named
# `*.sig.sig`; the doubled extension presumably comes from the helper's output
# naming — verify in sig_utils.
sigs_without_common_hashes = Parallel(n_jobs=64)(
    delayed(sig_utils.remove_common_hashes_from_sig_df)(
        df,
        sketch_id,
        ksize,
        moltype,
        fraction_threshold=0.8,
        output_dir=train_3_merged_celltype_remove_common,
        force=True,
        create_hash_count_csv=True,
    )
    for (sketch_id, moltype, ksize), df in tqdm(grouped, total=len(grouped))
)
len(sigs_without_common_hashes)


  0%|          | 0/3 [00:00<?, ?it/s][A
100%|██████████| 3/3 [00:01<00:00,  2.60it/s][A


CPU times: user 32.5 s, sys: 6.29 s, total: 38.8 s
Wall time: 5h 35min 33s


3

In [104]:
train_2_merged_celltype

'/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs'

In [108]:
train_2_merged_celltype

'/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/2--merged-celltype-sigs'

In [110]:
ls -lha /home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse

total 3.5K
drwxr-xr-x 7 olga czb 4.0K Feb 28 16:57 [0m[01;34m.[0m/
drwxr-xr-x 7 olga czb 4.0K Mar  5 12:02 [01;34m..[0m/
drwxr-xr-x 4 olga czb 4.0K Feb 28 10:19 [01;34m0--protein-coding-fastas[0m/
drwxr-xr-x 5 olga czb 4.0K Feb 28 09:34 [01;34m1--single-cell-sigs[0m/
drwxr-xr-x 5 olga czb 4.0K Feb 28 10:58 [01;34m2--merged-celltype-sigs[0m/
drwxr-xr-x 8 olga czb 4.0K Feb 28 11:10 [01;34m3--merged-celltype-remove-common-kmers[0m/
drwxr-xr-x 2 olga czb 4.0K Mar 16 10:36 [01;34m4--celltype-sbt-dbts[0m/


In [109]:
train_3_merged_celltype_remove_common

'/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers'

In [113]:
ls -lha $train_3_merged_celltype_remove_common/*

/home/olga/data_sm/immune-evolution/kmer-signatures/1--train-mouse/3--merged-celltype-remove-common-kmers/alphabet-dayhoff__ksize-51__scaled-10:
total 245M
drwxr-xr-x 2 olga czb 4.0K Feb 28 11:39 [0m[01;34m.[0m/
drwxr-xr-x 8 olga czb 4.0K Feb 28 11:10 [01;34m..[0m/
-rw-r--r-- 1 olga czb 1.6M Feb 28 11:39 alveolar_epithelial_type_2.sig.sig
-rw-r--r-- 1 olga czb  19M Feb 28 11:39 b_cell.sig.sig
-rw-r--r-- 1 olga czb  22M Feb 28 11:39 capillary.sig.sig
-rw-r--r-- 1 olga czb 6.1M Feb 28 11:39 dendritic.sig.sig
-rw-r--r-- 1 olga czb 7.7M Feb 28 11:39 fibroblast.sig.sig
-rw-r--r-- 1 olga czb 123M Feb 28 11:11 hash_counts.csv
-rw-r--r-- 1 olga czb 5.7M Feb 28 11:39 macrophage.sig.sig
-rw-r--r-- 1 olga czb  40M Feb 28 11:39 monocyte.sig.sig
-rw-r--r-- 1 olga czb 3.7M Feb 28 11:39 natural_killer_t_cell.sig.sig
-rw-r--r-- 1 olga czb 703K Feb 28 11:39 smooth_muscle_and_myofibroblast.sig.sig
-rw-r--r-- 1 olga czb  17M Feb 28 11:39 t_cell.sig.sig

/home/olga/data_sm/immune-evolution/kmer-signa

In [107]:
ls -lha $train_3_merged_celltype_remove_common/dayhoff

total 1.5K
drwxr-xr-x 2 olga czb 4.0K Feb 28 11:03 [0m[01;34m.[0m/
drwxr-xr-x 8 olga czb 4.0K Feb 28 11:10 [01;34m..[0m/
-rw-r--r-- 1 olga czb   14 Feb 28 11:08 hash_counts.csv
