In [1]:
cd /mnt/data

/mnt/data


In [2]:
! aws s3 sync s3://czbiohub-maca/10x_data/10X_P7_8/ 10X_P7_8/

In [3]:
! samtools


Program: samtools (Tools for alignments in the SAM format)
Version: 1.9 (using htslib 1.9)

Usage:   samtools <command> [options]

Commands:
  -- Indexing
     dict           create a sequence dictionary file
     faidx          index/extract FASTA
     fqidx          index/extract FASTQ
     index          index alignment

  -- Editing
     calmd          recalculate MD/NM tags and '=' bases
     fixmate        fix mate information
     reheader       replace BAM header
     targetcut      cut fosmid regions (for fosmid pool only)
     addreplacerg   adds or replaces RG tags
     markdup        mark duplicates

  -- File operations
     collate        shuffle and group alignments by name
     cat            concatenate BAMs
     merge          merge sorted alignments
     mpileup        multi-way pileup
     sort           sort alignment file
     split          splits a file by read group
     quickcheck     quickly check if SAM/BAM/CRAM file appears inta

In [4]:
ptprc_location = 'chr1:138062861-138175708'

In [5]:
! samtools view -bh 10X_P7_8/possorted_genome_bam.bam $ptprc_location > 10X_P7_8/lung_ptprc.bam

In [6]:
ls -lha 10X_P7_8/

total 37G
drwxrwxr-x 2 ubuntu ubuntu 4.0K Sep  6 19:10 [0m[01;34m.[0m/
drwxr-xr-x 5 ubuntu root   4.0K Sep  7 21:31 [01;34m..[0m/
-rw-rw-r-- 1 ubuntu ubuntu  62M Aug 31  2017 10X_P7_8.mus.cell-gene.csv
-rw-rw-r-- 1 ubuntu ubuntu  19G Aug 31  2017 [01;31m10X_P7_8.tgz[0m
-rw-rw-r-- 1 ubuntu ubuntu  12K Sep  1  2017 barcodes.tsv
-rw-rw-r-- 1 ubuntu ubuntu 340K Sep  1  2017 genes.tsv
-rw-rw-r-- 1 ubuntu ubuntu 3.6M Sep  7 21:58 lung_ptprc.bam
-rw-rw-r-- 1 ubuntu ubuntu  69K Sep  7 21:32 lung_ptprc.bam.bai
-rw-rw-r-- 1 ubuntu ubuntu 931K Sep  7 21:32 lung_ptprc.sig
-rw-rw-r-- 1 ubuntu ubuntu  14M Sep  1  2017 matrix.mtx
-rw-rw-r-- 1 ubuntu ubuntu  612 Aug 31  2017 metrics_summary.csv
-rw-rw-r-- 1 ubuntu ubuntu  19G Jun 19 06:16 possorted_genome_bam.bam
-rw-rw-r-- 1 ubuntu ubuntu 5.5M Jun 19 06:16 possorted_genome_bam.bam.bai
-rw-rw-r-- 1 ubuntu ubuntu 9.2M Sep 16  2017 raw_gene_bc_matrices_h5.h5
-rw-rw-r-- 1 ubuntu ubuntu 2.6M Aug 31  2017 web_summary.html


In [7]:
! samtools index 10X_P7_8/lung_ptprc.bam

In [8]:
ksizes = 21, 27, 33, 51
protein = True
dna = True
seed = 42
track_abundance = True
scaled = 1000
num_hashes = 0
input_is_protein = False 
check_sequence = False

In [9]:
import pandas as pd
barcodes = set(pd.read_csv('/mnt/data/10X_P7_8/barcodes.tsv', squeeze=True))
genes = set(pd.read_csv('/mnt/data/10X_P7_8/genes.tsv', squeeze=True))
len(barcodes)

624

In [10]:
from sourmash import DEFAULT_SEED, MinHash, load_sbt_index, create_sbt_index
from sourmash import signature as sig
from sourmash import sourmash_args
from sourmash.logging import notify, error, print_results, set_quiet
from sourmash.sbtmh import SearchMinHashesFindBest, SigLeaf

from sourmash.sourmash_args import DEFAULT_LOAD_K
DEFAULT_COMPUTE_K = '21,31,51'

DEFAULT_N = 500
WATERMARK_SIZE = 10000
 
def make_minhashes():
    # one minhash for each ksize
    Elist = []
    for k in ksizes:
        if protein:
            E = MinHash(ksize=k, n=num_hashes,
                        is_protein=True,
                        track_abundance=track_abundance,
                        scaled=scaled,
                        seed=seed)
            Elist.append(E)
        if dna:
            E = MinHash(ksize=k, n=num_hashes,
                        is_protein=False,
                        track_abundance=track_abundance,
                        scaled=scaled,
                        seed=seed)
            Elist.append(E)
    return Elist

def add_seq(Elist, seq, input_is_protein, check_sequence):
    for E in Elist:
        if input_is_protein:
            E.add_protein(seq)
        else:
            E.add_sequence(seq, not check_sequence)

def build_siglist(Elist, filename, name=None):
    return [ sig.SourmashSignature(E, filename=filename,
                                   name=name) for E in Elist ]

def save_siglist(siglist, output_fp, filename=None):
    # save!
    if output_fp:
        sig.save_signatures(siglist, args.output)
    else:
        if filename is None:
            raise Exception("internal error, filename is None")
        with open(filename, 'w') as fp:
            sig.save_signatures(siglist, fp)
    notify('saved {} signature(s). Note: signature license is CC0.'.format(len(siglist)))

In [11]:
%load_ext line_profiler

In [39]:
import pandas as pd

annotations = pd.read_csv('~/tabula-muris/00_data_ingest/18_global_annotation_csv/annotations_droplet.csv')
annotations.columns = annotations.columns.map(lambda x: x.replace('.', '_'))

annotations_subset = annotations.loc[annotations.cell.str.startswith('10X_P7_8')]
annotations_subset['barcode'] = annotations_subset.cell.str.split('_').str[-1] + '-1'
annotations_subset['name'] = annotations_subset.apply(
    lambda x: '{cell_ontology_class}|{tissue}|{mouse_id}|{cell}'.format(**x), axis=1)
annotations_subset = annotations_subset.set_index('barcode')
print(annotations_subset.shape)
annotations_subset.head()

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


(621, 21)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,cell,cell_ontology_class,cell_ontology_id,channel,cluster_ids,free_annotation,mouse_id,mouse_sex,subsetA,subsetA_cluster_ids,...,subsetB_cluster_ids,subsetC,subsetC_cluster_ids,subsetD,subsetD_cluster_ids,subtissue,tissue,tissue_tSNE_1,tissue_tSNE_2,name
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACGGGAGGATATAC-1,10X_P7_8_AAACGGGAGGATATAC,myeloid cell,CL:0000763,10X_P7_8,20,dendritic cells and interstital macrophages,3-F-56,F,,,...,,,,,,,Lung,17.024721,-32.902836,myeloid cell|Lung|3-F-56|10X_P7_8_AAACGGGAGGAT...
AAACGGGTCTCGTATT-1,10X_P7_8_AAACGGGTCTCGTATT,alveolar macrophage,CL:0000583,10X_P7_8,5,,3-F-56,F,,,...,,,,,,,Lung,25.160619,25.066566,alveolar macrophage|Lung|3-F-56|10X_P7_8_AAACG...
AAAGATGCAGATCTGT-1,10X_P7_8_AAAGATGCAGATCTGT,B cell,CL:0000236,10X_P7_8,12,,3-F-56,F,,,...,,,,,,,Lung,1.740567,46.488878,B cell|Lung|3-F-56|10X_P7_8_AAAGATGCAGATCTGT
AAATGCCAGATAGTCA-1,10X_P7_8_AAATGCCAGATAGTCA,natural killer cell,CL:0000623,10X_P7_8,7,,3-F-56,F,,,...,,,,,,,Lung,-31.647934,-2.208061,natural killer cell|Lung|3-F-56|10X_P7_8_AAATG...
AAATGCCCAAACTGCT-1,10X_P7_8_AAATGCCCAAACTGCT,T cell,CL:0000084,10X_P7_8,21,,3-F-56,F,,,...,,,,,,,Lung,-37.281266,-5.619565,T cell|Lung|3-F-56|10X_P7_8_AAATGCCCAAACTGCT


In [40]:
def maybe_get_name(barcode):
    try:
        return names[barcode]
    except KeyError:
        return barcode

In [41]:

import pysam
from tqdm import tqdm
import itertools


bam_filename = '/mnt/data/10X_P7_8/lung_ptprc.bam'

def make_10x_signatures(bam_filename):
    output = bam_filename.replace('.bam', '.sig')

    bam_file = pysam.AlignmentFile(bam_filename, mode='rb')
    cell_seqs = {barcode: make_minhashes() for barcode in barcodes}

    for a in tqdm(bam_file):
        if (a.mapq == 255                                    # high quality mapping
            and a.has_tag('CB') and a.get_tag('CB') in barcodes  # in our set of barcodes,
    #         and a.has_tag('GN') and a.get_tag['GN'] in genes   # that maps to a single gene,
    #         and a.has_tag('RE') and a.get_tag('RE') == 'E'   # specifically to an exon,
            and a.has_tag('UB')):                            # and has a good UMI

            barcode = a.get_tag('CB')
    #         print(a)
            # if this isn't marked a duplicate, count it as a UMI
            if not a.is_duplicate:
    #             print(f"Adding {a.seq} to {barcode}")
                add_seq(cell_seqs[barcode], a.seq,
                                input_is_protein, check_sequence)
    cell_signatures = [build_siglist(seqs, filename=bam_filename, name=maybe_get_name(barcode)) 
                       for barcode, seqs in cell_seqs.items()]
    signatures_flat = list(itertools.chain(*cell_signatures))
    save_siglist(signatures_flat, output_fp=False, filename=output)

%lprun -f make_10x_signatures make_10x_signatures(bam_filename)

72221it [00:07, 10105.97it/s]
[Ksaved 4992 signature(s). Note: signature license is CC0.


In [42]:
make_10x_signatures('/mnt/data/10X_P7_8/possorted_genome_bam.bam')

66460272it [38:40, 28642.72it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

104211773it [1:02:25, 27824.02it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

143414463it [1:25:45, 27870.35it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iop

In [43]:
ls -lha /mnt/data/10X_P7_8/

total 37G
drwxrwxr-x  2 ubuntu ubuntu 4.0K Sep  8 01:31 [0m[01;34m.[0m/
drwxr-xr-x 12 ubuntu root   4.0K Sep 10 19:03 [01;34m..[0m/
-rw-rw-r--  1 ubuntu ubuntu  62M Aug 31  2017 10X_P7_8.mus.cell-gene.csv
-rw-rw-r--  1 ubuntu ubuntu  19G Aug 31  2017 [01;31m10X_P7_8.tgz[0m
-rw-rw-r--  1 ubuntu ubuntu  12K Sep  1  2017 barcodes.tsv
-rw-rw-r--  1 ubuntu ubuntu 340K Sep  1  2017 genes.tsv
-rw-rw-r--  1 ubuntu ubuntu 3.6M Sep  7 21:58 lung_ptprc.bam
-rw-rw-r--  1 ubuntu ubuntu  69K Sep  7 21:58 lung_ptprc.bam.bai
-rw-rw-r--  1 ubuntu ubuntu 953K Sep 10 20:50 lung_ptprc.sig
-rw-rw-r--  1 ubuntu ubuntu  14M Sep  1  2017 matrix.mtx
-rw-rw-r--  1 ubuntu ubuntu  612 Aug 31  2017 metrics_summary.csv
-rw-rw-r--  1 ubuntu ubuntu  19G Jun 19 06:16 possorted_genome_bam.bam
-rw-rw-r--  1 ubuntu ubuntu 5.5M Jun 19 06:16 possorted_genome_bam.bam.bai
-rw-rw-r--  1 ubuntu ubuntu 193M Sep 11 00:22 possorted_genome_bam.sig
-rw-rw-r--  1 ubuntu ubuntu 9.2M Sep 16  2017 raw_gene_bc_matr

In [19]:
ls -lha /mnt/data/sourmash_databases/

total 13M
drwxrwxr-x  3 ubuntu ubuntu 4.0K Sep  8 18:05 [0m[01;34m.[0m/
drwxr-xr-x 12 ubuntu root   4.0K Sep 10 19:03 [01;34m..[0m/
drwxrwxr-x  2 ubuntu ubuntu 3.4M Sep  8 18:05 [01;34m.sbt.tabula-muris-dna-k21[0m/
-rw-rw-r--  1 ubuntu ubuntu 8.8M Sep  8 18:05 tabula-muris-dna-k21.sbt.json


In [22]:
! sourmash categorize -h

usage: sourmash [-h] [-q] [-k KSIZE] [--threshold THRESHOLD]
                [--traverse-directory] [--protein] [--no-protein] [--dna]
                [--no-dna] [--csv CSV] [--load-csv LOAD_CSV]
                sbt_name queries [queries ...]

positional arguments:
  sbt_name              name of SBT to load
  queries               list of signatures to categorize

optional arguments:
  -h, --help            show this help message and exit
  -q, --quiet           suppress non-error output
  -k KSIZE, --ksize KSIZE
  --threshold THRESHOLD
  --traverse-directory
  --protein             choose a protein signature (default: False)
  --no-protein          do not choose a protein signature
  --dna                 choose a DNA signature (default: True)
  --no-dna              do not choose a DNA signature
  --csv CSV
  --load-csv LOAD_CSV


In [46]:
%%bash

sourmash search /mnt/data/maca-facs-sourmash_compute_all/A1-B000610-3_56_F-1-1.sig \
    /mnt/data/sourmash_databases/tabula-muris-dna-k21.sbt.json

[KWhen loading query from "/mnt/data/maca-facs-sourmash_compute_all/A1-B000610-3_56_F-1-1.sig"
[K8 signatures matching ksize and molecule type;
[Kneed exactly one. Specify --ksize or --dna/--protein.


In [None]:
%%bash
sourmash categorize --ksize 21 --dna \
    /mnt/data/sourmash_databases/tabula-muris-dna-k21.sbt.json \
    /mnt/data/10X_P7_8/possorted_genome_bam.sig | head

```
 Wed 12 Sep - 20:57  ~/kmer-hashing   origin ☊ olgabot/index-tabula-muris 4● 
  sourmash categorize --ksize 21 --dna \
    /mnt/data/sourmash_databases/tabula-muris-dna-k21.sbt.json \
    /mnt/data/10X_P7_8/possorted_genome_bam.sig | head
found 1 files to query

...sig loading 620
loaded query: alveolar macrophage|Lung|3-F-5... (k=21, DNA)
for alveolar macrophage|Lung|3-F-56|10X_P7_8_ACGGCCACAATGGTCT, found: 0.24 cell_ontology_class:epithelial_cell_of_lung|tissue:Lung|subtissue:nan|free_annotation:alveolar_epithelial_type_1_cells,_alveolar_epithelial_type_2_cells,_club_cells,_and_basal_cells|cell_id:K3-MAA000526-3_9_M-1-1
loaded query: classical monocyte|Lung|3-F-56... (k=21, DNA)
for classical monocyte|Lung|3-F-56|10X_P7_8_ACGAGGACAAGCCCAC, found: 0.40 cell_ontology_class:epithelial_cell_of_lung|tissue:Lung|subtissue:nan|free_annotation:alveolar_epithelial_type_1_cells,_alveolar_epithelial_type_2_cells,_club_cells,_and_basal_cells|cell_id:G4-MAA000526-3_9_M-1-1
loaded query: lung endothelial cell|Lung|3-F... (k=21, DNA)
for lung endothelial cell|Lung|3-F-56|10X_P7_8_AACACGTGTGGCTCCA, found: 0.35 cell_ontology_class:skeletal_muscle_satellite_stem_cell|tissue:Diaphragm|subtissue:nan|free_annotation:nan|cell_id:H3-MAA001454-3_38_F-1-1
loaded query: lung endothelial cell|Lung|3-F... (k=21, DNA)
for lung endothelial cell|Lung|3-F-56|10X_P7_8_CATCGGGCAGATGAGC, found: 0.36 cell_ontology_class:endothelial_cell|tissue:Heart|subtissue:Unknown|free_annotation:nan|cell_id:A2-MAA100037-3_10_M-1-1
loaded query: non-classical monocyte|Lung|3-... (k=21, DNA)
for non-classical monocyte|Lung|3-F-56|10X_P7_8_CTTAACTTCTCAAGTG, found: 0.45 cell_ontology_class:monocyte|tissue:Lung|subtissue:Endomucin|free_annotation:circulating_monocytes|cell_id:A22-MAA001892-3_38_F-1-1
loaded query: stromal cell|Lung|3-F-56|10X_P... (k=21, DNA)
for stromal cell|Lung|3-F-56|10X_P7_8_GTCAAGTAGCTAAACA, found: 0.38 cell_ontology_class:immature_B_cell|tissue:Marrow|subtissue:B-cells|free_annotation:nan|cell_id:E6-MAA001884-3_38_F-1-1
loaded query: stromal cell|Lung|3-F-56|10X_P... (k=21, DNA)
for stromal cell|Lung|3-F-56|10X_P7_8_GTCAAGTAGTTTCCTT, found: 0.27 cell_ontology_class:nan|tissue:Fat|subtissue:SCAT|free_annotation:nan|cell_id:N22-B000127-3_38_F-1-1
loaded query: alveolar macrophage|Lung|3-F-5... (k=21, DNA)
for alveolar macrophage|Lung|3-F-56|10X_P7_8_CCGTACTGTCAGATAA, found: 0.37 cell_ontology_class:monocyte|tissue:Lung|subtissue:Endomucin|free_annotation:circulating_monocytes|cell_id:A22-MAA001892-3_38_F-1-1
loaded query: lung endothelial cell|Lung|3-F... (k=21, DNA)
for lung endothelial cell|Lung|3-F-56|10X_P7_8_CTCAGAAAGTTTCCTT, found: 0.37 cell_ontology_class:skeletal_muscle_satellite_stem_cell|tissue:Diaphragm|subtissue:nan|free_annotation:nan|cell_id:H3-MAA001454-3_38_F-1-1
loaded query: non-classical monocyte|Lung|3-... (k=21, DNA)
for non-classical monocyte|Lung|3-F-56|10X_P7_8_GTCGGGTCACACATGT, found: 0.36 cell_ontology_class:epithelial_cell_of_lung|tissue:Lung|subtissue:nan|free_annotation:alveolar_epithelial_type_1_cells,_alveolar_epithelial_type_2_cells,_club_cells,_and_basal_cells|cell_id:G4-MAA000526-3_9_M-1-1
loaded query: lung endothelial cell|Lung|3-F... (k=21, DNA)
for lung endothelial cell|Lung|3-F-56|10X_P7_8_CCACGGATCAGCTCGG, found: 0.45 cell_ontology_class:bladder_cell|tissue:Bladder|subtissue:nan|free_annotation:Bladder_mesenchymal_cell|cell_id:A3-D041914-3_8_M-1-1
loaded query: non-classical monocyte|Lung|3-... (k=21, DNA)
for non-classical monocyte|Lung|3-F-56|10X_P7_8_GTAACGTAGATAGGAG, found: 0.56 cell_ontology_class:monocyte|tissue:Lung|subtissue:Endomucin|free_annotation:circulating_monocytes|cell_id:A22-MAA001892-3_38_F-1-1
loaded query: alveolar macrophage|Lung|3-F-5... (k=21, DNA)
for alveolar macrophage|Lung|3-F-56|10X_P7_8_GTTTCTAAGTGCAAGC, found: 0.18 cell_ontology_class:epithelial_cell_of_lung|tissue:Lung|subtissue:nan|free_annotation:alveolar_epithelial_type_1_cells,_alveolar_epithelial_type_2_cells,_club_cells,_and_basal_cells|cell_id:G4-MAA000526-3_9_M-1-1
loaded query: B cell|Lung|3-F-56|10X_P7_8_CT... (k=21, DNA)
for B cell|Lung|3-F-56|10X_P7_8_CTGATCCGTTGCGCAC, found: 0.23 cell_ontology_class:immature_B_cell|tissue:Marrow|subtissue:B-cells|free_annotation:nan|cell_id:G8-MAA000652-3_10_M-1-1
loaded query: alveolar macrophage|Lung|3-F-5... (k=21, DNA)
for alveolar macrophage|Lung|3-F-56|10X_P7_8_AACTCTTGTTCCAACA, found: 0.50 cell_ontology_class:monocyte|tissue:Lung|subtissue:Endomucin|free_annotation:circulating_monocytes|cell_id:A22-MAA001892-3_38_F-1-1
loaded query: myeloid cell|Lung|3-F-56|10X_P... (k=21, DNA)
for myeloid cell|Lung|3-F-56|10X_P7_8_GCTGCTTTCATGCTCC, found: 0.22 cell_ontology_class:monocyte|tissue:Lung|subtissue:Endomucin|free_annotation:circulating_monocytes|cell_id:A22-MAA001892-3_38_F-1-1
loaded query: stromal cell|Lung|3-F-56|10X_P... (k=21, DNA)
for stromal cell|Lung|3-F-56|10X_P7_8_AACCATGCACGTCAGC, found: 0.42 cell_ontology_class:endothelial_cell|tissue:Fat|subtissue:BAT|free_annotation:nan|cell_id:F21-B001216-3_38_F-1-1
loaded query: T cell|Lung|3-F-56|10X_P7_8_CC... (k=21, DNA)
for T cell|Lung|3-F-56|10X_P7_8_CCATGTCCACGGTTTA, found: 0.25 cell_ontology_class:T_cell|tissue:Lung|subtissue:EPCAM|free_annotation:nan|cell_id:H1-MAA001889-3_38_F-1-1
loaded query: alveolar macrophage|Lung|3-F-5... (k=21, DNA)
for alveolar macrophage|Lung|3-F-56|10X_P7_8_CGTTGGGCACCGAAAG, found: 0.28 cell_ontology_class:epithelial_cell_of_lung|tissue:Lung|subtissue:nan|free_annotation:alveolar_epithelial_type_1_cells,_alveolar_epithelial_type_2_cells,_club_cells,_and_basal_cells|cell_id:K3-MAA000526-3_9_M-1-1
loaded query: T cell|Lung|3-F-56|10X_P7_8_CC... (k=21, DNA)
for T cell|Lung|3-F-56|10X_P7_8_CCGTTCACATAGTAAG, found: 0.25 cell_ontology_class:epithelial_cell_of_large_intestine|tissue:Large_Intestine|subtissue:Distal|free_annotation:Lgr5+_undifferentiated_cell_(Distal)|cell_id:M2-MAA001873-3_38_F-1-1
loaded query: lung endothelial cell|Lung|3-F... (k=21, DNA)

```

In [1]:
mkdir /mnt/data/10X_P7_8/sourmash_search/


### Can't run this because there's multiple signatures in the sig file

```
sourmash search --ksize 27 --dna --ignore-abundance \
    --output /mnt/data/10X_P7_8/sourmash_search/tabula-muris-dna-k27.csv \
    /mnt/data/10X_P7_8/possorted_genome_bam.sig \
    /mnt/data/maca-facs-sourmash_index_all/tabula-muris-k27-dna/tabula-muris-k27-dna/tabula-muris-k27-dna.sbt.json 
```

In [2]:
mkdir /mnt/data/10X_P7_8/sourmash_categorize

```
sourmash categorize --ksize 27 --dna \
    --csv /mnt/data/10X_P7_8/sourmash_categorize/tabula-muris-dna-k27_ignore-abundance=True.csv \
    --ignore-abundance \
    /mnt/data/maca-facs-sourmash_index_all/tabula-muris-k27-dna/tabula-muris-k27-dna/tabula-muris-k27-dna.sbt.json  \
    /mnt/data/10X_P7_8/possorted_genome_bam.sig
```

In [3]:
ll  /mnt/data/10X_P7_8/sourmash_categorize

total 0
-rw-rw-r-- 1 ubuntu 0 Sep 21 02:15 tabula-muris-dna-k27.csv
-rw-rw-r-- 1 ubuntu 0 Sep 21 02:15 tabula-muris-dna-k27_ignore-abundance=True.csv
