In [64]:



import glob
import itertools
import math
import os
import sys

import igraph as ig
import leidenalg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import khtools


ksize = 21
moltype = 'DNA'

ignore_abundance = False
downsample = False


# Defaults from 'sourmash index'

bf_size = 1e5
n_children = 2
scaled = False

n_neighbors = 3

try:
    import scanpy.api as sc
except FutureWarning:
    import scanpy as sc

sc.logging.print_versions()


from sourmash import signature as sig
from sourmash.compare import compare_all_pairs
from sourmash.sbt import Leaf
from sourmash.sbtmh import SigLeaf, create_sbt_index
from sourmash import sourmash_args
from sourmash.logging import notify

import umap

scanpy==1.4.4 anndata==0.6.22.post1 umap==0.3.9 numpy==1.17.0 scipy==1.3.1 pandas==0.25.0 scikit-learn==0.21.3 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [8]:
similarities_tidy = pd.read_csv('s3://kmer-hashing/tabula-muris/n_hashes=500/bladder/similarities_tidy.csv')
print(similarities_tidy.shape)
similarities_tidy.head()

(1612900, 6)


Unnamed: 0,cell1,cell2,similarity_without_abundance,similarity_with_abundance,cell_ontology_class_cell1,cell_ontology_class
0,A1-B000610-3_56_F-1-1,A1-B000610-3_56_F-1-1,1.0,1.0,bladder cell,bladder cell
1,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1,0.018,0.22846,bladder cell,bladder urothelial cell
2,A1-B000610-3_56_F-1-1,A1-B002771-3_39_F-1-1,0.072,0.24519,bladder cell,bladder cell
3,A1-B000610-3_56_F-1-1,A1-D041914-3_8_M-1-1,0.098,0.454728,bladder cell,bladder cell
4,A1-B000610-3_56_F-1-1,A1-D042253-3_9_M-1-1,0.09,0.231834,bladder cell,bladder cell


In [9]:
%%time

annotations = pd.read_csv('https://github.com/czbiohub/tabula-muris/raw/master/00_data_ingest/18_global_annotation_csv/annotations_facs.csv', 
                          index_col='cell')
annotations.index = annotations.index.str.replace('.', '-')
annotations.columns = annotations.columns.str.replace('.', '_')
annotations['sample_id'] = annotations.index
annotations = annotations.fillna("NA")
bladder_annotations = annotations.query('tissue == "Bladder"')

folder = '/home/olga/pureScratch/olgabot-maca/facs/sourmash/'

all_signatures = glob.glob(f'{folder}/*.sig')
basenames = [os.path.basename(x) for x in all_signatures]
bladder_files = list(itertools.chain(*[[folder + x for x in basenames if x.startswith(cell_id)] for cell_id in bladder_annotations.index]))
len(bladder_files)

ksize = 21
moltype = "DNA"
ignore_abundance = True


bladder_signatures = []
for filename in bladder_files:
     bladder_signatures.extend(sig.load_signatures(filename, ksize=ksize, select_moltype="DNA"))
print(len(bladder_signatures))
bladder_signatures[:5]

1270
CPU times: user 37.6 s, sys: 116 ms, total: 37.7 s
Wall time: 39.2 s


In [10]:
# bladder cell
cell1 = 'A1-B000610-3_56_F-1-1'

sig1 = [x for x in bladder_signatures if x.name().startswith(cell1)][0]
sig1

SourmashSignature('A1-B000610-3_56_F-1-1_S28', 280b9bab)

In [45]:
sig1.minhash.ksize

21

In [11]:
# bladder urothelial cell
cell2 = 'A1-B002764-3_38_F-1-1'

sig2 = [x for x in bladder_signatures if x.name().startswith(cell2)][0]
sig2

SourmashSignature('A1-B002764-3_38_F-1-1_S291', ef185816)

In [24]:
sig1_hashes = pd.Series(sig1.minhash.get_mins(with_abundance=True), name=cell1)
sig1_hashes.head()

3282762037724      2
5959368950693      1
7215285795798     20
11329527245923     1
12290763577842     1
Name: A1-B000610-3_56_F-1-1, dtype: int64

In [23]:
sig2_hashes = pd.Series(sig2.minhash.get_mins(with_abundance=True), name=cell2)
sig2_hashes.head()

4319888920071     1
11897683056067    4
18288931836992    1
27514404317028    1
28801856705165    1
Name: A1-B002764-3_38_F-1-1, dtype: int64

In [22]:
hash_diff = (sig1_hashes - sig2_hashes).dropna()
hash_diff

171794956858086        1.0
394031160505449        0.0
513049510280006        5.0
635563321059591       54.0
734478954482604        0.0
790330283259107        0.0
860437213521678       -9.0
936637619733676       -4.0
937624453106451    -4675.0
1012839673718406    -240.0
1026077759632952    -276.0
1067106972354716       0.0
1215285019941982       0.0
1225906413461687       0.0
dtype: float64

In [26]:
combined_hashes = pd.concat([sig1_hashes, sig2_hashes], axis=1)
combined_hashes = combined_hashes.dropna()
combined_hashes

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
171794956858086,2.0,1.0
394031160505449,1.0,1.0
513049510280006,6.0,1.0
635563321059591,55.0,1.0
734478954482604,1.0,1.0
790330283259107,1.0,1.0
860437213521678,1.0,10.0
936637619733676,3.0,7.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0


In [42]:
hashes_of_interest = combined_hashes.loc[hash_diff.abs() > 50]
hashes_of_interest

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
635563321059591,55.0,1.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0
1026077759632952,39.0,315.0


In [27]:
! aws s3 ls s3://czb-maca/Plate_seq/3_month/

                           PRE 170907_A00111_0051_BH2HWLDMXX/
                           PRE 170907_A00111_0052_AH2HTCDMXX/
                           PRE 170910_A00111_0053_BH2HGKDMXX/
                           PRE 170910_A00111_0054_AH2HGWDMXX/
                           PRE 170910_A00111_0054_AH2HGWDMXX__170910_A00111_0053_BH2HGKDMXX/
                           PRE 170914_A00111_0057_BH3FY7DMXX/
                           PRE 170914_A00111_0058_AH3FYKDMXX/
                           PRE 170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/
                           PRE 170918_A00111_0059_BH3G22DMXX/
                           PRE 170918_A00111_0060_AH3FYVDMXX/
                           PRE 170918_A00111_0060_AH3FYVDMXX__170918_A00111_0059_BH3G22DMXX/
                           PRE 170921_A00111_0062_BH3FYHDMXX/
                           PRE 170921_A00111_0063_AH3G23DMXX/
                           PRE 170921_A00111_0063_AH3G23DMXX__170921_A00111_0062_BH3FYHDMXX/
        

In [29]:
fastqs = pd.read_csv("s3://czb-maca/Plate_seq/3_month/fastqs_read1_read2.csv", index_col=0)
print(fastqs.shape)
fastqs.head()

(53760, 2)


Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000126-3_39_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000127-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B000167-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000168-3_57_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000412-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


In [31]:
these_cell_fastqs = fastqs.loc[[cell1, cell2]]
these_cell_fastqs

Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000610-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B002764-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


### Copy fastqs over

In [56]:
fastq_dir = '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/'

for fastq in these_cell_fastqs.values.flatten():
    print(fastq)
    ! aws s3 cp $fastq $fastq_dir

s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B0027

In [57]:
fastq_dir

'/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/'

In [54]:
ls $fastq_dir

## Extract reads containing the hashes of interest

In [58]:
import screed

ksize = 21

fastqs = glob.glob("/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/*")
len(fastqs)
fastqs

['/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R2_001.fastq.gz']

In [65]:
%%file load_two_cells_igv.txt

genome mm10

load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam

Writing load_two_cells_igv.txt
ERROR! Session/line number was not unique in database. History logging moved to new session 584531


In [68]:
!cat load_two_cells_igv.txt


genome mm10

load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam


In [70]:
ls -lha

total 28M
drwxrwxr-x 4 olga olga 4.0K Aug 20 14:26 [0m[01;34m.[0m/
drwxrwxr-x 5 olga olga 4.0K Aug  9 17:56 [01;34m..[0m/
-rw-rw-r-- 1 olga olga 153K Apr 10 10:51 074_get_num_hashes_for_tabula_muris_signatures.ipynb
-rw-rw-r-- 1 olga olga 203K Apr 10 10:51 075_compare_10x_facs_num_hashes.ipynb
-rw-rw-r-- 1 olga olga 740K Aug 10 13:56 075_visualize_knn_graph_on_similarities.ipynb
-rw-rw-r-- 1 olga olga 519K Apr 10 10:51 900_hashing_things_out.ipynb
-rw-rw-r-- 1 olga olga 453K Apr 12 07:44 choanoflagellate_similarity_matrices.ipynb
-rw-rw-r-- 1 olga olga 9.5K Apr 12 07:44 choanoflagellate_transcriptome_similarity_files.txt
-rw-rw-r-- 1 olga olga 9.1M Aug  8 13:38 hematopoeisis_human_mouse_zebrafish_kidney.ipynb
-rw-rw-r-- 1 olga olga 138K Aug  8 13:38 hematopoeisis_tf-idf.ipynb
-rw-rw-r-- 1 olga olga  70K Aug  9 11:22 hg38_vs_mm38_orthologues-Copy1.ipynb
-rw-rw-r-- 1 olga olga 486K Aug 10 13:54 hg38_vs_mm38_orthologues.ipynb
-rw-rw-r-- 1 olga olga 219K Aug 19 10:57 hg38_vs_mm38_orth

In [72]:
metadata = pd.read_csv("s3://darmanis-group/singlecell_lungadeno/20190820_metadata_epi_cells.csv", index_col=0)
print(metadata.shape)
metadata.head()

(5109, 62)


Unnamed: 0,nGene,nReads,orig.ident,well,plate,cell_id,sample_name,patient_id,DOB,gender,...,S.Score,G2M.Score,Phase,main_seurat_cluster,immune_annotation,res.1,nonimmune_seurat_cluster,nonimmune_general_annotation,epi_seurat_cluster,inferCNV_annotation
A10_1001000407,2265,662644,SeuratProject,A10,1001000407,A10_1001000407,LT_S21,TH185,1961-12-29,Male,...,-0.185666,-0.063043,G1,3,non-immune,14,14,Epithelial,14,tumor
A10_B000863,1093,391731,SeuratProject,A10,B000863,A10_B000863,LT_S47,TH220,1946-09-18,Female,...,-0.124229,-0.20354,G1,3,non-immune,14,14,Epithelial,16,nontumor
A11_B000860,790,546027,SeuratProject,A11,B000860,A11_B000860,LT_S47,TH220,1946-09-18,Female,...,-0.059634,0.090602,G2M,3,non-immune,14,14,Epithelial,16,nontumor
A12_B003103,1784,900730,SeuratProject,A12,B003103,A12_B003103,LT_S72,TH222,1959-03-28,Female,...,-0.216072,-0.234825,G1,3,non-immune,14,14,Epithelial,26,nontumor
A15_B000420,510,140556,SeuratProject,A15,B000420,A15_B000420,LT_S66,TH238,1949-08-25,Female,...,-0.059694,-0.067962,G1,3,non-immune,14,14,Epithelial,14,nontumor


In [64]:

query_hashes = set(hashes_of_interest.index)
minhash = sig1.minhash.copy_and_clear()


# now, iterate over the input sequences and output those that add
# hashes!
n = 0
m = 0

NOTIFY_EVERY_BP=1e6

watermark = NOTIFY_EVERY_BP


for fastq in fastqs:
    for record in screed.open(fastq):
        n += len(record.sequence)
        while n >= watermark:
            sys.stderr.write('... {} {}\r'.format(watermark, fastq))
            watermark += NOTIFY_EVERY_BP

        minhash.add_sequence(record.sequence, force=True)
        hashes = set(minhash.get_hashes())
        match = hashes.intersection(query_hashes)
        if match:
            match_str = [ str(i) for i in match ]
            match_str = ",".join(match_str)
            sys.stdout.write('>{} {}\n{}\n'.format(record.name, match_str,
                                                   record.sequence))
            m += len(record.sequence)

>A00111:58:H3FYKDMXX:1:1109:19352:36323 1:N:0:CCGATGTA+GAGAGTAC 937624453106451
GCGATGGATCGAGGTCATGACTGGACACTGCATCGGAAGACACTTCGTGCCGACACGAAGCAGCCTCAATATCTAACACGAGGATGTTCCTGTTGATACC
>A00111:58:H3FYKDMXX:1:1109:16016:36714 1:N:0:CCGATGTA+GAGAGTAC 937624453106451
AGCATGTTCCCCTCACCTCTCCCCACCCCCTGCCACTTGAAACCTTCTACTAATCAAGAGAAACTTCCAAGCCAACGGAATGGTCAGATCTCACAGGCTG
>A00111:58:H3FYKDMXX:1:1109:15953:36980 1:N:0:CCGATGTA+GAGAGTAC 937624453106451
GGTCTTTATTGCCCACCAGCCACCAACAGTTTCCCAGCCACAGACAGGGTCCTCTGGGTCTCTGTAACCCACTCTCAGGACCCTGAGATGCAGCTACAGT
>A00111:58:H3FYKDMXX:1:1110:11388:1078 1:N:0:CCGATGTA+NAGAGTAC 937624453106451
GTTAGGGGTAATATTCATTTAGCCTTCTGAGCTTTCTGGGCAGACTTGGTGACTTTGCCAGCTCCAGCAGCCTTCTTGTCCACAGCTTTGATGACACCCA
>A00111:58:H3FYKDMXX:1:1110:10384:1219 1:N:0:CCGATGTA+NAGAGTAC 937624453106451
GTTGAGGTACTCAGGTATCAGCTCGTCACAGCTGTCCATGATGAACACACGGCGGACATACAATTTGATGGTGTTCTTCTTCTTCTTGTTCTCAAAAAGG
>A00111:58:H3FYKDMXX:1:1110:5846:1344 1:N:0:CCGATGTA+GAGAGTAC 937624453106451
CTGTTGAATATGCTGCAGG

... 1000000.0 /home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz

>A00111:58:H3FYKDMXX:1:1139:26811:32299 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GTCCAGGCCTCCAAAGACCGGAATGTGGTCTTCTCTCCCTATGGCGTGTCCTCGGTGCTGGCTATGCTGCAGATGACCACAGCGGGGAAAACCCGGCGGC
>A00111:58:H3FYKDMXX:1:1139:30418:32377 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
CTGTCTCGTGTTTCATGGCAGACACCTGAAGATCTGGGTCCTCGGTCAGCCCGCACAGGTTGAGATCGTAAGTGAAGTCAAAGAATTCCAGTTTCTCCAT
>A00111:58:H3FYKDMXX:1:1139:32859:32659 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GGAATGGCAGTGTGGACCTCTGTGATGGGACATCTTGTGGGATCTCACAGCCAGTTCTGTGGCAAAGGAGGATGAAGGCTTACGAGGACTCGTCAGGAAG
>A00111:58:H3FYKDMXX:1:1139:14073:32690 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GTACAAGATCACACTACCGGACTTCAGCGGGGACTTCAAGATCAAGGCCGTGGGTCGTGGGCAGTACGAGTTTCATAGCCTGGAGATTCATAACTGTGAT
>A00111:58:H3FYKDMXX:1:1139:7410:32706 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GTGAAGAGTGGAAGGGTTATGTGGTCCGGATC

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



>A00111:58:H3FYKDMXX:1:1211:16486:4867 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
CCTGTGCGCAGTTTACGAGTCAGGTCTTCTATCTGTAGTTGCACTATGTAAGCTCTTTCCTGTTCCCGAGTCAGTCCAGGGGGTATAACTGTAGGCATGC
>A00111:58:H3FYKDMXX:1:1211:5828:4914 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GGGCCCGGCTGACAGTTACACGTGTGTTGCGTCAGTCCCGTGTCCACACGCGCTCAGCCACGTTTGATCCGGATTGCATCAAGTCCCGAAACCCGGTGCG
>A00111:58:H3FYKDMXX:1:1211:2112:4993 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
CTGCGGAGGTTGGTCAGATGGTTGCCCATCTTCTTGATGAGTTTCACCTCCTTATCCAGATAGTGGCATTCCAGGAAGTCACAGAGATGAGGGTCCGCGC
>A00111:58:H3FYKDMXX:1:1211:9697:5008 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
CTTCCACACATAAGTGTCCTGTGCTTGCTCTGGTCCACGAGCAAAGACTAAACAAGCATGGACTGTCAGTGATGGTCTGCAGCAGGGCTGGTGGCAGCCA
>A00111:58:H3FYKDMXX:1:1211:12283:5134 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GCAGAGGCTGATCTTTGCCGGCAAGCAGCTAGAAGATGG

ERROR:root:Internal Python error in the inspect module./3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
Below is the traceback from this internal error.



>A00111:58:H3FYKDMXX:1:1220:23873:12649 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
ATGCAGATCTTCGTGAAGACCCTGACCGGCAAGACCATCACCCTGGAGGTGGAGCCCAGTGACACCATCGAGAAAGTGAAGGCCAAGATCCAGGATAAAG
>A00111:58:H3FYKDMXX:1:1220:7817:12680 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GGTTTAATATGGGGTGGGGTGTTTAGTGGATTAGCTGGTATGTAGTTGTCTGGGTCTCCTAGTATGTCTGGGAAAAATAATACTAGGGTTATGAGAATTA
>A00111:58:H3FYKDMXX:1:1220:29803:12680 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
GGAGTGAGAGGGCCTCTGGGTGGATTTCCACACAATTTTCTCCAGGGTGGCATTCTCATCGAAGGACCAGTTCCTCCCTTGGTCTGCCAATCAGTTGCCC
>A00111:58:H3FYKDMXX:1:1220:3106:12759 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
AGGTGAAGTGCGACATGGAGGTGAGCTGCCCTGAAGGATATACCTGCTGCCGCCTCAACACTGGGGCCTGGGGCTGCTGTCCATTTGCCAAGGCCGTGTG
>A00111:58:H3FYKDMXX:1:1220:28456:12947 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,1012839673718406,635563321059591
CACCTGCATCTATGAGGTTCTTGGCTTGCGCAG


KeyboardInterrupt



KeyboardInterrupt: 

In [None]:
%%file

#! /usr/bin/env python
"""
Given a signature file and a collection of sequences, output all of the
sequences that contain a k-mer in the signature file.
"""
import sys
import argparse
import sourmash
from sourmash import sourmash_args
import screed


NOTIFY_EVERY_BP=1e6


def main():
    p = argparse.ArgumentParser()
    p.add_argument('sigfile')
    p.add_argument('seqfiles', nargs='+')
    sourmash_args.add_ksize_arg(p, 31)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args()

    # load the minhash object that we'll use, and construct an empty one.
    moltype = sourmash_args.calculate_moltype(args)
    sig = sourmash.load_one_signature(args.sigfile,
                                      ksize=args.ksize,
                                      select_moltype=moltype)
    query_hashes = set(sig.minhash.get_mins())
    minhash = sig.minhash.copy_and_clear()
    

    # now, iterate over the input sequences and output those that add
    # hashes!
    n = 0
    m = 0
    watermark = NOTIFY_EVERY_BP
    for filename in args.seqfiles:
        for record in screed.open(filename):
            n += len(record.sequence)
            while n >= watermark:
                sys.stderr.write('... {} {}\r'.format(watermark, filename))
                watermark += NOTIFY_EVERY_BP

            hashes = minhash.add_sequence(record.sequence, force=True,
                                          output_added=True)
            hashes = set(hashes)
            match = hashes.intersection(query_hashes)
            if match:
                match_str = [ str(i) for i in match ]
                match_str = ",".join(match_str)
                sys.stdout.write('>{} {}\n{}\n'.format(record.name, match_str,
                                                       record.sequence))
                m += len(record.sequence)

    sys.stderr.write('read {}, wrote {}\n'.format(n, m))


if __name__ == '__main__':
    sys.exit(main())


### Copy bams over

In [43]:
! aws s3 ls s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/$cell1 

2019-03-14 14:18:06   62996320 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
2019-03-14 14:18:08    1446080 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam.bai
2019-03-14 14:18:06     409320 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.SJ.out.tab
2019-03-14 14:18:06     540460 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.htseq-count.txt
2019-03-14 14:18:06       1849 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.log.final.out


In [44]:
! aws s3 ls s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/$cell2

2019-03-14 14:21:26   37259956 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
2019-03-14 14:21:27    1425896 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam.bai
2019-03-14 14:21:26      19274 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.SJ.out.tab
2019-03-14 14:21:26     537300 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.htseq-count.txt
2019-03-14 14:21:26       1840 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.log.final.out
