In [64]:



import glob
import itertools
import math
import os
import sys

import igraph as ig
import leidenalg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import khtools


ksize = 21
moltype = 'DNA'

ignore_abundance = False
downsample = False


# Defaults from 'sourmash index'

bf_size = 1e5
n_children = 2
scaled = False

n_neighbors = 3

try:
    import scanpy.api as sc
except FutureWarning:
    import scanpy as sc

sc.logging.print_versions()


from sourmash import signature as sig
from sourmash.compare import compare_all_pairs
from sourmash.sbt import Leaf
from sourmash.sbtmh import SigLeaf, create_sbt_index
from sourmash import sourmash_args
from sourmash.logging import notify
from sourmash._minhash import hash_murmur
import umap

scanpy==1.4.4 anndata==0.6.22.post1 umap==0.3.9 numpy==1.17.0 scipy==1.3.1 pandas==0.25.0 scikit-learn==0.21.3 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [8]:
similarities_tidy = pd.read_csv('s3://kmer-hashing/tabula-muris/n_hashes=500/bladder/similarities_tidy.csv')
print(similarities_tidy.shape)
similarities_tidy.head()

(1612900, 6)


Unnamed: 0,cell1,cell2,similarity_without_abundance,similarity_with_abundance,cell_ontology_class_cell1,cell_ontology_class
0,A1-B000610-3_56_F-1-1,A1-B000610-3_56_F-1-1,1.0,1.0,bladder cell,bladder cell
1,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1,0.018,0.22846,bladder cell,bladder urothelial cell
2,A1-B000610-3_56_F-1-1,A1-B002771-3_39_F-1-1,0.072,0.24519,bladder cell,bladder cell
3,A1-B000610-3_56_F-1-1,A1-D041914-3_8_M-1-1,0.098,0.454728,bladder cell,bladder cell
4,A1-B000610-3_56_F-1-1,A1-D042253-3_9_M-1-1,0.09,0.231834,bladder cell,bladder cell


In [9]:
%%time

annotations = pd.read_csv('https://github.com/czbiohub/tabula-muris/raw/master/00_data_ingest/18_global_annotation_csv/annotations_facs.csv', 
                          index_col='cell')
annotations.index = annotations.index.str.replace('.', '-')
annotations.columns = annotations.columns.str.replace('.', '_')
annotations['sample_id'] = annotations.index
annotations = annotations.fillna("NA")
bladder_annotations = annotations.query('tissue == "Bladder"')

folder = '/home/olga/pureScratch/olgabot-maca/facs/sourmash/'

all_signatures = glob.glob(f'{folder}/*.sig')
basenames = [os.path.basename(x) for x in all_signatures]
bladder_files = list(itertools.chain(*[[folder + x for x in basenames if x.startswith(cell_id)] for cell_id in bladder_annotations.index]))
len(bladder_files)

ksize = 21
moltype = "DNA"
ignore_abundance = True


bladder_signatures = []
for filename in bladder_files:
     bladder_signatures.extend(sig.load_signatures(filename, ksize=ksize, select_moltype="DNA"))
print(len(bladder_signatures))
bladder_signatures[:5]

1270
CPU times: user 37.6 s, sys: 116 ms, total: 37.7 s
Wall time: 39.2 s


In [10]:
# bladder cell
cell1 = 'A1-B000610-3_56_F-1-1'

sig1 = [x for x in bladder_signatures if x.name().startswith(cell1)][0]
sig1

SourmashSignature('A1-B000610-3_56_F-1-1_S28', 280b9bab)

In [166]:
sig1.minhash.is_protein

False

In [45]:
sig1.minhash.ksize

21

In [11]:
# bladder urothelial cell
cell2 = 'A1-B002764-3_38_F-1-1'

sig2 = [x for x in bladder_signatures if x.name().startswith(cell2)][0]
sig2

SourmashSignature('A1-B002764-3_38_F-1-1_S291', ef185816)

In [24]:
sig1_hashes = pd.Series(sig1.minhash.get_mins(with_abundance=True), name=cell1)
sig1_hashes.head()

3282762037724      2
5959368950693      1
7215285795798     20
11329527245923     1
12290763577842     1
Name: A1-B000610-3_56_F-1-1, dtype: int64

In [23]:
sig2_hashes = pd.Series(sig2.minhash.get_mins(with_abundance=True), name=cell2)
sig2_hashes.head()

4319888920071     1
11897683056067    4
18288931836992    1
27514404317028    1
28801856705165    1
Name: A1-B002764-3_38_F-1-1, dtype: int64

In [22]:
hash_diff = (sig1_hashes - sig2_hashes).dropna()
hash_diff

171794956858086        1.0
394031160505449        0.0
513049510280006        5.0
635563321059591       54.0
734478954482604        0.0
790330283259107        0.0
860437213521678       -9.0
936637619733676       -4.0
937624453106451    -4675.0
1012839673718406    -240.0
1026077759632952    -276.0
1067106972354716       0.0
1215285019941982       0.0
1225906413461687       0.0
dtype: float64

In [26]:
combined_hashes = pd.concat([sig1_hashes, sig2_hashes], axis=1)
combined_hashes = combined_hashes.dropna()
combined_hashes

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
171794956858086,2.0,1.0
394031160505449,1.0,1.0
513049510280006,6.0,1.0
635563321059591,55.0,1.0
734478954482604,1.0,1.0
790330283259107,1.0,1.0
860437213521678,1.0,10.0
936637619733676,3.0,7.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0


In [42]:
hashes_of_interest = combined_hashes.loc[hash_diff.abs() > 50]
hashes_of_interest

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
635563321059591,55.0,1.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0
1026077759632952,39.0,315.0


In [27]:
! aws s3 ls s3://czb-maca/Plate_seq/3_month/

                           PRE 170907_A00111_0051_BH2HWLDMXX/
                           PRE 170907_A00111_0052_AH2HTCDMXX/
                           PRE 170910_A00111_0053_BH2HGKDMXX/
                           PRE 170910_A00111_0054_AH2HGWDMXX/
                           PRE 170910_A00111_0054_AH2HGWDMXX__170910_A00111_0053_BH2HGKDMXX/
                           PRE 170914_A00111_0057_BH3FY7DMXX/
                           PRE 170914_A00111_0058_AH3FYKDMXX/
                           PRE 170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/
                           PRE 170918_A00111_0059_BH3G22DMXX/
                           PRE 170918_A00111_0060_AH3FYVDMXX/
                           PRE 170918_A00111_0060_AH3FYVDMXX__170918_A00111_0059_BH3G22DMXX/
                           PRE 170921_A00111_0062_BH3FYHDMXX/
                           PRE 170921_A00111_0063_AH3G23DMXX/
                           PRE 170921_A00111_0063_AH3G23DMXX__170921_A00111_0062_BH3FYHDMXX/
        

In [29]:
fastqs = pd.read_csv("s3://czb-maca/Plate_seq/3_month/fastqs_read1_read2.csv", index_col=0)
print(fastqs.shape)
fastqs.head()

(53760, 2)


Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000126-3_39_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000127-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B000167-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000168-3_57_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000412-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


In [31]:
these_cell_fastqs = fastqs.loc[[cell1, cell2]]
these_cell_fastqs

Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000610-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B002764-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


### Copy fastqs over

In [56]:
fastq_dir = '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/'

for fastq in these_cell_fastqs.values.flatten():
    print(fastq)
    ! aws s3 cp $fastq $fastq_dir

s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B0027

In [57]:
fastq_dir

'/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/'

In [54]:
ls $fastq_dir

## Extract reads containing the hashes of interest

In [58]:
import screed

ksize = 21

fastqs = glob.glob("/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/*")
len(fastqs)
fastqs

['/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R2_001.fastq.gz']

In [65]:
%%file load_two_cells_igv.txt

genome mm10

load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam

Writing load_two_cells_igv.txt
ERROR! Session/line number was not unique in database. History logging moved to new session 584531


In [68]:
!cat load_two_cells_igv.txt


genome mm10

load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
load https://czb-maca.s3-us-west-2.amazonaws.com/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam


In [70]:
ls -lha

total 28M
drwxrwxr-x 4 olga olga 4.0K Aug 20 14:26 [0m[01;34m.[0m/
drwxrwxr-x 5 olga olga 4.0K Aug  9 17:56 [01;34m..[0m/
-rw-rw-r-- 1 olga olga 153K Apr 10 10:51 074_get_num_hashes_for_tabula_muris_signatures.ipynb
-rw-rw-r-- 1 olga olga 203K Apr 10 10:51 075_compare_10x_facs_num_hashes.ipynb
-rw-rw-r-- 1 olga olga 740K Aug 10 13:56 075_visualize_knn_graph_on_similarities.ipynb
-rw-rw-r-- 1 olga olga 519K Apr 10 10:51 900_hashing_things_out.ipynb
-rw-rw-r-- 1 olga olga 453K Apr 12 07:44 choanoflagellate_similarity_matrices.ipynb
-rw-rw-r-- 1 olga olga 9.5K Apr 12 07:44 choanoflagellate_transcriptome_similarity_files.txt
-rw-rw-r-- 1 olga olga 9.1M Aug  8 13:38 hematopoeisis_human_mouse_zebrafish_kidney.ipynb
-rw-rw-r-- 1 olga olga 138K Aug  8 13:38 hematopoeisis_tf-idf.ipynb
-rw-rw-r-- 1 olga olga  70K Aug  9 11:22 hg38_vs_mm38_orthologues-Copy1.ipynb
-rw-rw-r-- 1 olga olga 486K Aug 10 13:54 hg38_vs_mm38_orthologues.ipynb
-rw-rw-r-- 1 olga olga 219K Aug 19 10:57 hg38_vs_mm38_orth

(5109, 62)


Unnamed: 0,nGene,nReads,orig.ident,well,plate,cell_id,sample_name,patient_id,DOB,gender,...,S.Score,G2M.Score,Phase,main_seurat_cluster,immune_annotation,res.1,nonimmune_seurat_cluster,nonimmune_general_annotation,epi_seurat_cluster,inferCNV_annotation
A10_1001000407,2265,662644,SeuratProject,A10,1001000407,A10_1001000407,LT_S21,TH185,1961-12-29,Male,...,-0.185666,-0.063043,G1,3,non-immune,14,14,Epithelial,14,tumor
A10_B000863,1093,391731,SeuratProject,A10,B000863,A10_B000863,LT_S47,TH220,1946-09-18,Female,...,-0.124229,-0.20354,G1,3,non-immune,14,14,Epithelial,16,nontumor
A11_B000860,790,546027,SeuratProject,A11,B000860,A11_B000860,LT_S47,TH220,1946-09-18,Female,...,-0.059634,0.090602,G2M,3,non-immune,14,14,Epithelial,16,nontumor
A12_B003103,1784,900730,SeuratProject,A12,B003103,A12_B003103,LT_S72,TH222,1959-03-28,Female,...,-0.216072,-0.234825,G1,3,non-immune,14,14,Epithelial,26,nontumor
A15_B000420,510,140556,SeuratProject,A15,B000420,A15_B000420,LT_S66,TH238,1949-08-25,Female,...,-0.059694,-0.067962,G1,3,non-immune,14,14,Epithelial,14,nontumor


In [118]:
%%time
query_hashes = set(hashes_of_interest.index)
minhash = sig1.minhash.copy_and_clear()


# now, iterate over the input sequences and output those that add
# hashes!
n = 0
m = 0

NOTIFY_EVERY_BP=1e6

watermark = NOTIFY_EVERY_BP

with open("bladder_test_cells_different_hashes.fasta", 'w') as f:

    for fastq in fastqs:
        sample_id = os.path.basename(fastq).split("_R")[0]
        for record in screed.open(fastq):
            n += len(record.sequence)
            while n >= watermark:
                sys.stderr.write('... {} {}\r'.format(watermark, fastq))
                watermark += NOTIFY_EVERY_BP

            minhash.add_sequence(record.sequence, force=True)
            hashes = set(minhash.get_hashes())
            match = hashes.intersection(query_hashes)
            if match:
                match_str = [ str(i) for i in match ]
                match_str = ",".join(match_str)
                f.write('>{} {} {}\n{}\n'.format(record.name, sample_id, match_str,
                                                       record.sequence))
                m += len(record.sequence)

... 310000000.0 /home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R2_001.fastq.gz

CPU times: user 3min 16s, sys: 1.26 s, total: 3min 17s
Wall time: 3min 17s


... 311000000.0 /home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R2_001.fastq.gz

In [116]:
match_str

'635563321059591'

In [119]:
! head bladder_test_cells_different_hashes.fasta

>A00111:58:H3FYKDMXX:1:1109:19352:36323 1:N:0:CCGATGTA+GAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GCGATGGATCGAGGTCATGACTGGACACTGCATCGGAAGACACTTCGTGCCGACACGAAGCAGCCTCAATATCTAACACGAGGATGTTCCTGTTGATACC
>A00111:58:H3FYKDMXX:1:1109:16016:36714 1:N:0:CCGATGTA+GAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
AGCATGTTCCCCTCACCTCTCCCCACCCCCTGCCACTTGAAACCTTCTACTAATCAAGAGAAACTTCCAAGCCAACGGAATGGTCAGATCTCACAGGCTG
>A00111:58:H3FYKDMXX:1:1109:15953:36980 1:N:0:CCGATGTA+GAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GGTCTTTATTGCCCACCAGCCACCAACAGTTTCCCAGCCACAGACAGGGTCCTCTGGGTCTCTGTAACCCACTCTCAGGACCCTGAGATGCAGCTACAGT
>A00111:58:H3FYKDMXX:1:1110:11388:1078 1:N:0:CCGATGTA+NAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GTTAGGGGTAATATTCATTTAGCCTTCTGAGCTTTCTGGGCAGACTTGGTGACTTTGCCAGCTCCAGCAGCCTTCTTGTCCACAGCTTTGATGACACCCA
>A00111:58:H3FYKDMXX:1:1110:10384:1219 1:N:0:CCGATGTA+NAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GTTGAGGTACTCAGGTATCAGCTCGTCACAGCTGTCCATGATGAACACACGGCGGACATACAATTTGATGGTGTTCTTCTTCTTCTTG

In [98]:
! grep -A 1 ',' bladder_test_cells_different_hashes.fasta | head -n 20


>A00111:58:H3FYKDMXX:1:1137:9543:21684 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,635563321059591
GCCGTGTCGTGTCACCGTTTCTGCAGGCACCATGAGCCAGGACACCGAAGTGGACATGAAAGATGTGGAGCTGAACGAGCTAGAACCGGAGAAGCAGCCC
>A00111:58:H3FYKDMXX:1:1137:15673:21746 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,635563321059591
GACCAGAGCTCTGGGGTCGAAATAAAGCAGCAACATGAAAGCACAGAGACCAGAGCTCTGGGGTCGAAATTCATACACCTTTGCACAGGGTAGAGGAGTC
>A00111:58:H3FYKDMXX:1:1137:29378:21966 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,635563321059591
GGGGTGCTGGAGCCACTGTCGCCGAGCTCGGGCCACGCTGCTTCTCCTCGCCAGTCGCCCCCCATCGTGCACTAGCGGTCTCAAAAGATTCAAAGTCCAA
>A00111:58:H3FYKDMXX:1:1137:26449:22028 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,635563321059591
ATCCTCCTGTGATCAGGAGCATCAGTCAGCTCTTGAGGAAGCCAAGCAGCCCAAGAATGACAATGTAGTGATCCCCGAGTGTGCACATGGTGGTCTCTAC
>A00111:58:H3FYKDMXX:1:1137:1705:22075 1:N:0:CCGATGTA+GAGAGTAC 937624453106451,635563321059591
GTATCAACGCAGAGTACATGGGGCTCTTTTCCTCTGGCGCGCCACCGACGATCCTATTGTCATCATGGGCCGCCGCCCCGCCCGGGGATACCGGTAATGT
>A00111:58:H3FYKD

In [83]:
! wc -l bladder_test_cells_different_hashes.fasta

6046332 bladder_test_cells_different_hashes.fasta


In [115]:
hashes_of_interest

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
635563321059591,55.0,1.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0
1026077759632952,39.0,315.0


## Match the hashes to kmers

In [167]:
%%time

from collections import Counter, defaultdict

fasta = 'bladder_test_cells_different_hashes.fasta'

query_hashes = set(hashes_of_interest.index)
minhash = sig1.minhash.copy_and_clear()


ksize = sig1.minhash.ksize
# sample_to_kmers = defaultdict(Counter)
kmers_to_hashes = {}

# now, iterate over the input sequences and output those that add
# hashes!
n = 0
m = 0

NOTIFY_EVERY_BP=1e4

seen_hashes = Counter(query_hashes)

for record in screed.open(fasta):
    n += len(record.sequence)
    while n >= watermark:
        sys.stderr.write('... {} {}\r'.format(watermark, fastq))
        watermark += NOTIFY_EVERY_BP

    minhash.add_sequence(record.sequence, force=True)
    hashes = set(minhash.get_hashes())
    match = hashes.intersection(query_hashes)
    
    sample_id = record.name.split()[2]
    
    if match:
        seen_hashes.update(match)
#         for matched_hash in match:
        n_kmers = len(record.sequence) - ksize + 1
        kmers = [record.sequence[i:(i+ksize)] for i in range(n_kmers)]
        hashed_kmers = [hash_murmur(kmer) for kmer in kmers]
        kmer_to_hash = dict(zip(kmers, hashed_kmers))
        
        kmer_to_hashes_of_interest = {kmer: h for kmer, h in kmer_to_hash.items() if h in query_hashes}
        if len(kmer_to_hashes_of_interest) != len(match):
            print("Something is weird! Not all the hashes were found...")
            break
        kmers_to_hashes.update(kmer_to_hashes_of_interest)
    if len(kmers_to_hashes) == len(query_hashes):
        print("Seen all hashes, quitting")
        break
#         sample_to_kmers[sample_id].update(kmers)
#     if all(v > 100 for k, v in seen_hashes.items()):
#         print("Seen all the hashes 100 times! ending")
#         break

Something is weird! Not all the hashes were found...
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 901 µs


In [168]:
match

{937624453106451}

In [169]:
record

{'name': 'A00111:58:H3FYKDMXX:1:1109:16016:36714 1:N:0:CCGATGTA+GAGAGTAC A1-B000610-3_56_F-1-1 937624453106451', 'sequence': 'AGCATGTTCCCCTCACCTCTCCCCACCCCCTGCCACTTGAAACCTTCTACTAATCAAGAGAAACTTCCAAGCCAACGGAATGGTCAGATCTCACAGGCTG', 'description': ''}

In [170]:
kmer_to_hashes_of_interest

{}

In [171]:
kmer_to_hashes_of_interest

{}

In [155]:
match

{635563321059591, 937624453106451, 1012839673718406, 1026077759632952}

In [154]:
! head bladder_test_cells_different_hashes.fasta

>A00111:58:H3FYKDMXX:1:1109:19352:36323 1:N:0:CCGATGTA+GAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GCGATGGATCGAGGTCATGACTGGACACTGCATCGGAAGACACTTCGTGCCGACACGAAGCAGCCTCAATATCTAACACGAGGATGTTCCTGTTGATACC
>A00111:58:H3FYKDMXX:1:1109:16016:36714 1:N:0:CCGATGTA+GAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
AGCATGTTCCCCTCACCTCTCCCCACCCCCTGCCACTTGAAACCTTCTACTAATCAAGAGAAACTTCCAAGCCAACGGAATGGTCAGATCTCACAGGCTG
>A00111:58:H3FYKDMXX:1:1109:15953:36980 1:N:0:CCGATGTA+GAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GGTCTTTATTGCCCACCAGCCACCAACAGTTTCCCAGCCACAGACAGGGTCCTCTGGGTCTCTGTAACCCACTCTCAGGACCCTGAGATGCAGCTACAGT
>A00111:58:H3FYKDMXX:1:1110:11388:1078 1:N:0:CCGATGTA+NAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GTTAGGGGTAATATTCATTTAGCCTTCTGAGCTTTCTGGGCAGACTTGGTGACTTTGCCAGCTCCAGCAGCCTTCTTGTCCACAGCTTTGATGACACCCA
>A00111:58:H3FYKDMXX:1:1110:10384:1219 1:N:0:CCGATGTA+NAGAGTAC A1-B000610-3_56_F-1-1 937624453106451
GTTGAGGTACTCAGGTATCAGCTCGTCACAGCTGTCCATGATGAACACACGGCGGACATACAATTTGATGGTGTTCTTCTTCTTCTTG

In [138]:
kmers_to_hashes

{'ATCGAGGTCATGACTGGACAC': 937624453106451,
 'AGTGGACATGAAAGATGTGGA': 635563321059591,
 'GGAGTGGGTAAATTGCGCGCC': 1012839673718406,
 'GCTCAGTTATTGACTACGAGC': 1026077759632952}

In [139]:
hashes_to_kmers = {v: k for k, v in kmers_to_hashes.items()}
hashes_to_kmers

{937624453106451: 'ATCGAGGTCATGACTGGACAC',
 635563321059591: 'AGTGGACATGAAAGATGTGGA',
 1012839673718406: 'GGAGTGGGTAAATTGCGCGCC',
 1026077759632952: 'GCTCAGTTATTGACTACGAGC'}

In [172]:
hashes_of_interest

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1,kmer
635563321059591,55.0,1.0,AGTGGACATGAAAGATGTGGA
937624453106451,301.0,4976.0,ATCGAGGTCATGACTGGACAC
1012839673718406,5.0,245.0,GGAGTGGGTAAATTGCGCGCC
1026077759632952,39.0,315.0,GCTCAGTTATTGACTACGAGC


In [174]:
%%time
query_hashes = set(hashes_of_interest.index)
minhash = sig1.minhash.copy_and_clear()


# now, iterate over the input sequences and output those that add
# hashes!
n = 0
m = 0

NOTIFY_EVERY_BP=1e6

watermark = NOTIFY_EVERY_BP

with open("bladder_test_cells_different_hashes.fasta", 'w') as f:

    for fastq in fastqs:
        sample_id = os.path.basename(fastq).split("_R")[0]
        for record in screed.open(fastq):
            n += len(record.sequence)
            while n >= watermark:
                sys.stderr.write('... {} {}\r'.format(watermark, fastq))
                watermark += NOTIFY_EVERY_BP

            minhash.add_sequence(record.sequence, force=True)
            hashes = set(minhash.get_hashes())
            match = hashes.intersection(query_hashes)
            if match:
                n_kmers = len(record.sequence) - ksize + 1
                kmers = [record.sequence[i:(i+ksize)] for i in range(n_kmers)]
                hashed_kmers = [hash_murmur(kmer) for kmer in kmers]
                kmer_to_hash = dict(zip(kmers, hashed_kmers))

                kmer_to_hashes_of_interest = {kmer: h for kmer, h in kmer_to_hash.items() if h in query_hashes}
                if len(kmer_to_hashes_of_interest) != len(match):
                    print("Something is weird! Not all the hashes were found...")
                    print(match)
                    break

Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
CPU times: user 188 ms, sys: 16 ms, total: 204 ms
Wall time: 212 ms


In [141]:
for h, k in hashes_to_kmers.items():
    print(f">{h}\n{k}")

>937624453106451
ATCGAGGTCATGACTGGACAC
>635563321059591
AGTGGACATGAAAGATGTGGA
>1012839673718406
GGAGTGGGTAAATTGCGCGCC
>1026077759632952
GCTCAGTTATTGACTACGAGC


In [152]:
import igv

b = igv.Browser({"genome": "mm10"})
b.show()

bam_base_folder = 's3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/'


b.load_track(
    {
        "name": cell1,
        "url": f"{bam_base_folder}{cell1}.gencode.vM19.ERCC.Aligned.out.sorted.bam",
        "indexURL": f"{bam_base_folder}{cell1}.gencode.vM19.ERCC.Aligned.out.sorted.bam.bai",
        "indexed": True
    })


'IGV Browser not ready'

In [140]:
hashes_of_interest['kmer'] = hashes_of_interest.index.map(hashes_to_kmers)
hashes_of_interest

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1,kmer
635563321059591,55.0,1.0,AGTGGACATGAAAGATGTGGA
937624453106451,301.0,4976.0,ATCGAGGTCATGACTGGACAC
1012839673718406,5.0,245.0,GGAGTGGGTAAATTGCGCGCC
1026077759632952,39.0,315.0,GCTCAGTTATTGACTACGAGC


In [136]:
hash_murmur("TTTTTTTTTTTTTTTTTTTTT")

15066920339359888050

In [126]:
seen_hashes

Counter({1026077759632952: 101,
         937624453106451: 39829,
         1012839673718406: 31133,
         635563321059591: 31797})

In [131]:
# sample_to_kmers

In [127]:
for k, v in hash_to_kmers.items():
    print(k, v.most_common(5))

937624453106451 [('TTTTTTTTTTTTTTTTTTTTT', 6670), ('AAAAAAAAAAAAAAAAAAAAA', 2270), ('CTTTTTTTTTTTTTTTTTTTT', 513), ('ACTTTTTTTTTTTTTTTTTTT', 510), ('GTACTTTTTTTTTTTTTTTTT', 508)]
635563321059591 [('TTTTTTTTTTTTTTTTTTTTT', 5518), ('AAAAAAAAAAAAAAAAAAAAA', 1877), ('CTTTTTTTTTTTTTTTTTTTT', 417), ('ACTTTTTTTTTTTTTTTTTTT', 415), ('GTACTTTTTTTTTTTTTTTTT', 414)]
1012839673718406 [('TTTTTTTTTTTTTTTTTTTTT', 5409), ('AAAAAAAAAAAAAAAAAAAAA', 1849), ('CTTTTTTTTTTTTTTTTTTTT', 408), ('GTACTTTTTTTTTTTTTTTTT', 406), ('ACTTTTTTTTTTTTTTTTTTT', 406)]
1026077759632952 [('GCTCGTAGTCAATAACTGAGC', 2), ('CTCGTAGTCAATAACTGAGCC', 2), ('TCGTAGTCAATAACTGAGCCA', 2), ('CGTAGTCAATAACTGAGCCAT', 2), ('GTAGTCAATAACTGAGCCATC', 2)]


In [95]:
%%time

from collections import Counter, defaultdict

fasta = 'bladder_test_cells_different_hashes.fasta'

query_hashes = set(hashes_of_interest.index)
minhash = sig1.minhash.copy_and_clear()


ksize = 21
sample_to_kmer_counts = {}

# now, iterate over the input sequences and output those that add
# hashes!
n = 0
m = 0

NOTIFY_EVERY_BP=1e4


seen_hashes = Counter(query_hashes)

for record in screed.open(fasta):
    n += len(record.sequence)
    while n >= watermark:
        sys.stderr.write('... {} {}\r'.format(watermark, fastq))
        watermark += NOTIFY_EVERY_BP

    minhash.add_sequence(record.sequence, force=True)
    hashes = set(minhash.get_hashes())
    match = hashes.intersection(query_hashes)
    
    
    
    if match:
        seen_hashes.update(match)
        for matched_hash in match:
            n_kmers = len(record.sequence) - ksize + 1
            kmers = [record.sequence[i:(i+ksize)] for i in range(n_kmers)]
            hash_to_kmers[matched_hash].update(kmers)
    if all(v > 100 for k, v in seen_hashes.items()):
        print("Seen all the hashes 100 times! ending")
        break

'A00111:58:H3FYKDMXX:1:1110:3332:1814 1:N:0:CCGATGTA+GAGAGTAC 937624453106451'

In [122]:
record.name

'A00111:57:H3FY7DMXX:2:2488:17372:36996 2:N:0:GAGTGGTT+ACTTGGAG'

In [90]:
record.sequence

'GCCCAGTCCATCAGTTTCTCAGCCCAGTTCTTCTCAAAGTGAAGAAAAAGCTCCTGAGTTGCCCAAACCAAAGAAGAACAGATGTTTTATGTGTAGAAAG'

In [81]:
# hash_to_kmers

In [None]:
hash_to_kmers[635563321059591]

In [None]:
%%file

#! /usr/bin/env python
"""
Given a signature file and a collection of sequences, output all of the
sequences that contain a k-mer in the signature file.
"""
import sys
import argparse
import sourmash
from sourmash import sourmash_args
import screed


NOTIFY_EVERY_BP=1e6


def main():
    p = argparse.ArgumentParser()
    p.add_argument('sigfile')
    p.add_argument('seqfiles', nargs='+')
    sourmash_args.add_ksize_arg(p, 31)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args()

    # load the minhash object that we'll use, and construct an empty one.
    moltype = sourmash_args.calculate_moltype(args)
    sig = sourmash.load_one_signature(args.sigfile,
                                      ksize=args.ksize,
                                      select_moltype=moltype)
    query_hashes = set(sig.minhash.get_mins())
    minhash = sig.minhash.copy_and_clear()
    

    # now, iterate over the input sequences and output those that add
    # hashes!
    n = 0
    m = 0
    watermark = NOTIFY_EVERY_BP
    for filename in args.seqfiles:
        for record in screed.open(filename):
            n += len(record.sequence)
            while n >= watermark:
                sys.stderr.write('... {} {}\r'.format(watermark, filename))
                watermark += NOTIFY_EVERY_BP

            hashes = minhash.add_sequence(record.sequence, force=True,
                                          output_added=True)
            hashes = set(hashes)
            match = hashes.intersection(query_hashes)
            if match:
                match_str = [ str(i) for i in match ]
                match_str = ",".join(match_str)
                sys.stdout.write('>{} {}\n{}\n'.format(record.name, match_str,
                                                       record.sequence))
                m += len(record.sequence)

    sys.stderr.write('read {}, wrote {}\n'.format(n, m))


if __name__ == '__main__':
    sys.exit(main())


### Copy bams over

In [43]:
! aws s3 ls s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/$cell1 

2019-03-14 14:18:06   62996320 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
2019-03-14 14:18:08    1446080 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam.bai
2019-03-14 14:18:06     409320 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.SJ.out.tab
2019-03-14 14:18:06     540460 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.htseq-count.txt
2019-03-14 14:18:06       1849 A1-B000610-3_56_F-1-1.gencode.vM19.ERCC.log.final.out


In [44]:
! aws s3 ls s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/results_gencode_ercc/$cell2

2019-03-14 14:21:26   37259956 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam
2019-03-14 14:21:27    1425896 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.Aligned.out.sorted.bam.bai
2019-03-14 14:21:26      19274 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.SJ.out.tab
2019-03-14 14:21:26     537300 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.htseq-count.txt
2019-03-14 14:21:26       1840 A1-B002764-3_38_F-1-1.gencode.vM19.ERCC.log.final.out
