In [1]:
import glob
import os
import sys

import pandas as pd
import screed

from sourmash import signature as sig
from sourmash._minhash import hash_murmur


In [2]:
# Tidy (long-form) table with one row per (cell1, cell2) pair of bladder cells
similarities_csv = 's3://kmer-hashing/tabula-muris/n_hashes=500/bladder/similarities_tidy.csv'
similarities_tidy = pd.read_csv(similarities_csv)

# Report the table dimensions, then preview the first rows
print(similarities_tidy.shape)
similarities_tidy.head()

(1612900, 6)


Unnamed: 0,cell1,cell2,similarity_without_abundance,similarity_with_abundance,cell_ontology_class_cell1,cell_ontology_class
0,A1-B000610-3_56_F-1-1,A1-B000610-3_56_F-1-1,1.0,1.0,bladder cell,bladder cell
1,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1,0.018,0.22846,bladder cell,bladder urothelial cell
2,A1-B000610-3_56_F-1-1,A1-B002771-3_39_F-1-1,0.072,0.24519,bladder cell,bladder cell
3,A1-B000610-3_56_F-1-1,A1-D041914-3_8_M-1-1,0.098,0.454728,bladder cell,bladder cell
4,A1-B000610-3_56_F-1-1,A1-D042253-3_9_M-1-1,0.09,0.231834,bladder cell,bladder cell


In [3]:
# Directory holding the sourmash signature (.sig) files that are loaded below
folder = '/home/olga/pureScratch/olgabot-maca/facs/sourmash/'


In [4]:
# First cell of the comparison; annotated as "bladder cell"
cell1 = 'A1-B000610-3_56_F-1-1'

# Signature selection parameters, reused when loading the second cell
ksize, moltype = 21, 'DNA'

sig1_path = f"{folder}/A1-B000610-3_56_F-1-1_S28.sig"
sig1_candidates = list(
    sig.load_signatures(sig1_path, ksize=ksize, select_moltype=moltype)
)

# Keep the first signature that matches the requested ksize and molecule type
sig1 = sig1_candidates[0]
sig1

SourmashSignature('A1-B000610-3_56_F-1-1_S28', 280b9bab)

In [5]:
# Second cell of the comparison; annotated as "bladder urothelial cell"
cell2 = 'A1-B002764-3_38_F-1-1'

sig2_path = f"{folder}/A1-B002764-3_38_F-1-1_S291.sig"
sig2_candidates = list(
    sig.load_signatures(sig2_path, ksize=ksize, select_moltype=moltype)
)

# Keep the first signature that matches the requested ksize and molecule type
sig2 = sig2_candidates[0]
sig2

SourmashSignature('A1-B002764-3_38_F-1-1_S291', ef185816)

In [6]:
# Hash values of cell1's signature together with their abundances,
# stored as a Series named after the cell
sig1_mins = sig1.minhash.get_mins(with_abundance=True)
sig1_hashes = pd.Series(sig1_mins, name=cell1)
sig1_hashes.head()

3282762037724      2
5959368950693      1
7215285795798     20
11329527245923     1
12290763577842     1
Name: A1-B000610-3_56_F-1-1, dtype: int64

In [7]:
# Hash values of cell2's signature together with their abundances,
# stored as a Series named after the cell
sig2_mins = sig2.minhash.get_mins(with_abundance=True)
sig2_hashes = pd.Series(sig2_mins, name=cell2)
sig2_hashes.head()

4319888920071     1
11897683056067    4
18288931836992    1
27514404317028    1
28801856705165    1
Name: A1-B002764-3_38_F-1-1, dtype: int64

In [8]:
# Abundance difference (cell1 minus cell2) per hash. Index alignment yields
# NaN for hashes present in only one cell; those rows are dropped.
hash_diff = sig1_hashes.sub(sig2_hashes).dropna()
hash_diff

171794956858086        1.0
394031160505449        0.0
513049510280006        5.0
635563321059591       54.0
734478954482604        0.0
790330283259107        0.0
860437213521678       -9.0
936637619733676       -4.0
937624453106451    -4675.0
1012839673718406    -240.0
1026077759632952    -276.0
1067106972354716       0.0
1215285019941982       0.0
1225906413461687       0.0
dtype: float64

In [9]:
# Side-by-side abundances, one column per cell, restricted to the hashes
# that both cells have (rows with a NaN in either column are dropped)
combined_hashes = pd.concat([sig1_hashes, sig2_hashes], axis=1).dropna()
combined_hashes

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
171794956858086,2.0,1.0
394031160505449,1.0,1.0
513049510280006,6.0,1.0
635563321059591,55.0,1.0
734478954482604,1.0,1.0
790330283259107,1.0,1.0
860437213521678,1.0,10.0
936637619733676,3.0,7.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0


In [10]:
# Shared hashes whose abundance differs by more than 50 between the two cells
is_large_difference = hash_diff.abs().gt(50)
hashes_of_interest = combined_hashes.loc[is_large_difference]
hashes_of_interest

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
635563321059591,55.0,1.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0
1026077759632952,39.0,315.0


In [11]:
# Table of read1/read2 FASTQ locations, indexed by its first column
fastq_table_csv = "s3://czb-maca/Plate_seq/3_month/fastqs_read1_read2.csv"
fastqs = pd.read_csv(fastq_table_csv, index_col=0)

# Report the table dimensions, then preview the first rows
print(fastqs.shape)
fastqs.head()

(53760, 2)


Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000126-3_39_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000127-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B000167-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000168-3_57_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000412-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


In [12]:
# FASTQ rows for just the two cells being compared, in (cell1, cell2) order
cells_to_compare = [cell1, cell2]
these_cell_fastqs = fastqs.loc[cells_to_compare]
these_cell_fastqs

Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000610-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B002764-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


### Copy fastqs over

In [14]:
# Local directory that receives the downloaded FASTQ files
fastq_dir = '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/'

# Flatten the read1/read2 columns of both cells into one sequence of S3 paths
# and copy each file into fastq_dir
for fastq in these_cell_fastqs.values.flatten():
    print(fastq)
    # IPython shell escape: $fastq and $fastq_dir are filled in from the
    # Python variables of the same name
    ! aws s3 cp $fastq $fastq_dir

# List the destination directory to check what was downloaded
! ls $fastq_dir

s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B0027

## Extract reads containing the hashes of interest

In [16]:
# Local paths of every file in fastq_dir. glob returns matches in arbitrary
# order, so sort them to make the processing order reproducible.
# NOTE: this rebinds `fastqs`, which until now held the table of S3 FASTQ
# locations read from fastqs_read1_read2.csv.
fastqs = sorted(glob.glob(f"{fastq_dir}*"))

# A bare `len(fastqs)` in the middle of a cell is silently discarded, so
# print the count explicitly before displaying the paths.
print(len(fastqs))
fastqs

['/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R2_001.fastq.gz']

## Check if each sequence contains hashes, then extract them

In [18]:
%%time
query_hashes = set(hashes_of_interest.index)
minhash = sig1.minhash.copy_and_clear()


# now, iterate over the input sequences and output those that add
# hashes!
n = 0
m = 0

NOTIFY_EVERY_BP=1e6

all_kmers_to_hashes_of_interest = {}

watermark = NOTIFY_EVERY_BP

with open("bladder_test_cells_different_hashes.fasta", 'w') as f:

    for fastq in fastqs:
        sample_id = os.path.basename(fastq).split("_R")[0]
        for record in screed.open(fastq):
            n += len(record.sequence)
            while n >= watermark:
                sys.stderr.write('... {} {}\r'.format(watermark, fastq))
                watermark += NOTIFY_EVERY_BP

            minhash.add_sequence(record.sequence, force=True)
            hashes = set(minhash.get_hashes())
            match = hashes.intersection(query_hashes)
            if match:
                n_kmers = len(record.sequence) - ksize + 1
                kmers = [record.sequence[i:(i+ksize)] for i in range(n_kmers)]
                hashed_kmers = [hash_murmur(kmer) for kmer in kmers]
                kmer_to_hash = dict(zip(kmers, hashed_kmers))

                kmer_to_hashes_of_interest = {kmer: h for kmer, h in kmer_to_hash.items() if h in query_hashes}
                if len(kmer_to_hashes_of_interest) != len(match):
                    print("Something is weird! Not all the hashes were found...")
                    print(match)
                    break
                all_kmers_to_hashes_of_interest.update(kmer_to_hashes_of_interest)
            if len(all_kmers_to_hashes_of_interest) == len(query_hashes):
                print("Found all hashes! Exiting.")
                break

Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
CPU times: user 160 ms, sys: 16 ms, total: 176 ms
Wall time: 365 ms
