In [1]:
import glob
import os
import sys

import pandas as pd
import screed

from sourmash import signature as sig
from sourmash._minhash import hash_murmur


In [2]:
# Load the tidy table of pairwise cell-cell similarities for bladder
# (one row per cell1/cell2 pair) and preview it.
similarities_tidy = pd.read_csv('s3://kmer-hashing/tabula-muris/n_hashes=500/bladder/similarities_tidy.csv')
print(similarities_tidy.shape)
similarities_tidy.head()

(1612900, 6)


Unnamed: 0,cell1,cell2,similarity_without_abundance,similarity_with_abundance,cell_ontology_class_cell1,cell_ontology_class
0,A1-B000610-3_56_F-1-1,A1-B000610-3_56_F-1-1,1.0,1.0,bladder cell,bladder cell
1,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1,0.018,0.22846,bladder cell,bladder urothelial cell
2,A1-B000610-3_56_F-1-1,A1-B002771-3_39_F-1-1,0.072,0.24519,bladder cell,bladder cell
3,A1-B000610-3_56_F-1-1,A1-D041914-3_8_M-1-1,0.098,0.454728,bladder cell,bladder cell
4,A1-B000610-3_56_F-1-1,A1-D042253-3_9_M-1-1,0.09,0.231834,bladder cell,bladder cell


In [3]:
# Directory holding the per-cell sourmash signature (.sig) files read below.
# NOTE: it already ends in "/", and later cells join with another "/".
folder = '/home/olga/pureScratch/olgabot-maca/facs/sourmash/'


In [87]:
import json

from IPython.lib.pretty import pretty

# Read one signature file as plain JSON to inspect its raw structure.
with open(f"{folder}/A1-B000610-3_56_F-1-1_S28.sig") as f:
    signature_data = json.load(f)

# Cap every list at 10 items so the long 'mins'/'abundances' arrays stay readable.
print(pretty(signature_data, max_seq_length=10))

[{'class': 'sourmash_signature',
  'email': '',
  'filename': '/arg/2/0',
  'hash_function': '0.murmur64',
  'license': 'CC0',
  'name': 'A1-B000610-3_56_F-1-1_S28',
  'signatures': [{'abundances': [5, 1, 14, 32, 8, 2, 1, 1, 46, 1, ...],
    'ksize': 21,
    'max_hash': 0,
    'md5sum': '280b9babf31a3b7f820f9695f948808c',
    'mins': [321797371588609,
     224101973024771,
     113000222478351,
     1333158846758931,
     1102392644880415,
     587674942934704,
     586113718319139,
     278315472003108,
     875509323118629,
     399207939020840,
     ...],
    'molecule': 'DNA',
    'num': 500,
    'seed': 42},
   {'abundances': [7, 11, 1, 2, 1, 1, 2, 8, 5, 1, ...],
    'ksize': 31,
    'max_hash': 0,
    'md5sum': '9d82f6c550e83cdd97a9190e2fa7d1f0',
    'mins': [1098995281889281,
     494274597793794,
     805332503111685,
     944591797540874,
     433029384222735,
     777578192238617,
     815900320305157,
     729433322275847,
     593759110850609,
     823805458110523,
     ...

In [81]:
from IPython.display import JSON

# Render the same raw signature data with IPython's JSON display object.
JSON(signature_data)

<IPython.core.display.JSON object>

In [4]:
# bladder cell
cell1 = 'A1-B000610-3_56_F-1-1'

# Signature selection parameters shared by both cells.
ksize = 21
moltype = 'DNA'


# The .sig file holds several signatures (ksize 21 and 31 are visible in the
# raw JSON above); keep the first one matching ksize and moltype.
# The file name carries an extra "_S28" suffix that is not part of `cell1`.
sig1 = list(sig.load_signatures(f"{folder}/A1-B000610-3_56_F-1-1_S28.sig", ksize=ksize, 
                                select_moltype=moltype))[0]
sig1

SourmashSignature('A1-B000610-3_56_F-1-1_S28', 280b9bab)

In [5]:
# bladder urothelial cell
cell2 = 'A1-B002764-3_38_F-1-1'

# Same selection as for sig1: first signature matching ksize and moltype.
# The file name carries an extra "_S291" suffix that is not part of `cell2`.
sig2 = list(sig.load_signatures(f"{folder}/A1-B002764-3_38_F-1-1_S291.sig", ksize=ksize, 
                                select_moltype=moltype))[0]
sig2

SourmashSignature('A1-B002764-3_38_F-1-1_S291', ef185816)

In [6]:
# Series indexed by hash value whose values are the abundances in cell1's sketch.
sig1_hashes = pd.Series(sig1.minhash.get_mins(with_abundance=True), name=cell1)
sig1_hashes.head()

3282762037724      2
5959368950693      1
7215285795798     20
11329527245923     1
12290763577842     1
Name: A1-B000610-3_56_F-1-1, dtype: int64

In [7]:
# Series indexed by hash value whose values are the abundances in cell2's sketch.
sig2_hashes = pd.Series(sig2.minhash.get_mins(with_abundance=True), name=cell2)
sig2_hashes.head()

4319888920071     1
11897683056067    4
18288931836992    1
27514404317028    1
28801856705165    1
Name: A1-B002764-3_38_F-1-1, dtype: int64

In [8]:
# Subtraction aligns the two Series on hash value; hashes present in only one
# cell become NaN and are dropped, leaving the abundance difference
# (cell1 - cell2) for the hashes the two cells share.
hash_diff = (sig1_hashes - sig2_hashes).dropna()
hash_diff

171794956858086        1.0
394031160505449        0.0
513049510280006        5.0
635563321059591       54.0
734478954482604        0.0
790330283259107        0.0
860437213521678       -9.0
936637619733676       -4.0
937624453106451    -4675.0
1012839673718406    -240.0
1026077759632952    -276.0
1067106972354716       0.0
1215285019941982       0.0
1225906413461687       0.0
dtype: float64

In [65]:
# Put both cells' abundances side by side, indexed by hash, and keep only the
# hashes found in both cells (rows with a NaN on either side are dropped).
combined_hashes = pd.concat([sig1_hashes, sig2_hashes], axis='columns').dropna()
print(combined_hashes.shape)
combined_hashes

(14, 2)


Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
171794956858086,2.0,1.0
394031160505449,1.0,1.0
513049510280006,6.0,1.0
635563321059591,55.0,1.0
734478954482604,1.0,1.0
790330283259107,1.0,1.0
860437213521678,1.0,10.0
936637619733676,3.0,7.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0


In [66]:
# Save the shared-hash abundance table (hash index plus one column per cell).
combined_hashes.to_csv("bladder_combined_hashes_table.csv")

In [68]:
# Save only the shared hash values, one per line, with no header or row index.
pd.Series(combined_hashes.index).to_csv('bladder_combined_hashes.txt', index=False, header=False)

In [10]:
# Keep the shared hashes whose abundance differs by more than 50 between the
# two cells (the threshold of 50 is hard-coded here).
hashes_of_interest = combined_hashes.loc[hash_diff.abs() > 50]
hashes_of_interest

Unnamed: 0,A1-B000610-3_56_F-1-1,A1-B002764-3_38_F-1-1
635563321059591,55.0,1.0
937624453106451,301.0,4976.0
1012839673718406,5.0,245.0
1026077759632952,39.0,315.0


In [63]:
# Save the abundance table restricted to the hashes of interest.
hashes_of_interest.to_csv('bladder_hashes_different_abundances_table.csv')

In [64]:
pwd

'/home/olga/code/kmer-hashing/kh-analysis/notebooks'

In [69]:
# Output path; also used by the following `! cat $txt` cell.
txt = "bladder_hashes_different_abundances.txt"

# Write just the hash values of interest, one per line, no header or row index.
pd.Series(hashes_of_interest.index).to_csv(txt, index=False, header=False)

In [62]:
# Print the file that was just written to confirm its contents.
! cat $txt

635563321059591
937624453106451
1012839673718406
1026077759632952


In [11]:
# Table of FASTQ locations: index is sample_id, columns are read1 / read2.
# NOTE: the name `fastqs` is later rebound to a list of local paths.
fastqs = pd.read_csv("s3://czb-maca/Plate_seq/3_month/fastqs_read1_read2.csv", index_col=0)
print(fastqs.shape)
fastqs.head()

(53760, 2)


Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000126-3_39_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000127-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B000167-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000168-3_57_F-1-1,s3://czb-maca/Plate_seq/3_month/170925_A00111_...,s3://czb-maca/Plate_seq/3_month/170925_A00111_...
A1-B000412-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


In [12]:
# Rows of the FASTQ table for the two cells being compared.
# Requires `fastqs` to still be the DataFrame (not the later list of paths).
these_cell_fastqs = fastqs.loc[[cell1, cell2]]
these_cell_fastqs

Unnamed: 0_level_0,read1,read2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
A1-B000610-3_56_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...
A1-B002764-3_38_F-1-1,s3://czb-maca/Plate_seq/3_month/170914_A00111_...,s3://czb-maca/Plate_seq/3_month/170914_A00111_...


### Copy fastqs over

In [14]:
# Local directory that receives the FASTQ files.
fastq_dir = '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/'

# .values.flatten() walks every read1/read2 entry of both cells.
for fastq in these_cell_fastqs.values.flatten():
    print(fastq)
    # Shell escape: $fastq and $fastq_dir are interpolated unquoted into the command.
    ! aws s3 cp $fastq $fastq_dir

# List the destination directory to confirm the downloads.
! ls $fastq_dir

s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
download: s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz to ../../../../pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
s3://czb-maca/Plate_seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B0027

## Extract reads containing the hashes of interest

In [16]:
# Collect the local FASTQ paths.
# NOTE: this rebinds `fastqs` from the DataFrame loaded earlier to a list of
# strings, so the earlier `fastqs.loc[...]` cell cannot be re-run after this one.
fastqs = glob.glob(f"{fastq_dir}*")
# Not the last expression of the cell, so this length is never displayed.
len(fastqs)
fastqs

['/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R1_001.fastq.gz',
 '/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R2_001.fastq.gz']

## Check if each sequence contains hashes, then extract them

In [37]:
%%time
query_hashes = set(hashes_of_interest.index)
minhash = sig1.minhash.copy_and_clear()


# now, iterate over the input sequences and output those that add
# hashes!
n = 0
m = 0

NOTIFY_EVERY_BP=1e6

all_hashes_of_interest_to_kmer = {}

watermark = NOTIFY_EVERY_BP

with open("bladder_test_cells_different_hashes.fasta", 'w') as f:

    for fastq in fastqs:
        sample_id = os.path.basename(fastq).split("_R")[0]
        for record in screed.open(fastq):
            minhash = sig1.minhash.copy_and_clear()

            n += len(record.sequence)
            while n >= watermark:
                sys.stderr.write('... {} {}\r'.format(watermark, fastq))
                watermark += NOTIFY_EVERY_BP

            minhash.add_sequence(record.sequence, force=True)
            hashes = set(minhash.get_hashes())
            match = hashes.intersection(query_hashes)
            if match:
                n_kmers = len(record.sequence) - ksize + 1
                kmers = [record.sequence[i:(i+ksize)] for i in range(n_kmers)]
                hashed_kmers = [hash_murmur(kmer) for kmer in kmers]
                hash_to_kmer = dict(zip(hashed_kmers, kmers))

                hashes_of_interest_to_kmer = {h: kmer for h, kmer in kmer_to_hash.items() if h in query_hashes}
                if len(hashes_of_interest_to_kmer) != len(match):
                    print("Something is weird! Not all the hashes were found...")
                    print(match)
                    break
                all_hashes_of_interest_to_kmer.update(hashes_of_interest_to_kmer)
            if len(all_hashes_of_interest_to_kmer) == len(query_hashes):
                print("Found all hashes! Exiting.")
                break

Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
Something is weird! Not all the hashes were found...
{937624453106451}
CPU times: user 364 ms, sys: 0 ns, total: 364 ms
Wall time: 371 ms


In [39]:
# hash -> k-mer mapping as a Series. `hash_to_kmer` is reassigned for every
# matching read in the scanning cell, so this reflects only the last such read.
hash_to_kmer_series = pd.Series(hash_to_kmer)
hash_to_kmer_series

9882986690890161194     AGCAGGGGGCGTCCCTATACG
16742629904472777585    GCAGGGGGCGTCCCTATACGC
9725330995545589343     CAGGGGGCGTCCCTATACGCG
12046988543056069749    AGGGGGCGTCCCTATACGCGA
9044752809925671151     GGGGGCGTCCCTATACGCGAT
                                ...          
6682627613489612120     CGAAGCAGCCTCAATATCTAA
14734515817039671978    GAAGCAGCCTCAATATCTAAC
5343364941329140042     AAGCAGCCTCAATATCTAACA
6531532663094830506     AGCAGCCTCAATATCTAACAC
16249662080779570112    GCAGCCTCAATATCTAACACG
Length: 80, dtype: object

In [57]:
# Number of k-mers extracted from the last matching read
# (len(sequence) - ksize + 1).
len(kmers)

80

In [59]:
# kmers

In [42]:
# Hashes computed by hand with hash_murmur that also appear in the sketch
# built from the same read.
manual_hash_with_minhash = set(hashed_kmers).intersection(minhash.get_hashes())
len(manual_hash_with_minhash)

48

In [45]:
# K-mers whose manually computed hash also appears in the read's sketch.
intersecting_hashes = hash_to_kmer_series[list(manual_hash_with_minhash)]
intersecting_hashes.head()

2.267453e+18    CCCTATACGCGATGGATCGAG
4.345891e+18    CCTATACGCGATGGATCGAGG
1.434296e+19    AGACACTTCGTGCCGACACGA
1.013144e+19    CATGACTGGACACTGCATCGG
7.240080e+18    GGCGTCCCTATACGCGATGGA
dtype: object

In [46]:
# Sort by value, i.e. alphabetically by k-mer string (not by hash).
intersecting_hashes.sort_values()

3.900669e+17    AAGACACTTCGTGCCGACACG
5.343365e+18    AAGCAGCCTCAATATCTAACA
4.062782e+18    ACACGAAGCAGCCTCAATATC
1.197956e+19    ACACTTCGTGCCGACACGAAG
1.652205e+19    ACGAAGCAGCCTCAATATCTA
7.878499e+18    ACGCGATGGATCGAGGTCATG
1.458269e+19    ACTGCATCGGAAGACACTTCG
1.201782e+18    ACTGGACACTGCATCGGAAGA
6.856428e+18    ACTTCGTGCCGACACGAAGCA
1.434296e+19    AGACACTTCGTGCCGACACGA
6.531533e+18    AGCAGCCTCAATATCTAACAC
9.882987e+18    AGCAGGGGGCGTCCCTATACG
1.204699e+19    AGGGGGCGTCCCTATACGCGA
7.272113e+18    AGGTCATGACTGGACACTGCA
4.573475e+18    ATACGCGATGGATCGAGGTCA
9.376245e+14    ATCGAGGTCATGACTGGACAC
1.102780e+19    ATCGGAAGACACTTCGTGCCG
1.486561e+19    ATGACTGGACACTGCATCGGA
1.081915e+19    ATGGATCGAGGTCATGACTGG
1.438740e+19    CACTGCATCGGAAGACACTTC
8.438121e+18    CACTTCGTGCCGACACGAAGC
9.725331e+18    CAGGGGGCGTCCCTATACGCG
2.033339e+18    CATCGGAAGACACTTCGTGCC
1.013144e+19    CATGACTGGACACTGCATCGG
2.267453e+18    CCCTATACGCGATGGATCGAG
4.345891e+18    CCTATACGCGATGGATCGAGG
6.682628e+18

In [50]:
# Check the length of this one k-mer (compare with ksize).
len('TGGACACTGCATCGGAAGACA')

21

In [47]:
# Sequence of the last read visited by the scanning loop.
record.sequence

'AGCAGGGGGCGTCCCTATACGCGATGGATCGAGGTCATGACTGGACACTGCATCGGAAGACACTTCGTGCCGACACGAAGCAGCCTCAATATCTAACACG'

In [53]:
# One-column frame of the matching k-mers, indexed by hash; the unnamed
# Series column (0) is given the name 'kmer'.
intersecting_hashes_df = intersecting_hashes.to_frame().rename(columns={0: 'kmer'})
intersecting_hashes_df.head()

Unnamed: 0,kmer
2.267453e+18,CCCTATACGCGATGGATCGAG
4.345891e+18,CCTATACGCGATGGATCGAGG
1.434296e+19,AGACACTTCGTGCCGACACGA
1.013144e+19,CATGACTGGACACTGCATCGG
7.24008e+18,GGCGTCCCTATACGCGATGGA


In [54]:
# Position (0-based) at which each k-mer starts within the read;
# str.find yields -1 for a k-mer that does not occur in it.
intersecting_hashes_df['start_index'] = intersecting_hashes_df['kmer'].map(record.sequence.find)
intersecting_hashes_df.head()

Unnamed: 0,kmer,start_index
2.267453e+18,CCCTATACGCGATGGATCGAG,12
4.345891e+18,CCTATACGCGATGGATCGAGG,13
1.434296e+19,AGACACTTCGTGCCGACACGA,57
1.013144e+19,CATGACTGGACACTGCATCGG,35
7.24008e+18,GGCGTCCCTATACGCGATGGA,7


In [56]:
# Read length in bases.
len(record.sequence)

100

In [55]:
# Order the matching k-mers by where they start in the read, which makes it
# easy to see which start positions are missing.
intersecting_hashes_df.sort_values('start_index')

Unnamed: 0,kmer,start_index
9.882987e+18,AGCAGGGGGCGTCCCTATACG,0
1.674263e+19,GCAGGGGGCGTCCCTATACGC,1
9.725331e+18,CAGGGGGCGTCCCTATACGCG,2
1.204699e+19,AGGGGGCGTCCCTATACGCGA,3
7.24008e+18,GGCGTCCCTATACGCGATGGA,7
1.766957e+19,CGTCCCTATACGCGATGGATC,9
2.411537e+18,TCCCTATACGCGATGGATCGA,11
2.267453e+18,CCCTATACGCGATGGATCGAG,12
4.345891e+18,CCTATACGCGATGGATCGAGG,13
4.573475e+18,ATACGCGATGGATCGAGGTCA,16


In [49]:
# Print the start position of every matching k-mer within the read.
# NOTE: the loop variable rebinds the notebook-level name `kmer`.
for kmer in intersecting_hashes.values:
    print(record.sequence.find(kmer))

12
13
57
35
7
27
26
11
30
46
50
18
78
25
0
76
23
60
42
36
53
74
40
77
68
31
58
41
51
61
55
16
75
19
45
2
20
72
70
9
56
37
39
1
52
59
3
54


In [36]:
# manual_hash_with_minhash

In [32]:
# Number of distinct hashes in the sketch of the last read scanned.
len(set(minhash.get_hashes()))

80

In [34]:
# Add the same read to the sketch again and recount; the recorded outputs
# show the same number of distinct hashes before and after.
minhash.add_sequence(record.sequence, force=True)
len(set(minhash.get_hashes()))

80

In [31]:
# Number of distinct manually computed k-mer hashes, for comparison with the
# sketch's hash count.
len(set(hashed_kmers))

80

In [25]:
# Per-read mapping built inside the scanning cell; empty in the recorded output.
hashes_of_interest_to_kmer

{}

In [26]:
# NOTE(review): `kmer_to_hash` is not assigned in any cell visible here;
# presumably it is left over from a deleted or out-of-order cell. Confirm
# before relying on it.
kmer_to_hash

{'GTATCAACAGGAACATCCTCG': 11332569200613985422,
 'TATCAACAGGAACATCCTCGT': 13631614553446097611,
 'ATCAACAGGAACATCCTCGTG': 2872127591073786859,
 'TCAACAGGAACATCCTCGTGT': 7665439403394214361,
 'CAACAGGAACATCCTCGTGTT': 10755392401481311365,
 'AACAGGAACATCCTCGTGTTA': 16116729706801105248,
 'ACAGGAACATCCTCGTGTTAG': 4928222811555832491,
 'CAGGAACATCCTCGTGTTAGA': 497161709746860352,
 'AGGAACATCCTCGTGTTAGAT': 17807558691675200061,
 'GGAACATCCTCGTGTTAGATA': 10713124010397424574,
 'GAACATCCTCGTGTTAGATAT': 3812529855415123365,
 'AACATCCTCGTGTTAGATATT': 15055893384930568398,
 'ACATCCTCGTGTTAGATATTG': 15090075336066576766,
 'CATCCTCGTGTTAGATATTGA': 9761570019713904088,
 'ATCCTCGTGTTAGATATTGAG': 4424664125813896497,
 'TCCTCGTGTTAGATATTGAGG': 3215167914112085868,
 'CCTCGTGTTAGATATTGAGGC': 16533125832238829217,
 'CTCGTGTTAGATATTGAGGCT': 5199017375569319483,
 'TCGTGTTAGATATTGAGGCTG': 11855670425143298460,
 'CGTGTTAGATATTGAGGCTGC': 5401960134113432925,
 'GTGTTAGATATTGAGGCTGCT': 16924423555563794259,
 'T