In [1]:
import sourmash
from collections import Counter

In [2]:
ls

Snakefile
VC2010_2.0_Dec2022.fasta.gz
VC2010_2.0_Dec2022.fasta.gz.sig
c_elegans.PRJEB28388.WS287.genomic.fa.gz
c_elegans.PRJEB28388.WS287.genomic.fa.gz.sig
c_elegans.PRJNA13758.WS287.genomic.fa.gz
c_elegans.PRJNA13758.WS287.genomic.fa.gz.sig
diff-kmers.ipynb


In [3]:
ksize = 31

# Load the three sketches in a fixed order: #1 = PRJNA13758, #2 = PRJEB28388,
# #3 = VC2010 2.0.
sig_1, sig_2, sig_3 = (
    sourmash.load_one_signature(sig_path, ksize=ksize)
    for sig_path in (
        'c_elegans.PRJNA13758.WS287.genomic.fa.gz.sig',
        'c_elegans.PRJEB28388.WS287.genomic.fa.gz.sig',
        'VC2010_2.0_Dec2022.fasta.gz.sig',
    )
)

# The abundance comparison below needs per-hash counts in every sketch.
assert all(s.minhash.track_abundance for s in (sig_1, sig_2, sig_3))

# All sketches must share one scaled value, otherwise their hash sets are
# not directly comparable.
scaled = sig_1.minhash.scaled
assert all(s.minhash.scaled == scaled for s in (sig_2, sig_3))

In [4]:
# Sanity check: size of the sketch attached to signature #1 (shown as the
# cell output).
mh = sig_1.minhash
len(mh)


939374

In [5]:
def find_diff_abund(mh1, mh2, progress_every=10000):
    """Compare per-hash abundances of ``mh1`` against ``mh2``.

    The comparison is one-directional: only hashes present in ``mh1`` are
    examined, so hashes that occur solely in ``mh2`` are not reported.

    Parameters
    ----------
    mh1, mh2
        Objects exposing a ``hashes`` mapping of hash value -> abundance.
    progress_every : int or None, optional
        Print a progress marker and a sample hash every this many hashes of
        ``mh1``. The default (10000) reproduces the original output; pass
        ``None`` or ``0`` to run silently.

    Returns
    -------
    in_1_only : dict
        hash -> abundance in ``mh1`` for hashes absent from ``mh2``.
    diff_abund_1 : dict
        hash -> (abundance in ``mh1`` minus abundance in ``mh2``) for hashes
        present in both with differing abundances. Positive values mean the
        hash is more abundant in ``mh1``.
    """
    in_1_only = {}
    diff_abund_1 = {}

    h1 = mh1.hashes
    h2 = mh2.hashes

    for n, (k, abund1) in enumerate(h1.items()):
        # Guard on truthiness first so None/0 disables progress output
        # instead of raising on the modulo.
        if progress_every and n % progress_every == 0:
            print('...', n)
            print(k, abund1, k in h2, h2.get(k))
        if k in h2:
            abund2 = h2[k]
            if abund1 != abund2:
                diff_abund_1[k] = abund1 - abund2
        else:
            in_1_only[k] = abund1

    return in_1_only, diff_abund_1



In [6]:
# Compare #3 against #2: differences are computed as (#3 count) - (#2 count).
in_3_only, diff_abund_3 = find_diff_abund(mh1=sig_3.minhash, mh2=sig_2.minhash)

... 0
192756907392 1 True 1
... 10000
1931940918799916 1 True 1
... 20000
3873773213364650 1 True 1
... 30000
5837051504262474 1 True 1
... 40000
7815872606851564 1 True 1
... 50000
9728795872874795 1 True 1
... 60000
11692788122321311 1 True 1
... 70000
13636761436420096 1 True 1
... 80000
15609017637309739 1 True 1
... 90000
17591181170697941 1 True 1
... 100000
19566467239711786 1 True 1
... 110000
21534925485610695 1 True 1
... 120000
23482045736348901 1 True 1
... 130000
25446217329956189 1 True 1
... 140000
27442290591624285 1 True 1
... 150000
29382071596536919 1 True 1
... 160000
31330155430025470 1 True 1
... 170000
33274534840303462 1 True 1
... 180000
35218870332199322 1 True 1
... 190000
37174718028925714 1 True 1
... 200000
39110708212260599 1 True 1
... 210000
41077520710997889 1 True 1
... 220000
43024453002076143 1 True 1
... 230000
44967534204047842 1 True 1
... 240000
46941298384521699 1 True 1
... 250000
48878587997971552 1 True 1
... 260000
50821700809176058 1 True 

## Estimates of differentially present k-mers

In [7]:
# Hashes seen in #3 that do not occur in #2 at all.
print(
    f"There are {len(in_3_only)} hashes only present in #3 vs #2",
    f"(Multiply by {scaled} to get a close estimate of # of k-mers.)",
    sep="\n",
)

There are 1711 hashes only present in #3 vs #2
(Multiply by 100 to get a close estimate of # of k-mers.)


In [8]:
# Hashes shared by #3 and #2 whose abundances disagree.
print(
    f"There are {len(diff_abund_3)} hashes present in #3 vs #2 that have different counts",
    f"(Multiply by {scaled} to get a close estimate of # of k-mers.)",
    sep="\n",
)

There are 1666 hashes present in #3 vs #2 that have different counts
(Multiply by 100 to get a close estimate of # of k-mers.)


## Differentially abundant k-mer counts

Here is the distribution of the count differences (count in #3 minus count in #2) for hashes present in both with different counts.

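Each output line is "difference number-of-hashes". For example, the output "2 298" means that there are 298 hashes (sampled k-mers) whose count is 2 higher in #3 than in #2; a negative difference means the count is higher in #2.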

In [9]:
# Histogram of abundance differences: difference value -> number of hashes
# showing that difference.
c = Counter(diff_abund_3.values())

In [10]:
# Print "difference count" pairs ordered from most negative to most positive.
keys = sorted(c)
for k in keys:
    print(k, c[k])

-1762 1
-161 1
-77 1
-71 1
-54 1
-47 1
-41 1
-32 1
-23 1
-22 1
-11 1
-9 2
-8 1
-7 3
-6 3
-5 1
-4 7
-3 1
-2 9
-1 49
1 749
2 298
3 84
4 27
5 22
6 17
7 39
8 12
9 10
10 14
11 11
12 6
13 1
14 5
15 4
16 4
17 9
18 4
19 4
20 4
21 5
22 2
23 3
24 4
25 5
26 5
27 3
28 4
29 3
30 1
31 4
33 1
34 4
35 1
36 1
37 1
38 2
39 1
40 4
42 27
43 58
45 2
48 1
50 2
51 1
52 1
53 2
54 3
55 1
56 2
57 1
58 1
59 2
61 1
62 1
63 3
65 1
66 2
68 2
69 2
70 1
71 1
72 1
73 1
75 3
76 1
77 3
78 1
79 2
80 2
85 2
86 3
95 1
97 2
102 2
106 1
114 1
115 1
119 1
120 1
125 1
127 1
129 1
137 1
139 1
141 1
144 1
145 1
146 1
151 1
158 1
176 1
178 1
181 1
182 1
183 1
184 1
185 1
192 1
194 1
202 1
204 1
207 1
212 1
223 1
228 1
229 1
233 1
237 1
253 1
275 1
284 1
285 1
291 1
298 1
318 1
367 1
386 1
425 1
470 1
524 1
547 1
602 1
614 1
645 1
732 1
737 1
743 1
769 1
1120 1
1155 1
1358 1
1561 1
1714 1
