# Extract k-mers that have different abundances from otherwise similar bladder cells

In [34]:
# k-mer length for this analysis; the hashvals-to-signature.py calls below
# pass the same value through --ksize.
ksize = 21

In [36]:
# Print the command-line usage of the hashvals-to-signature.py utility script.
! /home/olga/code/sourmash/utils/hashvals-to-signature.py --help

usage: hashvals-to-signature.py [-h] [-o OUTPUT] [-k KSIZE] [--scaled SCALED]
                                [--num NUM] [--name NAME]
                                [--filename FILENAME]
                                hashfile

positional arguments:
  hashfile

optional arguments:
  -h, --help            show this help message and exit
  -o OUTPUT, --output OUTPUT
                        file to output signature to
  -k KSIZE, --ksize KSIZE
  --scaled SCALED
  --num NUM
  --name NAME           signature name
  --filename FILENAME   filename to add to signature


In [37]:
%%bash -s "$ksize"
# Build a signature file from the hash values listed in
# bladder_hashes_different_abundances.txt.
#
# IPython expands "$ksize" on the magic line above and hands it to bash as $1,
# so the k-mer size comes from the notebook's `ksize` variable instead of being
# hard-coded a second time here.
KSIZE="${1:?ksize was not forwarded from the notebook}"
PREFIX=bladder_hashes_different_abundances

/home/olga/code/sourmash/utils/hashvals-to-signature.py \
    --ksize "$KSIZE" \
    --output "${PREFIX}.sig" \
    "${PREFIX}.txt"

[Kloaded 4 distinct hashes from bladder_hashes_different_abundances.txt
[Ksetting --num automatically from the number of hashes.
[Kwrote signature to bladder_hashes_different_abundances.sig


In [39]:
# Print the command-line usage of the signature-to-kmers.py utility script.
! /home/olga/code/sourmash/utils/signature-to-kmers.py --help

usage: signature-to-kmers.py [-h] [--output-sequences OUTPUT_SEQUENCES]
                             [--output-kmers OUTPUT_KMERS]
                             query seqfiles [seqfiles ...]

positional arguments:
  query
  seqfiles

optional arguments:
  -h, --help            show this help message and exit
  --output-sequences OUTPUT_SEQUENCES
                        save matching sequences to this file.
  --output-kmers OUTPUT_KMERS
                        save matching kmers to this file.


In [40]:
%%bash
# Scan the R1 and R2 FASTQ files of cell 1 for k-mers whose hashes are in the
# signature; save the matching k-mers and the sequences that contain them.
FASTQ_DIR=/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs
CELL1=A1-B000610-3_56_F-1-1
PREFIX=bladder_hashes_different_abundances

/home/olga/code/sourmash/utils/signature-to-kmers.py \
    --output-kmers "${PREFIX}_kmers.txt" \
    --output-sequences "${PREFIX}_sequences.txt" \
    "${PREFIX}.sig" \
    "${FASTQ_DIR}/${CELL1}_R1_001.fastq.gz" \
    "${FASTQ_DIR}/${CELL1}_R2_001.fastq.gz"

[Kread 159324600 bp, wrote 41500 bp in matching sequencesate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B000610-3_56_F-1-1_R2_001.fastq.gz
[Kread 159324600 bp, found 4 kmers matching hashvals


In [41]:
# Show the kmer,hashval table that signature-to-kmers.py wrote via --output-kmers.
! cat bladder_hashes_different_abundances_kmers.txt

kmer,hashval
ATCGAGGTCATGACTGGACAC,937624453106451
AGTGGACATGAAAGATGTGGA,635563321059591
GGAGTGGGTAAATTGCGCGCC,1012839673718406
GCTCAGTTATTGACTACGAGC,1026077759632952


In [42]:
! cat bladder_hashes_different_abundances_kmers.txt | cut -f 1 -d,

kmer
ATCGAGGTCATGACTGGACAC
AGTGGACATGAAAGATGTGGA
GGAGTGGGTAAATTGCGCGCC
GCTCAGTTATTGACTACGAGC


## Since there are only 14 shared hashes anyway, look at all of them!

In [48]:
%%bash -s "$ksize"
# Build a signature file from the hash values listed in
# bladder_combined_hashes.txt.
#
# IPython expands "$ksize" on the magic line above and hands it to bash as $1,
# so the k-mer size comes from the notebook's `ksize` variable instead of being
# hard-coded a second time here.
KSIZE="${1:?ksize was not forwarded from the notebook}"
PREFIX=bladder_combined_hashes

/home/olga/code/sourmash/utils/hashvals-to-signature.py \
    --ksize "$KSIZE" \
    --output "${PREFIX}.sig" \
    "${PREFIX}.txt"

[Kloaded 14 distinct hashes from bladder_combined_hashes.txt
[Ksetting --num automatically from the number of hashes.
[Kwrote signature to bladder_combined_hashes.sig


In [53]:
%%bash
# Scan the R1 and R2 FASTQ files of both cells for k-mers whose hashes are in
# the combined signature; save the matching k-mers and the sequences that
# contain them.
FASTQ_DIR=/home/olga/pureScratch/czb-maca/Plate_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs
CELL1=A1-B000610-3_56_F-1-1
CELL2=A1-B002764-3_38_F-1-1
PREFIX=bladder_combined_hashes

/home/olga/code/sourmash/utils/signature-to-kmers.py \
    --output-kmers "${PREFIX}_kmers.txt" \
    --output-sequences "${PREFIX}_sequences.txt" \
    "${PREFIX}.sig" \
    "${FASTQ_DIR}/${CELL1}_R1_001.fastq.gz" \
    "${FASTQ_DIR}/${CELL1}_R2_001.fastq.gz" \
    "${FASTQ_DIR}/${CELL2}_R1_001.fastq.gz" \
    "${FASTQ_DIR}/${CELL2}_R2_001.fastq.gz"

[Kread 311332200 bp, wrote 1177900 bp in matching sequencese_Seq/3_month/170914_A00111_0058_AH3FYKDMXX__170914_A00111_0057_BH3FY7DMXX/fastqs/A1-B002764-3_38_F-1-1_R2_001.fastq.gz
[Kread 311332200 bp, found 14 kmers matching hashvals


In [57]:
# Print the source of signature-to-kmers.py to see how it matches k-mers to hash values.
! cat /home/olga/code/sourmash/utils/signature-to-kmers.py

#! /usr/bin/env python3
"""
Given a signature file and a collection of sequences, output all of the
k-mers and sequences that match a hashval in the signature file.

NOTE: for now, only works for DNA.
"""
import sys
import argparse
import sourmash
from sourmash import MinHash
from sourmash import sourmash_args
from sourmash._minhash import hash_murmur
import screed
import csv
from sourmash.logging import notify, error


NOTIFY_EVERY_BP=1e7


def get_kmers_for_hashvals(sequence, hashvals, ksize):
    "Return k-mers from 'sequence' that yield hashes in 'hashvals'."
    # uppercase!
    sequence = sequence.upper()

    for start in range(0, len(sequence) - ksize + 1):
        kmer = sequence[start:start + ksize]
        kmer_rc = screed.rc(kmer)
        if kmer > kmer_rc:                # choose fwd or rc
            kmer = kmer_rc

        # NOTE: we do not avoid non-ACGT characters, because those k-mers,
        # when hashed, shouldn't match anything that sourmash outputs.
        hash

In [54]:
# Show the kmer,hashval table that signature-to-kmers.py wrote for the combined hashes.
! cat bladder_combined_hashes_kmers.txt

kmer,hashval
ATCGAGGTCATGACTGGACAC,937624453106451
AGTGGACATGAAAGATGTGGA,635563321059591
GGAGTGGGTAAATTGCGCGCC,1012839673718406
GCTCAGTTATTGACTACGAGC,1026077759632952
GAGGATGGTGAAGTAAAGACC,1067106972354716
AAGAGGGTCGGCCGGGGGCAT,513049510280006
ACTGTGGTAATTCTAGAGCTG,936637619733676
ATTTTTTTTTTTTTTACTAAA,790330283259107
GTGTAATATAAAAAAAAACCA,1215285019941982
AAAAATAAAAAAAAAAACACA,171794956858086
AAAGTCGTCGGAAGATCCTGA,860437213521678
CCCAACTTCTTAGAGGGATAA,734478954482604
GTATAAGAGACAGGGGCTGTA,1225906413461687
CATGATTAAGAGTGACTGCCG,394031160505449


In [55]:
import pandas as pd

# Load the kmer,hashval table and order its rows by ascending hash value.
kmer_hashes = (
    pd.read_csv('bladder_combined_hashes_kmers.txt')
    .sort_values('hashval')
)
kmer_hashes

Unnamed: 0,kmer,hashval
9,AAAAATAAAAAAAAAAACACA,171794956858086
13,CATGATTAAGAGTGACTGCCG,394031160505449
5,AAGAGGGTCGGCCGGGGGCAT,513049510280006
1,AGTGGACATGAAAGATGTGGA,635563321059591
11,CCCAACTTCTTAGAGGGATAA,734478954482604
7,ATTTTTTTTTTTTTTACTAAA,790330283259107
10,AAAGTCGTCGGAAGATCCTGA,860437213521678
6,ACTGTGGTAATTCTAGAGCTG,936637619733676
0,ATCGAGGTCATGACTGGACAC,937624453106451
2,GGAGTGGGTAAATTGCGCGCC,1012839673718406


In [56]:
fasta = 'bladder_combined_hashes_kmers.fasta'

with open(fasta, 'w') as f:
    for i, (_, row) in enumerate(kmer_hashes.iterrows()):
        f.write(">kmer{i:02}_{hashval}\n{kmer}\n".format(i=i, **row))
! cat $fasta

>kmer00_171794956858086
AAAAATAAAAAAAAAAACACA
>kmer01_394031160505449
CATGATTAAGAGTGACTGCCG
>kmer02_513049510280006
AAGAGGGTCGGCCGGGGGCAT
>kmer03_635563321059591
AGTGGACATGAAAGATGTGGA
>kmer04_734478954482604
CCCAACTTCTTAGAGGGATAA
>kmer05_790330283259107
ATTTTTTTTTTTTTTACTAAA
>kmer06_860437213521678
AAAGTCGTCGGAAGATCCTGA
>kmer07_936637619733676
ACTGTGGTAATTCTAGAGCTG
>kmer08_937624453106451
ATCGAGGTCATGACTGGACAC
>kmer09_1012839673718406
GGAGTGGGTAAATTGCGCGCC
>kmer10_1026077759632952
GCTCAGTTATTGACTACGAGC
>kmer11_1067106972354716
GAGGATGGTGAAGTAAAGACC
>kmer12_1215285019941982
GTGTAATATAAAAAAAAACCA
>kmer13_1225906413461687
GTATAAGAGACAGGGGCTGTA
