In [1]:
folder = '/home/olga/data_sm/kmer-hashing/classify_coding_vs_noncoding/'

In [2]:
cd $folder

/home/seqbot/ibm_sm/olga/kmer-hashing/classify_coding_vs_noncoding


In [30]:
ls -lha

total 239G
drwxrwxr-x 2 olga olga 4.0K Sep 18 05:33 [0m[01;34m.[0m/
drwxrwxr-x 5 olga olga 4.0K Sep 14 08:39 [01;34m..[0m/
-rw-rw-r-- 1 olga olga  21M Sep 14 06:46 [01;31mHomo_sapiens.GRCh38.cds.all.fa.gz[0m
-rw-rw-r-- 1 olga olga  16M Sep 14 06:46 [01;31mHomo_sapiens.GRCh38.ncrna.fa.gz[0m
-rw-rw-r-- 1 olga olga  14M Sep 14 06:46 [01;31mHomo_sapiens.GRCh38.pep.all.fa.gz[0m
-rw-rw-r-- 1 olga olga  15G Sep 14 07:11 human_cds_k15.nodegraph
-rw-rw-r-- 1 olga olga   65 Sep 14 07:11 human_cds_k15.nodegraph.info
-rw-rw-r-- 1 olga olga  15G Sep 14 07:22 human_cds_k17.nodegraph
-rw-rw-r-- 1 olga olga   65 Sep 14 07:22 human_cds_k17.nodegraph.info
-rw-rw-r-- 1 olga olga  15G Sep 14 07:30 human_cds_k19.nodegraph
-rw-rw-r-- 1 olga olga   65 Sep 14 07:30 human_cds_k19.nodegraph.info
-rw-rw-r-- 1 olga olga  15G Sep 14 07:39 human_cds_k21.nodegraph
-rw-rw-r-- 1 olga olga   65 Sep 14 07:39 human_cds_k21.nodegraph.info
-rw-rw-r-- 1 olga olga  15G Sep 14 07:47 human_cds_k23.nodegraph
-rw-rw-r

In [32]:
import gzip

from khmer import Nodegraph
import screed
from sourmash._minhash import hash_murmur
from khmer.khmer_args import calculate_graphsize
from sourmash.logging import notify

from khtools.compare_peptide import kmerize

from Bio.Seq import Seq
from Bio import SeqIO

In [60]:
tablesize = 1e10

DEFAULT_K = 32
DEFAULT_N_TABLES = 4
DEFAULT_MAX_TABLESIZE = 1e6
DEFAULT_N_THREADS = 1

peptide_ksize = 7

# tablesize = calculate_graphsize(args, 'nodegraph', multiplier)
peptide_graph = Nodegraph(peptide_ksize, tablesize, n_tables=4)



In [61]:
20**6

64000000

In [62]:
%%time

DEFAULT_SEED = 42

for record in screed.open("Homo_sapiens.GRCh38.pep.all.fa.gz"):
#     print(record)
    if '*' in record['sequence']:
#         notify("Stop codon found in seq, skipping")
        continue
    kmers = kmerize(record['sequence'], peptide_ksize)
    for kmer in kmers:
        peptide_graph.add(kmer)        

CPU times: user 29.5 s, sys: 248 ms, total: 29.8 s
Wall time: 29.8 s


In [63]:
def three_frame_translation(seq):
    for frame in range(3):
        translation = seq[frame:].translate()
        yield translation
        
def three_frame_translation_no_stops(seq):
    return [t for t in three_frame_translation(seq) if '*' not in t]


with gzip.open('SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled.fq.gz', 'rt') as f:
    for i, record in zip(range(5), SeqIO.parse(f, 'fastq')):
        print(record.description)
        print(str(record.seq))
        print(list(three_frame_translation(record.seq)))
        print(list(three_frame_translation(record.seq[::-1])))

SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1
CGCTTGCTTAATACTGACATCAATAATATTAGGAAAATCGCAATATAACTGTAAATCCTGTTCTGTC
[Seq('RLLNTDINNIRKIAI*L*ILFC', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('ACLILTSIILGKSQYNCKSCSV', ExtendedIUPACProtein()), Seq('LA*Y*HQ*Y*ENRNITVNPVL', HasStopCodon(ExtendedIUPACProtein(), '*'))]
[Seq('LSCPKCQYNAKRIIITTVIIRS', ExtendedIUPACProtein()), Seq('CLVLNVNITLKGL**LQS*FVR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('VLS*MSI*R*KDYNNYSHNSF', HasStopCodon(ExtendedIUPACProtein(), '*'))]
SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1
TCTAGAATGTGAAATAACGTACTTCATGTGTCTTCTTACCAAAAATACCAACGATAAGGGGAAAAGCCATC
[Seq('SRM*NNVLHVSSYQKYQR*GEKP', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('LECEITYFMCLLTKNTNDKGKSH', ExtendedIUPACProtein()), Seq('*NVK*RTSCVFLPKIPTIRGKAI', HasStopCodon(ExtendedIUPACProtein(), '*'))]
[Seq('LPKRGIATIKTILLCTSCNKV*D', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('YRKGE*QP*KPFFCVLHAIKCKI', HasStopCodon(ExtendedIUPA



In [69]:

def six_frame_translation_no_stops(seq):
    forward_translations = three_frame_translation_no_stops(seq)
    reverse_translations = three_frame_translation_no_stops(seq.reverse_complement())
    return forward_translations + reverse_translations

with gzip.open('SRR306838_GSM752691_hsa_br_F_1_trimmed_subsampled.fq.gz', 'rt') as f:
    for i, record in zip(range(5), SeqIO.parse(f, 'fastq')):
        print(record.description)
        print(str(record.seq))
        translations = six_frame_translation_no_stops(record.seq)
        for translation in translations:
            print(f"\t{translation}")
            kmers = set(kmerize(str(translation), peptide_ksize))
            n_kmers = len(kmers)
            n_kmers_in_peptide_db = sum(1 for kmer in kmers if peptide_graph.get(kmer) > 0)
            kmers_in_peptide_db = {kmer:  peptide_graph.get(kmer) for kmer in kmers}
            print(f'n_kmers_in_peptide_db/n_kmers: {n_kmers_in_peptide_db}/{n_kmers} = {n_kmers_in_peptide_db/n_kmers}')

SRR306838.10559374 Ibis_Run100924_C3PO:6:51:17601:17119/1
CGCTTGCTTAATACTGACATCAATAATATTAGGAAAATCGCAATATAACTGTAAATCCTGTTCTGTC
	ACLILTSIILGKSQYNCKSCSV
n_kmers_in_peptide_db/n_kmers: 16/16 = 1.0
	TEQDLQLYCDFPNIIDVSIKQA
n_kmers_in_peptide_db/n_kmers: 16/16 = 1.0
	QNRIYSYIAIFLILLMSVLSK
n_kmers_in_peptide_db/n_kmers: 15/15 = 1.0
SRR306838.6196593 Ibis_Run100924_C3PO:6:29:16733:12435/1
TCTAGAATGTGAAATAACGTACTTCATGTGTCTTCTTACCAAAAATACCAACGATAAGGGGAAAAGCCATC
	LECEITYFMCLLTKNTNDKGKSH
n_kmers_in_peptide_db/n_kmers: 17/17 = 1.0
	WLFPLSLVFLVRRHMKYVISHSR
n_kmers_in_peptide_db/n_kmers: 17/17 = 1.0
SRR306838.20767303 Ibis_Run100924_C3PO:6:104:6864:5062/1
CAATCATCATCACTTTCTAATTCCAGAATATTTTCATCACCCCAAAAAGAAATCCTAAATCCATTAGC
	QSSSLSNSRIFSSPQKEILNPL
n_kmers_in_peptide_db/n_kmers: 16/16 = 1.0
	ANGFRISFWGDENILELESDDD
n_kmers_in_peptide_db/n_kmers: 16/16 = 1.0
SRR306838.12582274 Ibis_Run100924_C3PO:6:62:11779:17975/1
CAAAAGTGAAGACCTCCCTGGGGTCTTCAAAGACAGCCTTTGCTCTCCATGTAGCCAATGGTGCTCT
	KSEDLPGVFKDSLCSPCSQWCS



In [70]:
kmers_in_peptide_db

{'IFSEELR': 1,
 'SHSTMLS': 1,
 'STMLSIF': 1,
 'LSIFSEE': 1,
 'HSTMLSI': 1,
 'VPPSHST': 1,
 'SIFSEEL': 1,
 'PPSHSTM': 1,
 'PSHSTML': 1,
 'TMLSIFS': 1,
 'FSEELRV': 1,
 'MLSIFSE': 1}

In [65]:
peptide_graph.get("ACLILT")

ValueError: Expected k-mer length 7 but got 6.

In [None]:
record.seq.reverse_complement()