Skip to content

Commit

Permalink
Merge pull request #72 from dib-lab/feature/mutate
Browse files Browse the repository at this point in the history
New 'kevlar mutate' command
  • Loading branch information
standage committed May 10, 2017
2 parents fe2ff5f + 5116247 commit fa5595e
Show file tree
Hide file tree
Showing 10 changed files with 270 additions and 0 deletions.
1 change: 1 addition & 0 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from kevlar import collect
from kevlar import filter
from kevlar import reaugment
from kevlar import mutate
from kevlar import assemble
from kevlar import cli
from kevlar.variantset import VariantSet
Expand Down
2 changes: 2 additions & 0 deletions kevlar/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
'filter': kevlar.filter.main,
'reaugment': kevlar.reaugment.main,
'assemble': kevlar.assemble.main,
'mutate': kevlar.mutate.main,
}


Expand Down Expand Up @@ -54,5 +55,6 @@ def parser():
kevlar.filter.subparser(subparsers)
kevlar.reaugment.subparser(subparsers)
kevlar.assemble.subparser(subparsers)
kevlar.mutate.subparser(subparsers)

return parser
119 changes: 119 additions & 0 deletions kevlar/mutate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2017 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/standage/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from __future__ import print_function
from collections import defaultdict, namedtuple
import argparse
import sys
import khmer
from khmer.utils import write_record
import kevlar


Mutation = namedtuple('Mutation', 'seq pos type data')
char_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
index_to_char = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}


def subparser(subparsers):
subparser = subparsers.add_parser('mutate')
subparser.add_argument('-o', '--out', metavar='FILE', default=sys.stdout,
type=argparse.FileType('w'),
help='output file; default is terminal (stdout)')
subparser.add_argument('mutations', type=argparse.FileType('r'),
help='mutations file')
subparser.add_argument('genome', metavar='FILE', help='genome to mutate')


def load_mutations(instream, logstream):
mutations = defaultdict(list)
count = 0
for line in instream:
if line.startswith('#') or line.strip() == '':
continue
try:
sequence, offset, vartype, data = line.strip().split()
except ValueError:
raise ValueError('error parsing mutation: ' + line)
if vartype not in ['snv', 'ins', 'del', 'inv']:
raise ValueError('invalid variant type "{:s}"'.format(vartype))
mut = Mutation(seq=sequence, pos=int(offset), type=vartype, data=data)
mutations[sequence].append(mut)
count += 1
message = ' loaded {:d} mutations'.format(count)
message += ' on {:d} sequences'.format(len(mutations), file=logstream)
return mutations


def mutate_snv(sequence, mutation):
refrbase = sequence[mutation.pos]
nuclindex = char_to_index[refrbase]
newindex = nuclindex + int(mutation.data)
while newindex > 3:
newindex -= 4
while newindex < 0:
newindex += 4
newbase = index_to_char[newindex]
prefix, suffix = sequence[:mutation.pos], sequence[mutation.pos+1:]
return prefix + newbase + suffix


def mutate_insertion(sequence, mutation):
prefix, suffix = sequence[:mutation.pos], sequence[mutation.pos:]
return prefix + mutation.data + suffix


def mutate_deletion(sequence, mutation):
del_length = int(mutation.data)
prefix = sequence[:mutation.pos]
suffix = sequence[mutation.pos+del_length:]
return prefix + suffix


def mutate_inversion(sequence, mutation):
inv_length = int(mutation.data)
prefix = sequence[:mutation.pos]
suffix = sequence[mutation.pos+inv_length:]
invseq = sequence[mutation.pos+inv_length-1:mutation.pos-1:-1]
return prefix + invseq + suffix


def mutate_sequence(sequence, mutlist):
for mutation in mutlist:
mutfunc = mutation_functions[mutation.type]
sequence = mutfunc(sequence, mutation)
return sequence


def mutate_genome(infile, mutations):
parser = khmer.ReadParser(infile)
for record in parser:
sequence = record.sequence
if record.name in mutations:
mutlist = sorted(mutations[record.name], key=lambda m: m.pos,
reverse=True)
sequence = mutate_sequence(sequence, mutlist)
yield khmer.Read(name=record.name, sequence=sequence)


mutation_functions = {
'snv': mutate_snv,
'ins': mutate_insertion,
'del': mutate_deletion,
'inv': mutate_inversion,
}


def main(args):
print('[kevlar::mutate] loading mutations', file=args.logfile)
mutations = load_mutations(args.mutations, args.logfile)

print('[kevlar::mutate] mutating genome', file=args.logfile)
for record in mutate_genome(args.genome, mutations):
write_record(record, args.out)
6 changes: 6 additions & 0 deletions kevlar/tests/data/mut-genome.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>contig1
GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA
>contig2
GTCATGAACTGACTCGCACGCGCTTCGGAAATTGCCGTATGATATGAC
>contig3
AGTAGCGTCTAGTTCTCCCCCATCGAGTATTGTGGCATAAGCGGAACA
22 changes: 22 additions & 0 deletions kevlar/tests/data/mut-genome.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA
# *
# GTACGGCTATTGTCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA
contig1 11 snv 2

# GTACGGCTATTATCTGAGCTC-----AAGACTAATACGCGCTGGCCACTGGTA
# GTACGGCTATTATCTGAGCTCTTTTTAAGACTAATACGCGCTGGCCACTGGTA
contig1 21 ins TTTTT

# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGC|CACT|GGTA
# X
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGC|TCAC|GGTA
contig1 40 inv 4

# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA
# *
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGAA
contig1 46 snv 1

# AGTAGCGTCTAGTTCTCCCCCATCGAGTATTGTGGCATAAGCGGAACA
# AGT--------------------CGAGTATTGTGGCATAAGCGGAACA
contig3 3 del 20
1 change: 1 addition & 0 deletions kevlar/tests/data/muts-w.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
contig 5 slippage 10
1 change: 1 addition & 0 deletions kevlar/tests/data/muts-x.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1 441274 snv 3
5 changes: 5 additions & 0 deletions kevlar/tests/data/muts-y.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Lolly lolly lolly get your comments here!
scaffold399 685357 ins AGCTACCCCAGTGAGTCGGTAATGTGATC
scaffold982 108754 del 23

scaffold1102 260686 snv 1
2 changes: 2 additions & 0 deletions kevlar/tests/data/muts-z.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
X,2884322,inv,2766
Y,19983,ins,GATTACA
111 changes: 111 additions & 0 deletions kevlar/tests/test_mutate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2017 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/standage/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from sys import stderr
import pytest
import kevlar
from kevlar.mutate import Mutation
from kevlar.tests import data_file


def test_load_mutations_x():
instream = kevlar.open(data_file('muts-x.txt'), 'r')
mutations = kevlar.mutate.load_mutations(instream, stderr)
assert len(mutations) == 1
assert '1' in mutations
assert len(mutations['1']) == 1

mut = mutations['1'][0]
assert mut == Mutation(seq='1', pos=441274, type='snv', data='3')


def test_load_mutations_y():
instream = kevlar.open(data_file('muts-y.tsv'), 'r')
mutations = kevlar.mutate.load_mutations(instream, stderr)
assert len(mutations) == 3

assert 'scaffold399' in mutations
assert len(mutations['scaffold399']) == 1
mut = mutations['scaffold399'][0]
assert mut == Mutation(seq='scaffold399', pos=685357, type='ins',
data='AGCTACCCCAGTGAGTCGGTAATGTGATC')

assert 'scaffold982' in mutations
assert len(mutations['scaffold982']) == 1
mut = mutations['scaffold982'][0]
assert mut == Mutation(seq='scaffold982', pos=108754, type='del',
data='23')

assert 'scaffold1102' in mutations
assert len(mutations['scaffold1102']) == 1
mut = mutations['scaffold1102'][0]
assert mut == Mutation(seq='scaffold1102', pos=260686, type='snv',
data='1')


def test_load_mutations_z():
instream = kevlar.open(data_file('muts-z.csv'), 'r')
with pytest.raises(ValueError) as ve:
mutations = kevlar.mutate.load_mutations(instream, stderr)
assert 'error parsing mutation' in str(ve)


def test_mutate_snv():
mutation = Mutation(seq='contig', pos=5, type='snv', data='1')
contig = 'ACGTACGTACGT'
assert kevlar.mutate.mutate_snv(contig, mutation) == 'ACGTAGGTACGT'

mutation = Mutation(seq='contig', pos=5, type='snv', data='-1')
assert kevlar.mutate.mutate_snv(contig, mutation) == 'ACGTAAGTACGT'

mutation = Mutation(seq='contig', pos=0, type='snv', data='-1')
assert kevlar.mutate.mutate_snv(contig, mutation) == 'TCGTACGTACGT'


def test_mutate_ins():
mutation = Mutation(seq='contig', pos=5, type='ins', data='AAAA')
contig = 'ACGTACGTACGT'
mutcontig = 'ACGTAAAAACGTACGT'
assert kevlar.mutate.mutate_insertion(contig, mutation) == mutcontig


def test_mutate_del():
mutation = Mutation(seq='contig', pos=5, type='ins', data='5')
contig = 'ACGTACGTACGT'
assert kevlar.mutate.mutate_deletion(contig, mutation) == 'ACGTAGT'


def test_mutate_inv():
mutation = Mutation(seq='contig', pos=5, type='inv', data='5')
contig = 'ACGTACGTACGT'
assert kevlar.mutate.mutate_inversion(contig, mutation) == 'ACGTACATGCGT'


def test_mutate_bogus():
instream = kevlar.open(data_file('muts-w.txt'), 'r')
with pytest.raises(ValueError) as ve:
mutations = kevlar.mutate.load_mutations(instream, stderr)
assert 'invalid variant type "slippage"' in str(ve)


def test_mutate_main(capsys):
genome = data_file('mut-genome.fa')
muts = data_file('mut-genome.txt')
args = kevlar.cli.parser().parse_args(['mutate', muts, genome])
kevlar.mutate.main(args)
out, err = capsys.readouterr()

contig1 = '>contig1\nGTACGGCTATTGTCTGAGCTCTTTTTAAGACTAATACGCGCTGGCTCACGGAA'
assert contig1 in out

contig2 = '>contig2\nGTCATGAACTGACTCGCACGCGCTTCGGAAATTGCCGTATGATATGAC'
assert contig2 in out

contig3 = '>contig3\nAGTCGAGTATTGTGGCATAAGCGGAACA'
assert contig3 in out

0 comments on commit fa5595e

Please sign in to comment.