-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #72 from dib-lab/feature/mutate
New 'kevlar mutate' command
- Loading branch information
Showing
10 changed files
with
270 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#!/usr/bin/env python | ||
# | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2017 The Regents of the University of California | ||
# | ||
# This file is part of kevlar (http://github.com/standage/kevlar) and is | ||
# licensed under the MIT license: see LICENSE. | ||
# ----------------------------------------------------------------------------- | ||
|
||
from __future__ import print_function | ||
from collections import defaultdict, namedtuple | ||
import argparse | ||
import sys | ||
import khmer | ||
from khmer.utils import write_record | ||
import kevlar | ||
|
||
|
||
Mutation = namedtuple('Mutation', 'seq pos type data') | ||
char_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3} | ||
index_to_char = {0: 'A', 1: 'C', 2: 'G', 3: 'T'} | ||
|
||
|
||
def subparser(subparsers): | ||
subparser = subparsers.add_parser('mutate') | ||
subparser.add_argument('-o', '--out', metavar='FILE', default=sys.stdout, | ||
type=argparse.FileType('w'), | ||
help='output file; default is terminal (stdout)') | ||
subparser.add_argument('mutations', type=argparse.FileType('r'), | ||
help='mutations file') | ||
subparser.add_argument('genome', metavar='FILE', help='genome to mutate') | ||
|
||
|
||
def load_mutations(instream, logstream): | ||
mutations = defaultdict(list) | ||
count = 0 | ||
for line in instream: | ||
if line.startswith('#') or line.strip() == '': | ||
continue | ||
try: | ||
sequence, offset, vartype, data = line.strip().split() | ||
except ValueError: | ||
raise ValueError('error parsing mutation: ' + line) | ||
if vartype not in ['snv', 'ins', 'del', 'inv']: | ||
raise ValueError('invalid variant type "{:s}"'.format(vartype)) | ||
mut = Mutation(seq=sequence, pos=int(offset), type=vartype, data=data) | ||
mutations[sequence].append(mut) | ||
count += 1 | ||
message = ' loaded {:d} mutations'.format(count) | ||
message += ' on {:d} sequences'.format(len(mutations), file=logstream) | ||
return mutations | ||
|
||
|
||
def mutate_snv(sequence, mutation): | ||
refrbase = sequence[mutation.pos] | ||
nuclindex = char_to_index[refrbase] | ||
newindex = nuclindex + int(mutation.data) | ||
while newindex > 3: | ||
newindex -= 4 | ||
while newindex < 0: | ||
newindex += 4 | ||
newbase = index_to_char[newindex] | ||
prefix, suffix = sequence[:mutation.pos], sequence[mutation.pos+1:] | ||
return prefix + newbase + suffix | ||
|
||
|
||
def mutate_insertion(sequence, mutation): | ||
prefix, suffix = sequence[:mutation.pos], sequence[mutation.pos:] | ||
return prefix + mutation.data + suffix | ||
|
||
|
||
def mutate_deletion(sequence, mutation): | ||
del_length = int(mutation.data) | ||
prefix = sequence[:mutation.pos] | ||
suffix = sequence[mutation.pos+del_length:] | ||
return prefix + suffix | ||
|
||
|
||
def mutate_inversion(sequence, mutation): | ||
inv_length = int(mutation.data) | ||
prefix = sequence[:mutation.pos] | ||
suffix = sequence[mutation.pos+inv_length:] | ||
invseq = sequence[mutation.pos+inv_length-1:mutation.pos-1:-1] | ||
return prefix + invseq + suffix | ||
|
||
|
||
def mutate_sequence(sequence, mutlist): | ||
for mutation in mutlist: | ||
mutfunc = mutation_functions[mutation.type] | ||
sequence = mutfunc(sequence, mutation) | ||
return sequence | ||
|
||
|
||
def mutate_genome(infile, mutations): | ||
parser = khmer.ReadParser(infile) | ||
for record in parser: | ||
sequence = record.sequence | ||
if record.name in mutations: | ||
mutlist = sorted(mutations[record.name], key=lambda m: m.pos, | ||
reverse=True) | ||
sequence = mutate_sequence(sequence, mutlist) | ||
yield khmer.Read(name=record.name, sequence=sequence) | ||
|
||
|
||
mutation_functions = { | ||
'snv': mutate_snv, | ||
'ins': mutate_insertion, | ||
'del': mutate_deletion, | ||
'inv': mutate_inversion, | ||
} | ||
|
||
|
||
def main(args): | ||
print('[kevlar::mutate] loading mutations', file=args.logfile) | ||
mutations = load_mutations(args.mutations, args.logfile) | ||
|
||
print('[kevlar::mutate] mutating genome', file=args.logfile) | ||
for record in mutate_genome(args.genome, mutations): | ||
write_record(record, args.out) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
>contig1 | ||
GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA | ||
>contig2 | ||
GTCATGAACTGACTCGCACGCGCTTCGGAAATTGCCGTATGATATGAC | ||
>contig3 | ||
AGTAGCGTCTAGTTCTCCCCCATCGAGTATTGTGGCATAAGCGGAACA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA | ||
# * | ||
# GTACGGCTATTGTCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA | ||
contig1 11 snv 2 | ||
|
||
# GTACGGCTATTATCTGAGCTC-----AAGACTAATACGCGCTGGCCACTGGTA | ||
# GTACGGCTATTATCTGAGCTCTTTTTAAGACTAATACGCGCTGGCCACTGGTA | ||
contig1 21 ins TTTTT | ||
|
||
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGC|CACT|GGTA | ||
# X | ||
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGC|TCAC|GGTA | ||
contig1 40 inv 4 | ||
|
||
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGTA | ||
# * | ||
# GTACGGCTATTATCTGAGCTCAAGACTAATACGCGCTGGCCACTGGAA | ||
contig1 46 snv 1 | ||
|
||
# AGTAGCGTCTAGTTCTCCCCCATCGAGTATTGTGGCATAAGCGGAACA | ||
# AGT--------------------CGAGTATTGTGGCATAAGCGGAACA | ||
contig3 3 del 20 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
contig 5 slippage 10 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
1 441274 snv 3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Lolly lolly lolly get your comments here! | ||
scaffold399 685357 ins AGCTACCCCAGTGAGTCGGTAATGTGATC | ||
scaffold982 108754 del 23 | ||
|
||
scaffold1102 260686 snv 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
X,2884322,inv,2766 | ||
Y,19983,ins,GATTACA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/usr/bin/env python | ||
# | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2017 The Regents of the University of California | ||
# | ||
# This file is part of kevlar (http://github.com/standage/kevlar) and is | ||
# licensed under the MIT license: see LICENSE. | ||
# ----------------------------------------------------------------------------- | ||
|
||
from sys import stderr | ||
import pytest | ||
import kevlar | ||
from kevlar.mutate import Mutation | ||
from kevlar.tests import data_file | ||
|
||
|
||
def test_load_mutations_x(): | ||
instream = kevlar.open(data_file('muts-x.txt'), 'r') | ||
mutations = kevlar.mutate.load_mutations(instream, stderr) | ||
assert len(mutations) == 1 | ||
assert '1' in mutations | ||
assert len(mutations['1']) == 1 | ||
|
||
mut = mutations['1'][0] | ||
assert mut == Mutation(seq='1', pos=441274, type='snv', data='3') | ||
|
||
|
||
def test_load_mutations_y(): | ||
instream = kevlar.open(data_file('muts-y.tsv'), 'r') | ||
mutations = kevlar.mutate.load_mutations(instream, stderr) | ||
assert len(mutations) == 3 | ||
|
||
assert 'scaffold399' in mutations | ||
assert len(mutations['scaffold399']) == 1 | ||
mut = mutations['scaffold399'][0] | ||
assert mut == Mutation(seq='scaffold399', pos=685357, type='ins', | ||
data='AGCTACCCCAGTGAGTCGGTAATGTGATC') | ||
|
||
assert 'scaffold982' in mutations | ||
assert len(mutations['scaffold982']) == 1 | ||
mut = mutations['scaffold982'][0] | ||
assert mut == Mutation(seq='scaffold982', pos=108754, type='del', | ||
data='23') | ||
|
||
assert 'scaffold1102' in mutations | ||
assert len(mutations['scaffold1102']) == 1 | ||
mut = mutations['scaffold1102'][0] | ||
assert mut == Mutation(seq='scaffold1102', pos=260686, type='snv', | ||
data='1') | ||
|
||
|
||
def test_load_mutations_z(): | ||
instream = kevlar.open(data_file('muts-z.csv'), 'r') | ||
with pytest.raises(ValueError) as ve: | ||
mutations = kevlar.mutate.load_mutations(instream, stderr) | ||
assert 'error parsing mutation' in str(ve) | ||
|
||
|
||
def test_mutate_snv(): | ||
mutation = Mutation(seq='contig', pos=5, type='snv', data='1') | ||
contig = 'ACGTACGTACGT' | ||
assert kevlar.mutate.mutate_snv(contig, mutation) == 'ACGTAGGTACGT' | ||
|
||
mutation = Mutation(seq='contig', pos=5, type='snv', data='-1') | ||
assert kevlar.mutate.mutate_snv(contig, mutation) == 'ACGTAAGTACGT' | ||
|
||
mutation = Mutation(seq='contig', pos=0, type='snv', data='-1') | ||
assert kevlar.mutate.mutate_snv(contig, mutation) == 'TCGTACGTACGT' | ||
|
||
|
||
def test_mutate_ins(): | ||
mutation = Mutation(seq='contig', pos=5, type='ins', data='AAAA') | ||
contig = 'ACGTACGTACGT' | ||
mutcontig = 'ACGTAAAAACGTACGT' | ||
assert kevlar.mutate.mutate_insertion(contig, mutation) == mutcontig | ||
|
||
|
||
def test_mutate_del(): | ||
mutation = Mutation(seq='contig', pos=5, type='ins', data='5') | ||
contig = 'ACGTACGTACGT' | ||
assert kevlar.mutate.mutate_deletion(contig, mutation) == 'ACGTAGT' | ||
|
||
|
||
def test_mutate_inv(): | ||
mutation = Mutation(seq='contig', pos=5, type='inv', data='5') | ||
contig = 'ACGTACGTACGT' | ||
assert kevlar.mutate.mutate_inversion(contig, mutation) == 'ACGTACATGCGT' | ||
|
||
|
||
def test_mutate_bogus(): | ||
instream = kevlar.open(data_file('muts-w.txt'), 'r') | ||
with pytest.raises(ValueError) as ve: | ||
mutations = kevlar.mutate.load_mutations(instream, stderr) | ||
assert 'invalid variant type "slippage"' in str(ve) | ||
|
||
|
||
def test_mutate_main(capsys): | ||
genome = data_file('mut-genome.fa') | ||
muts = data_file('mut-genome.txt') | ||
args = kevlar.cli.parser().parse_args(['mutate', muts, genome]) | ||
kevlar.mutate.main(args) | ||
out, err = capsys.readouterr() | ||
|
||
contig1 = '>contig1\nGTACGGCTATTGTCTGAGCTCTTTTTAAGACTAATACGCGCTGGCTCACGGAA' | ||
assert contig1 in out | ||
|
||
contig2 = '>contig2\nGTCATGAACTGACTCGCACGCGCTTCGGAAATTGCCGTATGATATGAC' | ||
assert contig2 in out | ||
|
||
contig3 = '>contig3\nAGTCGAGTATTGTGGCATAAGCGGAACA' | ||
assert contig3 in out |