In [34]:
# enter your params
#
# read_dir: directory searched for the sequencing traces ( files ending in
#   `ab1` ); the converted and merged files are written back into it
# wt_pep: wild type protein sequence, handed to blastx as the `-subject` file
# allowed_mutants: path of a text file listing the permitted mutations, one
#   per line ( NOTE: a later cell rebinds this name to the parsed list )

read_dir = '191641_dna_seq_rep_8852898_zip'
wt_pep = 'bagel_wt/bglb.pep'
allowed_mutants = 'allowed_mutants.txt' 

In [35]:
# build the human-readable labels of a 96-well plate, row by row:
# A01 ... A12, B01 ... B12, and so on up to H12 

from itertools import product

wells_human = [ '%s%02d' % pair for pair in product( 'ABCDEFGH', range( 1, 13 ) ) ]

In [36]:
# convert all the reads to FASTQ
#
# every file in read_dir whose name ends in `ab1` gets a sibling file with
# that ending swapped for `fastq` 

from Bio import SeqIO
from glob import glob 

for read in glob( '{}/*ab1'.format( read_dir ) ):
    # swap only the trailing extension ( the glob guarantees it is there );
    # str.replace would also rewrite any other 'ab1' in the directory or
    # sample name and send the output to the wrong path
    fastq = read[ :-len( 'ab1' ) ] + 'fastq'
    SeqIO.convert( read, 'abi', fastq, 'fastq' )

In [37]:
# pair and merge the reads 
#
# for every well that has exactly one forward and one reverse FASTQ file, run
# pear on the pair with the output prefix `<read_dir>/<well>`; `expect` counts
# the wells that were handed to pear

from subprocess import call, check_output

# extra command line options passed to every pear run
pear_opts = [ '-e', '2' ] 

expect = 0
for well in wells_human:
    #genscript
    #fwd = glob( '{}/{}_*T7.fastq'.format( read_dir, well ) ) 
    #rev = glob( '{}/{}_*T7-Ter.fastq'.format( read_dir, well ) )
    #operon 
    fwd = glob( '{}/*T7_*{}.fastq'.format( read_dir, well ) ) 
    rev = glob( '{}/*T7term_*{}.fastq'.format( read_dir, well ) )
    if len( fwd ) == len( rev ) == 1:
        cmd = [ 'pear', '-f', fwd[0], '-r', rev[0], '-o', '{}/{}'.format( read_dir, well ) ] + pear_opts
        status = call( cmd )
        if status != 0:
            # keep going with the other wells, but do not hide the failure
            print( 'warning: pear exited with status {} for well {}'.format( status, well ) )
        expect += 1 
    elif fwd or rev:
        # reads exist for this well but cannot be paired unambiguously
        print( 'warning: skipping well {}: found {} forward and {} reverse reads'.format( well, len( fwd ), len( rev ) ) )

if expect == 0:
    # most likely the file name patterns above do not match this data set
    print( 'warning: no read pairs found in {}; check the fwd/rev file name patterns'.format( read_dir ) )

In [38]:
# convert all the assemblies to FASTA
#
# every `*.assembled.fastq` file in read_dir gets a sibling `*.assembled.fasta` 

for assembly in glob( '{}/*.assembled.fastq'.format( read_dir ) ):
    # swap only the trailing extension ( the glob guarantees it is there );
    # str.replace would also rewrite a 'fastq' occurring anywhere else in the path
    fasta = assembly[ :-len( 'fastq' ) ] + 'fasta'
    SeqIO.convert( assembly, 'fastq', fasta, 'fasta' )

In [39]:
# load in allowed mutants for error checking 
#
# `allowed_mutants` arrives here as the path set in the params cell and leaves
# as the list of mutation names read from that file; the path is remembered
# under its own name so that running this cell a second time still works 

if not isinstance( allowed_mutants, list ):
    allowed_mutants_path = allowed_mutants

with open( allowed_mutants_path ) as fn:
    # drop blank and whitespace-only lines so no empty names end up in the list
    allowed_mutants = [ line.strip() for line in fn if line.strip() ] 

In [40]:
# diff the assemblies against the wild type protein sequence 
#
# every assembly is aligned to the wild type with blastx and reported as wild
# type, an allowed single mutant, a single change that is not allowed
# ( 'bad alignment' ), or a count of several changes; allowed single mutants
# are collected in `singles` as ( mutation, name ) pairs 

def _list_mutations( sseq, qseq, sstart=1 ):
    '''
    Name every aligned position where the query differs from the subject.

    sseq, qseq -- aligned subject and query sequences, with '-' marking gaps
    sstart -- 1-based subject position at which the alignment begins

    Returns a list of labels such as 'V311A' ( subject residue, subject
    position, query residue ). Gap columns in the subject do not advance the
    numbering, so the positions always refer to the subject sequence.
    '''
    mutations = []
    position = sstart - 1
    for wt_res, read_res in zip( sseq, qseq ):
        if wt_res != '-':
            position += 1
        if wt_res != read_res:
            mutations.append( '{}{}{}'.format( wt_res, position, read_res ) )
    return mutations

assembly_suffix = '.assembled.fasta'

singles = []
# sorted so that the report, and the well that wins when `singles` is turned
# into a dict, do not depend on the order the file system lists the files in
for assembly in sorted( glob( '{}/*{}'.format( read_dir, assembly_suffix ) ) ):
    # strip only the known suffix; splitting on the first '.' would cut the
    # name short whenever read_dir itself contains a dot
    name = assembly[ :-len( assembly_suffix ) ]
    cmd = [ 'blastx', '-subject', wt_pep, '-query', assembly, '-outfmt', '6 sseq qseq length sstart' ]
    # universal_newlines gives text ( not bytes ) on both Python 2 and 3
    fields = check_output( cmd, universal_newlines=True ).split()
    if len( fields ) < 4:
        print( '{} \tno alignment'.format( name ) )
        continue
    # the output is flattened into whitespace-separated tokens and only the
    # first reported alignment ( the first four tokens ) is used
    # NOTE(review): how much of the wild type the alignment covers is not checked
    d = _list_mutations( fields[0], fields[1], int( fields[3] ) )
    if not d:
        print( '{} \twild type'.format( name ) )
    elif len( d ) == 1:
        if d[0] in allowed_mutants:
            print( '{} \t{}'.format( name, d[0] ) )
            singles.append( ( d[0], name ) )
        else:
            print( '{} \tbad alignment'.format( name ) )
    else:
        print( '{} \t... ({} mutations)'.format( name, len( d ) ) )

191641_dna_seq_rep_8852898_zip/A08 	V311A
191641_dna_seq_rep_8852898_zip/A10 	... (9 mutations)
191641_dna_seq_rep_8852898_zip/A11 	L108N
191641_dna_seq_rep_8852898_zip/A12 	D322A
191641_dna_seq_rep_8852898_zip/B08 	... (11 mutations)
191641_dna_seq_rep_8852898_zip/B09 	W325K
191641_dna_seq_rep_8852898_zip/B10 	... (7 mutations)
191641_dna_seq_rep_8852898_zip/B11 	... (3 mutations)
191641_dna_seq_rep_8852898_zip/B12 	... (9 mutations)
191641_dna_seq_rep_8852898_zip/C08 	... (6 mutations)
191641_dna_seq_rep_8852898_zip/C09 	... (14 mutations)
191641_dna_seq_rep_8852898_zip/C10 	wild type
191641_dna_seq_rep_8852898_zip/C11 	... (4 mutations)
191641_dna_seq_rep_8852898_zip/C12 	M323T
191641_dna_seq_rep_8852898_zip/D08 	wild type
191641_dna_seq_rep_8852898_zip/D09 	... (2 mutations)
191641_dna_seq_rep_8852898_zip/D10 	L272N
191641_dna_seq_rep_8852898_zip/D11 	... (5 mutations)
191641_dna_seq_rep_8852898_zip/E08 	Q124H
191641_dna_seq_rep_8852898_zip/E09 	N220K
191641_dna_seq_rep_8852898_zip

In [41]:
# show every ( mutation, name ) pair that passed the allowed-mutant check 
singles

[('V311A', '191641_dna_seq_rep_8852898_zip/A08'),
 ('L108N', '191641_dna_seq_rep_8852898_zip/A11'),
 ('D322A', '191641_dna_seq_rep_8852898_zip/A12'),
 ('W325K', '191641_dna_seq_rep_8852898_zip/B09'),
 ('M323T', '191641_dna_seq_rep_8852898_zip/C12'),
 ('L272N', '191641_dna_seq_rep_8852898_zip/D10'),
 ('Q124H', '191641_dna_seq_rep_8852898_zip/E08'),
 ('N220K', '191641_dna_seq_rep_8852898_zip/E09'),
 ('Q313E', '191641_dna_seq_rep_8852898_zip/F07'),
 ('N293K', '191641_dna_seq_rep_8852898_zip/F10'),
 ('W399H', '191641_dna_seq_rep_8852898_zip/F11'),
 ('H178N', '191641_dna_seq_rep_8852898_zip/F12'),
 ('N220Y', '191641_dna_seq_rep_8852898_zip/G09'),
 ('D322A', '191641_dna_seq_rep_8852898_zip/G11'),
 ('H178N', '191641_dna_seq_rep_8852898_zip/G12'),
 ('Q313E', '191641_dna_seq_rep_8852898_zip/H07'),
 ('V311D', '191641_dna_seq_rep_8852898_zip/H08'),
 ('N293K', '191641_dna_seq_rep_8852898_zip/H10')]

In [42]:
# collapse to one entry per mutation; when the same mutation was found in
# several wells, the one listed last in `singles` is kept 
dict( singles ) 

{'D322A': '191641_dna_seq_rep_8852898_zip/G11',
 'H178N': '191641_dna_seq_rep_8852898_zip/G12',
 'L108N': '191641_dna_seq_rep_8852898_zip/A11',
 'L272N': '191641_dna_seq_rep_8852898_zip/D10',
 'M323T': '191641_dna_seq_rep_8852898_zip/C12',
 'N220K': '191641_dna_seq_rep_8852898_zip/E09',
 'N220Y': '191641_dna_seq_rep_8852898_zip/G09',
 'N293K': '191641_dna_seq_rep_8852898_zip/H10',
 'Q124H': '191641_dna_seq_rep_8852898_zip/E08',
 'Q313E': '191641_dna_seq_rep_8852898_zip/H07',
 'V311A': '191641_dna_seq_rep_8852898_zip/A08',
 'V311D': '191641_dna_seq_rep_8852898_zip/H08',
 'W325K': '191641_dna_seq_rep_8852898_zip/B09',
 'W399H': '191641_dna_seq_rep_8852898_zip/F11'}

In [43]:
# results 
#
# `picks` keeps one well per mutation ( the last one listed in `singles` );
# the number of distinct mutations is reported against `expect`, the number
# of wells that were handed to pear 

picks = { mutation: well for ( mutation, well ) in singles }
l = len( picks ) 
summary = 'Found {} of {} expected mutants'.format( l, expect )
print( summary )

Found 14 of 0 expected mutants
