In [24]:
# Run parameters -- edit these before running the rest of the notebook.

# directory that holds the sequencing reads
read_dir = "order_with_small_u_removed/"

# wild type protein sequence, handed to blastx as the subject
wt_pep = "bagel_wt/bglb.pep"

# text file listing the allowed mutants (loaded later for error checking)
allowed_mutants = "allowed_mutants.txt"

# sample names; starts out empty
sample_list = list()

In [25]:
# convert all the reads to FASTQ

from Bio import SeqIO
from glob import glob 

# Every file in read_dir whose name ends in 'ab1' is converted and written
# next to the original with a 'fastq' ending.
for read in glob( '{}/*ab1'.format( read_dir ) ):
    # Swap only the trailing 'ab1'. str.replace() would also rewrite any
    # earlier 'ab1' in the directory or file name and send the output to the
    # wrong path.
    fastq_path = read[ :-len( 'ab1' ) ] + 'fastq'
    SeqIO.convert( read, 'abi', fastq_path, 'fastq' )

In [26]:
# read from file 

import os

my_sample_list_file = 'my_samples.txt'

# Only read the sample file when it exists. A missing file used to raise on
# open(), which made the glob-based fallback below unreachable in exactly the
# situation it is meant for. When the file is absent, sample_list keeps
# whatever value it already had.
if os.path.isfile( my_sample_list_file ):
    with open( my_sample_list_file ) as fn:
        # whitespace-separated sample names
        sample_list = fn.read().split()

# try to get a list of the samples if we don't already have that 

if not sample_list:
    my_glob = '{}/*fastq'
    print( 'No sample list, attempting to find one with the glob "{}"'.format( my_glob ) )
    my_files = glob( my_glob.format( read_dir ) )
    found_samples = set()
    for path in my_files:
        # Work on the file name only, so read_dir may be nested any number of
        # directories deep; the sample name is the third '_'-separated field.
        fields = os.path.basename( path ).split( '_' )
        if len( fields ) > 2:
            found_samples.add( fields[2] )
        # Files with fewer than three fields are not named like reads and
        # are skipped instead of raising IndexError.
    # A set collapses multiple read files of one sample into a single entry;
    # sorting gives a stable order.
    sample_list = sorted( found_samples )
    print( sample_list )

In [27]:
# show the sample names that the following cells will process
print( sample_list )

['A1-1', 'A1-2', 'A1-3', 'A10-1', 'A10-2', 'A10-3', 'A11-1', 'A11-2', 'A11-3', 'A2-1', 'A2-2', 'A2-3', 'A3-1', 'A3-2', 'A3-3', 'A4-1', 'A4-2', 'A4-3', 'A5-1', 'A5-2', 'A5-3', 'A6-1', 'A6-2', 'A6-3', 'A7-1', 'A7-2', 'A7-3', 'A8-1', 'A8-2', 'A8-3', 'A9-1', 'A9-2', 'A9-3']


In [28]:
# pair and merge the reads 

from subprocess import call, check_output

# Extra command line options appended to every pear call.
# NOTE(review): confirm that '-e' takes a value in the installed PEAR version.
pear_opts = [ '-e', '2' ]

# number of samples for which a forward/reverse pair was handed to pear
expect = 0
for sample in sample_list:
    #genscript
    fwd = glob( '{}/*_{}_T7.fastq'.format( read_dir, sample ) ) 
    rev = glob( '{}/*_{}_T7-Ter.fastq'.format( read_dir, sample ) )
    #operon 
    #fwd = glob( '{}/*T7_*{}.fastq'.format( read_dir, well ) ) 
    #rev = glob( '{}/*T7term_*{}.fastq'.format( read_dir, well ) )

    # Label the counts with the sample name so that a sample with a missing
    # or duplicated read can be identified from the output.
    print( '{}\t{} {}'.format( sample, len( fwd ), len( rev ) ) )

    # Merge only when there is exactly one forward and one reverse read.
    if len( fwd ) == len( rev ) == 1:
        cmd = [ 'pear', '-f', fwd[0], '-r', rev[0], '-o', '{}/{}'.format( read_dir, sample ) ] + pear_opts
        status = call( cmd )
        if status != 0:
            # Report a failed merge instead of discarding the exit status.
            print( '{}\tpear exited with status {}'.format( sample, status ) )
        expect += 1 

1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
1 1
1 1
0 1
1 1
1 1
1 1
1 1
1 1


In [29]:
# convert all the assemblies to FASTA

for assembly in glob( '{}/*.assembled.fastq'.format( read_dir ) ):
    # Swap only the trailing 'fastq'. str.replace() would also rewrite any
    # earlier 'fastq' in the directory or sample name.
    fasta_path = assembly[ :-len( 'fastq' ) ] + 'fasta'
    SeqIO.convert( assembly, 'fastq', fasta_path, 'fasta' )

In [30]:
# load in allowed mutants for error checking 

# `allowed_mutants` starts out as a file name and is rebound to the loaded
# list below. Remember the file name while it is still a string, so that
# re-running this cell does not try to open() a list.
if isinstance( allowed_mutants, str ):
    allowed_mutants_file = allowed_mutants

with open( allowed_mutants_file ) as fn:
    # one mutant per line; blank and whitespace-only lines are skipped
    allowed_mutants = [ line.strip() for line in fn if line.strip() ]

In [31]:
# diff the assemblies against the wild type protein sequence 

def _diff_alignment( sseq, qseq, sstart=1 ):
    """List the differences between two aligned protein strings.

    sseq and qseq are the aligned subject (wild type) and query (translated
    read) strings reported by blastx, and sstart is the 1-based subject
    position at which the alignment begins. Each difference is returned as
    '<wild type residue><position><read residue>'. The position is counted
    along the wild type sequence: it starts at sstart and is not advanced by
    gap columns ('-') in sseq, so an insertion in the read is labelled with
    the position of the wild type residue before it.
    """
    diffs = []
    position = sstart - 1
    for wt_res, read_res in zip( sseq, qseq ):
        if wt_res != '-':
            position += 1
        if wt_res != read_res:
            diffs.append( '{}{}{}'.format( wt_res, position, read_res ) )
    return diffs

assembly_suffix = '.assembled.fasta'

# ( mutation, sample name ) for every sample with exactly one allowed mutation
singles = []

# sorted() makes the report order (and the order of `singles`) reproducible
for assembly in sorted( glob( '{}/*{}'.format( read_dir, assembly_suffix ) ) ):
    # Strip only the known suffix, so dots elsewhere in the path are kept.
    name = assembly[ :-len( assembly_suffix ) ]

    # sstart is requested so positions can be numbered along the wild type
    # sequence even when the alignment does not begin at its first residue.
    blast_cmd = [ 'blastx', '-subject', wt_pep, '-query', assembly,
                  '-outfmt', '6 sseq qseq length sstart' ]
    hits = check_output( blast_cmd, universal_newlines=True ).splitlines()

    # Only the first reported alignment is used; its columns are tab-separated.
    fields = hits[0].split( '\t' ) if hits else []
    if len( fields ) < 4 or not ( fields[0] and fields[1] ):
        # Say so explicitly instead of silently dropping the sample.
        print( '{} \tno alignment'.format( name ) )
        continue

    d = _diff_alignment( fields[0], fields[1], int( fields[3] ) )
    dstr = '+'.join( d ) 
    if len( d ) == 0: 
        print( '{} \twild type'.format( name ) )
    elif len( d ) == 1:
        if d[0] in allowed_mutants:
            print( '{} \t{}'.format( name, dstr ) )
            singles.append( ( d[0], name ) )
        else:
            # a single difference that is not in the allowed list
            print( '{} \tbad alignment'.format( name ) )
    else:
        print( '{} \t... ({} mutations)'.format( name, len( d ) ) )
        # to list the individual mutations instead:
        #print( '{} \t{}'.format( name, dstr ) )

order_with_small_u_removed/A4-2 	wild type
order_with_small_u_removed/A4-3 	... (2 mutations)
order_with_small_u_removed/A5-3 	... (2 mutations)
order_with_small_u_removed/A6-1 	... (9 mutations)


In [32]:
# show the ( mutation, sample name ) pairs collected by the diff step
singles

[]

In [33]:
# mutation -> sample name; a later pair with the same mutation replaces an earlier one
{ mutation: sample_name for mutation, sample_name in singles }

{}

In [34]:
# results 

# One pick per distinct mutation: building a mapping keyed on the mutation
# keeps only the last sample seen for each one.
picks = { mutation: sample_name for mutation, sample_name in singles }
l = len( picks )
print( 'Found {} of {} expected mutants'.format( l, expect ) )

Found 0 of 31 expected mutants
