In [5]:
from Bio import SeqIO
from glob import glob 
from subprocess import call, check_output

In [7]:
# User-editable parameters for this sequencing run.

# directory that the later cells glob for trace files, FASTQ reads and assemblies
read_dir = 'sanger_reads/'

# sample labels, matched against the read file names when pairing reads
my_sample_names = [
    '0-1', '0-2', '0-3',
    '1-1', '1-2', '1-3',
]

# wild-type protein sequence passed to blastx as the subject
wt_pep = '1bvv_A.pep'

# optional allow-list of mutants (currently disabled)
#allowed_mutants = 'allowed_mutants.txt'

In [34]:
# make a list of all the wells in 96-well plate 
# unless you can think of a better way to enumerate pairs 

#from itertools import product

#wells_human = [ '{}{:02}'.format( i, j ) for ( i, j ) in product( 'ABCDEFGH', range( 1, 13 ) ) ]

In [3]:
# convert all the reads to FASTQ

# Match on the real '.ab1' extension and swap only that trailing suffix.
# A blanket str.replace( 'ab1', 'fastq' ) would also rewrite 'ab1' wherever
# else it occurs in the path (directory or sample name) and send the output
# to the wrong location.
for read in glob( '{}/*.ab1'.format( read_dir ) ):
    fastq = read[ :-len( 'ab1' ) ] + 'fastq'
    SeqIO.convert( read, 'abi', fastq, 'fastq' )

In [37]:
# pair and merge the reads 

# options appended to every pear command line (same for all samples)
pear_opts = [ '-e', '2' ]

# expect counts the samples for which exactly one forward and one reverse
# read were found and pear was run; the results cell reports against it
expect = 0
for sample_name in my_sample_names:
    #genscript
    fwd = glob( '{}/*{}_*T7.fastq'.format( read_dir, sample_name ) ) 
    rev = glob( '{}/*{}_*T7-Ter.fastq'.format( read_dir, sample_name ) )
    #operon 
    #fwd = glob( '{}/*T7_*{}.fastq'.format( read_dir, well ) ) 
    #rev = glob( '{}/*T7term_*{}.fastq'.format( read_dir, well ) )
    if not ( len( fwd ) == len( rev ) == 1 ):
        # a missing or ambiguous read pair used to vanish without a trace
        print( 'skipping {}: found {} forward and {} reverse reads, need exactly 1 of each'.format( sample_name, len( fwd ), len( rev ) ) )
        continue
    cmd = [ 'pear', '-f', fwd[0], '-r', rev[0], '-o', '{}/{}'.format( read_dir, sample_name ) ] + pear_opts
    status = call( cmd )
    if status != 0:
        # surface a failed merge instead of discarding the exit status
        print( 'pear exited with status {} for {}'.format( status, sample_name ) )
    expect += 1 

In [38]:
# convert all the assemblies to FASTA

for assembly in glob( '{}/*.assembled.fastq'.format( read_dir ) ):
    # swap only the trailing extension; str.replace( 'fastq', 'fasta' ) would
    # also rewrite 'fastq' anywhere else in the path
    fasta = assembly[ :-len( 'fastq' ) ] + 'fasta'
    SeqIO.convert( assembly, 'fastq', fasta, 'fasta' )

In [39]:
# list the contents of read_dir (IPython expands $read_dir before running ls)
! ls $read_dir

0-1.assembled.fasta                    B03_U4895BF100-71_1-2_T7-Ter.ab1
0-1.assembled.fastq                    B03_U4895BF100-71_1-2_T7-Ter.fastq
0-1.discarded.fastq                    B03_U4895BF100-71_1-2_T7-Ter.seq
0-1.unassembled.forward.fastq          C03_U4895BF100-72_1-3_T7-Ter.ab1
0-1.unassembled.reverse.fastq          C03_U4895BF100-72_1-3_T7-Ter.fastq
0-2.assembled.fasta                    C03_U4895BF100-72_1-3_T7-Ter.seq
0-2.assembled.fastq                    D01_U4895BF100-31_0-1_T7.ab1
0-2.discarded.fastq                    D01_U4895BF100-31_0-1_T7.fastq
0-2.unassembled.forward.fastq          D01_U4895BF100-31_0-1_T7.seq
0-2.unassembled.reverse.fastq          E01_U4895BF100-32_0-2_T7.ab1
0-3.assembled.fasta                    E01_U4895BF100-32_0-2_T7.fastq
0-3.assembled.fastq                    E01_U4895BF100-32_0-2_T7.seq
0-3.discarded.fastq                    F01_U489

In [41]:
# list the notebook's working directory
!ls

1ukb_A.pep                          order_nucleotide_sequences.fasta
1ukb_C.pep                          order_peptide_sequences.fasta
my_reads.fa                         round_B_sequence_verification.ipynb
notes.txt                           sanger_reads


In [42]:
# align the reads in my_reads.fa against the ordered nucleotide sequences
# NOTE(review): no cell shown in this notebook writes my_reads.fa; confirm where it comes from
! blastn -subject order_nucleotide_sequences.fasta -query my_reads.fa

BLASTN 2.2.31+


Query= U4895BF100-34_1-1_T7

Length=978

Subject= 1bvv_A

Length=623


 Score = 69.4 bits (37),  Expect = 7e-16
 Identities = 40/41 (98%), Gaps = 1/41 (2%)
 Strand=Plus/Plus

Query  26  GAAATAA-TTTGTTTAACTTTAAGAAGGAGATATACATATG  65
           ||||||| |||||||||||||||||||||||||||||||||
Sbjct  1   GAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATG  41



Lambda      K        H
    1.33    0.621     1.12 

Gapped
Lambda      K        H
    1.28    0.460    0.850 

Effective search space used: 590226


Query= U4895BF100-34_1-1_T7

Length=978

Subject= 1bvv_C

Length=623


 Score = 69.4 bits (37),  Expect = 7e-16
 Identities = 40/41 (98%), Gaps = 1/41 (2%)
 Strand=Plus/Plus

Query  26  GAAATAA-TTTGTTTAACTTTAAGAAGGAGATATACATATG  65
           ||||||| |||||||||||||||||||||||||||||||||
Sbjct  1   GAAATAATTTTGTTTAACTTTAAGAAGGAGATATACATATG  41



Lambda      K        H
    1.33    0.621     1.12 

Gapped
Lambda      K        H
    1.

In [43]:
# load in allowed mutants for error checking 

#with open( allowed_mutants ) as fn:
#    allowed_mutants = [ i.strip() for i in fn.readlines() if len(i) > 1 ] 

In [8]:
# diff the assemblies against the wild type protein sequence 

def _diff_alignment( sseq, qseq, sstart=1 ):
    """Return the mutation labels for one aligned subject/query pair.

    sseq   -- aligned wild-type (subject) residues, '-' marks a gap
    qseq   -- aligned read (query) residues, '-' marks a gap
    sstart -- 1-based subject position of the first aligned residue

    Each label is '<wild type><position><observed>'. Positions count
    wild-type residues starting at sstart, so an alignment that does not
    begin at residue 1 is still numbered correctly. A gap column in sseq
    (insertion in the read) does not advance the counter and is labelled
    with the position of the preceding wild-type residue.
    """
    labels = []
    position = sstart - 1
    for wt, obs in zip( sseq, qseq ):
        if wt != '-':
            position += 1
        if wt != obs:
            labels.append( '{}{}{}'.format( wt, position, obs ) )
    return labels

suffix = '.assembled.fasta'

# The cells that would define allowed_mutants are commented out, so the name
# may not exist. Only filter single mutants when a real collection is present;
# otherwise every single mutant is accepted instead of raising NameError.
allowed = globals().get( 'allowed_mutants' )
have_allowed = isinstance( allowed, ( list, tuple, set, frozenset ) )

singles = []
# sorted() keeps the report order (and therefore `singles`) reproducible
for assembly in sorted( glob( '{}/*{}'.format( read_dir, suffix ) ) ):
    # strip only the known suffix; splitting on the first '.' breaks as soon
    # as the directory or sample name contains a dot
    name = assembly[ :-len( suffix ) ]
    # universal_newlines=True returns text rather than bytes on Python 3
    fields = check_output( [ 'blastx', '-subject', wt_pep, '-query', assembly, '-outfmt', '6 sstart sseq qseq' ], universal_newlines=True ).split()
    if len( fields ) < 3:
        # no hit reported: say so rather than dropping the sample silently
        print( '{} \tno alignment'.format( name ) )
        continue
    # only the first reported alignment is used, as before
    d = _diff_alignment( fields[1], fields[2], int( fields[0] ) )
    dstr = '+'.join( d ) 
    if len( d ) == 0: 
        print( '{} \twild type'.format( name ) )
    elif len( d ) == 1:
        if not have_allowed or d[0] in allowed:
            print( '{} \t{}'.format( name, dstr ) )
            singles.append( ( d[0], name ) )
        else:
            print( '{} \tbad alignment'.format( name ) )
    else:
        #print name, '\t... ({} mutations)'.format( len( d ) ) 
        print( '{} \t{}'.format( name, dstr ) )

sanger_reads/0-1 	wild type
sanger_reads/0-2 	wild type
sanger_reads/0-3 	wild type
sanger_reads/1-1 	W2I+N5D+G6M+N7V+Y9F+L10-+T11-+L12-+A13-+W15F+R17D+S18R+L20E+I21N+I22Y+Y23N+V25S+V26K
sanger_reads/1-3 	W2I+N5D+G6M+N7V+Y9F+L10-+T11-+L12-+A13-+W15F+R17D+S18R+L20E+I21N+I22Y+Y23N+V25S+V26K


In [45]:
# show the ( mutation, sample name ) pairs collected from single-mutant assemblies
singles

[]

In [46]:
# preview the mutation -> sample name mapping; a repeated mutation keeps its last pair
dict( singles ) 

{}

In [47]:
# results 

# one entry per distinct single mutation (a repeated mutation keeps its last sample)
picks = dict( singles )
found = len( picks )
print( 'Found {} of {} expected mutants'.format( found, expect ) )

Found 0 of 6 expected mutants


Is this our contaminant? 

```
Chain A, Sugar Ring Distortion In The Glycosyl-Enzyme Intermediate Of A Family G11 XYLANASE.
Sequence ID: pdb|2BVV|A    Length: 185    Number of Matches: 1
Related Information
Structure - 3D structure displays
Range 1: 1 to 185    GenPept | Graphics | Next Match | Previous Match
Alignment statistics for match #1
Score           Expect   Method                        Identities     Positives      Gaps        Frame
348 bits(894)   2e-120   Compositional matrix adjust.  178/185(96%)   179/185(96%)   0/185(0%)   +2
Query  71   ASTDYWENWTDGGGIVNAVNGSGGNYSVNWSNTGFFAVGKGWTTGSPFRTINYNAGVWAP  250
            ASTDYW+NWTDGGGIVNAVNGSGGNYSVNWSNTG F VGKGWTTGSPFRTINYNAGVWAP
Sbjct  1    ASTDYWQNWTDGGGIVNAVNGSGGNYSVNWSNTGNFVVGKGWTTGSPFRTINYNAGVWAP  60

Query  251  NGNGYLTLAGWTRSPLIIYYVVDSWGTYRPTGTYKGTVKSDGGTYDIYTTTRYNYPSIDG  430
            NGNGYLTL GWTRSPLI YYVVDSWGTYRPTGTYKGTVKSDGGTYDIYTTTRYN PSIDG
Sbjct  61   NGNGYLTLFGWTRSPLIEYYVVDSWGTYRPTGTYKGTVKSDGGTYDIYTTTRYNAPSIDG  120

Query  431  DRTTFTQYWSVRQSKRPTGSNATITFTNHVNAWKSHGMNLGSNWAYQVMATIGYQSSGSS  610
            DRTTFTQYWSVRQSKRPTGSNATITFTNHVNAWKSHGMNLGSNWAYQVMAT GYQSSGSS
Sbjct  121  DRTTFTQYWSVRQSKRPTGSNATITFTNHVNAWKSHGMNLGSNWAYQVMATEGYQSSGSS  180

Query  611  NVTVW  625
            NVTVW
Sbjct  181  NVTVW  185
```

In [48]:
# results 

# samples 0-1, 0-2, and 0-3 are sequence-perfect 1bvv_A
# sample 1-3 contains a sequence-verified clone of 1ukb_C 