# Kunkel sequence verifier 

This notebook attempts to verify your Kunkel sequencing results and pick the clones that carry the desired mutations. 

In [49]:
# Shell escape: list the sequencing results directory, keeping only the tail of the listing
! ls 191641_dna_seq_rep_8852898_zip/ | tail

H09_U5271BB220-61_A7-1_T7-Ter.ab1
H09_U5271BB220-61_A7-1_T7-Ter.seq
H10_U5271BB220-27_A9-3_T7.ab1
H10_U5271BB220-27_A9-3_T7.seq
H10_U5271BB220-69_A9-3_T7-Ter.ab1
H10_U5271BB220-69_A9-3_T7-Ter.seq
H11_U5271BB220-35_A12-2_T7.ab1
H11_U5271BB220-35_A12-2_T7.seq
H11_U5271BB220-77_A12-2_T7-Ter.ab1
H11_U5271BB220-77_A12-2_T7-Ter.seq


## Using `pear` to join paired reads 

Unfortunately, this requires converting back and forth between file types 

In [15]:
from itertools import product 
from subprocess import call, check_output
from glob import glob 
import pandas 

# directory that holds the sequencing reads 
read_dir = '191641_dna_seq_rep_8852898_zip/'

# every well name of a 96-well plate, row by row: A01 ... A12, B01 ... H12 
wells_human = [ '%s%02d' % ( row, col ) for row in 'ABCDEFGH' for col in range( 1, 13 ) ]

# convert files 
from Bio import SeqIO

# merge, first need to convert abi files to fastq 
# For every well that has both a T7 (forward) and a T7-Ter (reverse) trace,
# write fastq copies next to the traces and hand the pair to pear, which
# writes its results under the prefix <read_dir>/<well>.
for well in wells_human:
    fwd = glob( '{}/{}_*T7.ab1'.format( read_dir, well ) ) 
    rev = glob( '{}/{}_*T7-Ter.ab1'.format( read_dir, well ) )
    if not ( fwd and rev ):
        continue  # both reads are needed to merge a well 

    # Swap only the trailing extension: str.replace would also rewrite an
    # 'ab1' that happens to occur earlier in the path. The glob patterns
    # above guarantee that both names end in 'ab1'.
    fwdq = fwd[0][ :-len( 'ab1' ) ] + 'fastq'
    revq = rev[0][ :-len( 'ab1' ) ] + 'fastq'
    SeqIO.convert( fwd[0], 'abi', fwdq, 'fastq' )
    SeqIO.convert( rev[0], 'abi', revq, 'fastq' )

    cmd = [ 'pear', '-f', fwdq, '-r', revq, '-o', '{}/{}'.format( read_dir, well ) ] 
    status = call( cmd )
    if status != 0:
        # do not fail silently: a well without a merged read would otherwise
        # just be missing from the results below
        print( 'pear exited with status {} for well {}'.format( status, well ) )
        
# align and diff, first need to convert fastq files back to fasta  
def diff_alignment( sseq, qseq, sstart=1 ):
    """List the differences between two aligned protein sequences.

    sseq   -- aligned reference (blast subject) sequence; '-' marks a gap
    qseq   -- aligned translated read (blast query); paired column by column with sseq
    sstart -- 1-based position in the reference of the first aligned residue

    Returns a list of labels such as 'V311A': reference residue, position in
    the reference, observed residue. Positions are counted along the
    reference, so they stay correct when the alignment does not start at
    residue 1 or contains gaps. A deletion in the read shows up as e.g.
    'V311-'; an insertion shows up as '-310K', anchored at the reference
    residue that precedes it.
    """
    found = []
    position = sstart - 1
    for ref, obs in zip( sseq, qseq ):
        if ref != '-':
            position += 1  # gap columns do not consume a reference residue 
        if ref != obs:
            found.append( '{}{}{}'.format( ref, position, obs ) )
    return found

picks = []

print( 'Raw output (||| indicates 3 mutations present at amino acid level):' )
for fq_path in glob( '{}/*.assembled.fastq'.format( read_dir ) ):
    # swap only the trailing extension (the glob guarantees the suffix) 
    fa_path = fq_path[ :-len( 'fastq' ) ] + 'fasta'
    SeqIO.convert( fq_path, 'fastq', fa_path, 'fasta' )

    # sstart is requested in addition to the two aligned sequences so that
    # mutations can be numbered along the reference protein
    blast_cmd = [ 'blastx', '-subject', 'example/bglb.pep', '-query', fa_path, '-outfmt', '6 sstart sseq qseq' ]
    # universal_newlines gives text (not bytes) on Python 3 as well 
    fields = check_output( blast_cmd, universal_newlines=True ).split()
    if len( fields ) < 3:
        continue  # blastx reported no alignment for this read 

    # Only the first reported alignment is inspected; residues outside of it
    # are not checked. NOTE(review): presumably the first row is the best hit -- confirm
    d = diff_alignment( fields[1], fields[2], int( fields[0] ) )

    if len( d ) == 0: 
        print( '[] wild type {}'.format( fq_path ) )
    elif len( d ) == 1:
        print( '{} {}'.format( d, fq_path ) )
        # the well is the file name up to its first dot, whatever the directory depth 
        picks.append( ( d[0], fq_path.split( '/' )[-1].split( '.' )[0] ) )
    else:
        print( '{} mutations {}'.format( len( d ) * '|', fq_path ) )

print( 'Single mutants:' )
df = pandas.DataFrame( picks, columns=[ 'mutant', 'well' ] )
df

Raw output (||| indicates 3 mutations present at amino acid level):
['V311A'] 191641_dna_seq_rep_8852898_zip/A08.assembled.fastq
||||||||| mutations 191641_dna_seq_rep_8852898_zip/A10.assembled.fastq
['L108N'] 191641_dna_seq_rep_8852898_zip/A11.assembled.fastq
['D322A'] 191641_dna_seq_rep_8852898_zip/A12.assembled.fastq
||||||||||| mutations 191641_dna_seq_rep_8852898_zip/B08.assembled.fastq
['W325K'] 191641_dna_seq_rep_8852898_zip/B09.assembled.fastq
||||||| mutations 191641_dna_seq_rep_8852898_zip/B10.assembled.fastq
||| mutations 191641_dna_seq_rep_8852898_zip/B11.assembled.fastq
||||||||| mutations 191641_dna_seq_rep_8852898_zip/B12.assembled.fastq
|||||| mutations 191641_dna_seq_rep_8852898_zip/C08.assembled.fastq
|||||||||||||| mutations 191641_dna_seq_rep_8852898_zip/C09.assembled.fastq
[] wild type 191641_dna_seq_rep_8852898_zip/C10.assembled.fastq
|||| mutations 191641_dna_seq_rep_8852898_zip/C11.assembled.fastq
['M323T'] 191641_dna_seq_rep_8852898_zip/C12.assembled.fastq
[] w

Unnamed: 0,mutant,well
0,V311A,A08
1,L108N,A11
2,D322A,A12
3,W325K,B09
4,M323T,C12
5,L272N,D10
6,Q124H,E08
7,N220K,E09
8,Q313E,F07
9,N293K,F10


## Using phred scores and a fixed quality threshold 

In [87]:
from itertools import product 
from subprocess import call, check_output
from glob import glob 
from Bio import SeqIO

# directory that holds the sequencing reads 
read_dir = '191641_dna_seq_rep_8852898_zip/'

# all well names of a 96-well plate (A01 ... H12): row letter plus zero-padded column number 
wells_human = [ row + str( col ).zfill( 2 ) for row in 'ABCDEFGH' for col in range( 1, 13 ) ]

# merge 
# Bases whose phred score is not above this value are replaced by N before merging.
phred_qual = 10

def write_masked_fasta( abi_path, fasta_path, name, min_qual ):
    """Write one abi trace as a single-record fasta file with low-quality bases masked.

    abi_path   -- trace file to read
    fasta_path -- fasta file to write (overwritten if present)
    name       -- record name used on the '>' header line
    min_qual   -- a base is kept only if its phred score is greater than this, else it becomes 'N'
    """
    # the with-block closes the trace file; SeqIO.read has fully parsed it by then 
    with open( abi_path, 'rb' ) as handle:
        record = SeqIO.read( handle, 'abi' )
    scored = zip( record.letter_annotations['phred_quality'], record.seq )
    masked = ''.join( [ base if qual > min_qual else 'N' for ( qual, base ) in scored ] )
    with open( fasta_path, 'w' ) as out:
        out.write( '>{}\n{}\n'.format( name, masked ) )

for well in wells_human:

    fwd = glob( '{}/{}_*T7.ab1'.format( read_dir, well ) )
    rev = glob( '{}/{}_*T7-Ter.ab1'.format( read_dir, well ) )
    if not ( fwd and rev ):
        continue  # both reads are needed to merge a well 

    fwd_fasta = '{}/{}.fwd.fasta'.format( read_dir, well )
    rev_fasta = '{}/{}.rev.fasta'.format( read_dir, well )
    write_masked_fasta( fwd[0], fwd_fasta, well, phred_qual )
    write_masked_fasta( rev[0], rev_fasta, well, phred_qual )

    # NOTE(review): -sreverse2 should make merger reverse-complement the second read -- confirm against the EMBOSS docs
    cmd = [ 'merger', '-asequence', fwd_fasta, '-bsequence', rev_fasta, '-sreverse2' ]
    outs = [ '-outfile', '{}/{}.merger'.format( read_dir, well ), '-outseq', '{}/{}.fa'.format( read_dir, well ) ]
    status = call( cmd + outs )
    if status != 0:
        # do not fail silently: the well would otherwise just be missing below 
        print( 'merger exited with status {} for well {}'.format( status, well ) )

# align and diff 
# Translate-align every merged read ('<well>.fa') against the reference protein
# and report the residues that differ.
for fa in glob( '{}/*.fa'.format( read_dir ) ):
    # sstart is requested in addition to the two aligned sequences so that
    # mutations can be numbered along the reference protein
    blast_cmd = [ 'blastx', '-subject', 'example/bglb.pep', '-query', fa, '-outfmt', '6 sstart sseq qseq' ]
    # universal_newlines gives text (not bytes) on Python 3 as well 
    fields = check_output( blast_cmd, universal_newlines=True ).split()
    if len( fields ) < 3:
        continue  # blastx reported no alignment; indexing the fields would raise 

    # Only the first reported alignment is inspected.
    # Walk it column by column, counting positions along the reference.
    position = int( fields[0] ) - 1
    d = []
    for ref, obs in zip( fields[1], fields[2] ):
        if ref != '-':
            position += 1  # gap columns do not consume a reference residue 
        if ref != obs:
            d.append( '{}{}{}'.format( ref, position, obs ) )

    if len( d ) == 1:
        print( '{} {}'.format( d, fa ) )
    elif len( d ) > 1:
        print( len( d ) * '|' )
                          

|||
['V92D'] 191641_dna_seq_rep_8852898_zip/A09.fa
|||
['L108N'] 191641_dna_seq_rep_8852898_zip/A11.fa
||
|||
['W325K'] 191641_dna_seq_rep_8852898_zip/B09.fa
|||
|||
|||||||||||||||
||
||||||||||||
['L108N'] 191641_dna_seq_rep_8852898_zip/C11.fa
|||
['I4X'] 191641_dna_seq_rep_8852898_zip/D08.fa
|||
||||||
|||||||||||
['I4X'] 191641_dna_seq_rep_8852898_zip/E08.fa
['N220X'] 191641_dna_seq_rep_8852898_zip/E09.fa
|||||||||||||||||||||||||||||||||||||||||||||||
['W180H'] 191641_dna_seq_rep_8852898_zip/E11.fa
||||
['Q313E'] 191641_dna_seq_rep_8852898_zip/F07.fa
||
||||||||||||||||||||||||||||||
['N293K'] 191641_dna_seq_rep_8852898_zip/F10.fa
||
['H178N'] 191641_dna_seq_rep_8852898_zip/F12.fa
['V311D'] 191641_dna_seq_rep_8852898_zip/G08.fa
['N220X'] 191641_dna_seq_rep_8852898_zip/G09.fa
['D322A'] 191641_dna_seq_rep_8852898_zip/G11.fa
['H178N'] 191641_dna_seq_rep_8852898_zip/G12.fa
['Q313E'] 191641_dna_seq_rep_8852898_zip/H07.fa
['V311D'] 191641_dna_seq_rep_8852898_zip/H08.fa
|||||||
['N293K']

## Which provides better results? 