# Kunkel sequence verifier 

Will attempt to verify your Kunkel sequencing results and pick clones with desired mutations. 

In [1]:
! ls U0977BB190/ | tail

H04_U0977BB190-95_A11-2_T7-Ter.ab1
H04_U0977BB190-95_A11-2_T7-Ter.seq
H05_U0977BB190-103_A14-1_T7-Ter.ab1
H05_U0977BB190-103_A14-1_T7-Ter.seq
H05_U0977BB190-40_A14-1_T7.ab1
H05_U0977BB190-40_A14-1_T7.seq
H06_U0977BB190-111_A16-3_T7-Ter.ab1
H06_U0977BB190-111_A16-3_T7-Ter.seq
H06_U0977BB190-48_A16-3_T7.ab1
H06_U0977BB190-48_A16-3_T7.seq


## Using `pear` to join paired reads 

Unfortunatly requires converting back and forth between file types 

In [3]:
from itertools import product 
from subprocess import call, check_output
from glob import glob 
import pandas 

# input
read_dir = 'U0977BB190/'

# well list 
wells_human = [ '{}{:02}'.format( i, j ) for ( i, j ) in product( 'ABCDEFGH', range( 1, 13 ) ) ]

# convert files 
from Bio import SeqIO

# merge, first need to convert abi files to fastq 
count = 0
for well in wells_human:
    fwd = glob( '{}/{}_*T7.ab1'.format( read_dir, well ) ) 
    rev = glob( '{}/{}_*T7-Ter.ab1'.format( read_dir, well ) )
    if fwd and rev:
        fwdq = fwd[0].replace( 'ab1', 'fastq' )
        revq = rev[0].replace( 'ab1', 'fastq' )
        SeqIO.convert( fwd[0], 'abi', fwdq, 'fastq' )
        SeqIO.convert( rev[0], 'abi', revq, 'fastq' )
        cmd = [ 'pear', '-f', fwdq, '-r', revq, '-o', '{}/{}'.format( read_dir, well ) ] 
        call( cmd )
        
# align and diff, first need to convert fastq files back to fasta  
picks = []

print 'Raw output (||| indicates 3 mutations present at amino acid level):'
for fa in glob( '{}/*.assembled.fastq'.format( read_dir ) ):
    fq = fa.replace( 'fastq', 'fasta' )
    SeqIO.convert( fa, 'fastq', fq, 'fasta' )
    lines = check_output( [ 'blastx', '-subject', 'example/bglb.pep', '-query', fq, '-outfmt',  '6 sseq qseq' ] ).split()
    if lines and lines[0] and lines[1]:
        z = zip( lines[0], lines[1] )
        d = [ '{}{}{}'.format( i, n+1, j ) for n, ( i, j ) in enumerate( z ) if i != j ] 
        if len( d ) == 0: 
            print '[] wild type', fa 
        if len( d ) == 1:
            print d, fa 
            picks.append( ( d[0], fa.split( '/' )[1].split( '.' )[0] ) )
        elif len( d ) > 1:
            print len( d ) * '|', 'mutations', fa  #d, fa

print 'Single mutants:'
df = pandas.DataFrame( picks, columns=[ 'mutant', 'well' ] )
df

Raw output (||| indicates 3 mutations present at amino acid level):
['N163S'] U0977BB190/A02.assembled.fastq
|| mutations U0977BB190/A03.assembled.fastq
['N158E'] U0977BB190/A04.assembled.fastq
|| mutations U0977BB190/A05.assembled.fastq
|| mutations U0977BB190/A06.assembled.fastq
['I244V'] U0977BB190/A07.assembled.fastq
||| mutations U0977BB190/B02.assembled.fastq
['N163E'] U0977BB190/B04.assembled.fastq
[] wild type U0977BB190/B05.assembled.fastq
|||| mutations U0977BB190/B06.assembled.fastq
['I244V'] U0977BB190/B07.assembled.fastq
||||||||||||| mutations U0977BB190/C01.assembled.fastq
['R240K'] U0977BB190/C02.assembled.fastq
['E406Q'] U0977BB190/C03.assembled.fastq
||||||| mutations U0977BB190/C04.assembled.fastq
['N293S'] U0977BB190/C06.assembled.fastq
|| mutations U0977BB190/C07.assembled.fastq
|||||| mutations U0977BB190/D01.assembled.fastq
['R240K'] U0977BB190/D02.assembled.fastq
||||||||| mutations U0977BB190/D03.assembled.fastq
['N220Q'] U0977BB190/D04.assembled.fastq
||||| mu

Unnamed: 0,mutant,well
0,N163S,A02
1,N158E,A04
2,I244V,A07
3,N163E,B04
4,I244V,B07
5,R240K,C02
6,E406Q,C03
7,N293S,C06
8,R240K,D02
9,N220Q,D04


## Using phred scores and a fixed quality threshold 

In [87]:
from itertools import product 
from subprocess import call, check_output
from glob import glob 
from Bio import SeqIO

# input
read_dir = '191641_dna_seq_rep_8852898_zip/'

# first, we need a way to quickly generate all the possible wells in a 96-well plate 
wells_human = [ '{}{:02}'.format( i, j ) for ( i, j ) in product( 'ABCDEFGH', range( 1, 13 ) ) ]
# woo one-liners 

# merge 
for well in wells_human:
    
    fwd = glob( '{}/{}_*T7.ab1'.format( read_dir, well ) )
    rev = glob( '{}/{}_*T7-Ter.ab1'.format( read_dir, well ) )

    if fwd and rev:
        
        phred_qual = 10
        
        f = SeqIO.read( open( fwd[0], 'rb' ), "abi" )
        fz = zip( f.letter_annotations['phred_quality'], f.seq )
        fn = ''.join( [ base if qual > phred_qual else 'N' for (qual, base) in fz ] )
        with open( '{}/{}.fwd.fasta'.format( read_dir, well ), 'w' ) as fwd_out:
            fwd_out.write( '>{}\n{}\n'.format( well, fn ) )
        
        r = SeqIO.read( open( rev[0], 'rb' ), "abi" )
        rz = zip( r.letter_annotations['phred_quality'], r.seq )
        rn = ''.join( [ base if qual > phred_qual else 'N' for (qual, base) in rz ] )
        with open( '{}/{}.rev.fasta'.format( read_dir, well ), 'w' ) as rev_out:
            rev_out.write( '>{}\n{}\n'.format( well, rn ) )
            
        cmd = [ 'merger', '-asequence', '{}/{}.fwd.fasta'.format( read_dir, well ), '-bsequence', '{}/{}.rev.fasta'.format( read_dir, well ), '-sreverse2' ]
        outs = [ '-outfile', '{}/{}.merger'.format( read_dir, well ), '-outseq', '{}/{}.fa'.format( read_dir, well ) ]
        call( cmd + outs )

# align and diff 
for fa in glob( '{}/*fa'.format( read_dir ) ):
    lines = check_output( [ 'blastx', '-subject', 'example/bglb.pep', '-query', fa, '-outfmt',  '6 sseq qseq' ] ).split()
    z = zip( lines[0], lines[1] )
    d = [ '{}{}{}'.format( i, n+1, j ) for n, ( i, j ) in enumerate( z ) if i != j ] 
    if len( d ) == 1:
        print d, fa 
    elif len( d ) > 1:
        print len( d ) * '|' #d, fa
                          

|||
['V92D'] 191641_dna_seq_rep_8852898_zip/A09.fa
|||
['L108N'] 191641_dna_seq_rep_8852898_zip/A11.fa
||
|||
['W325K'] 191641_dna_seq_rep_8852898_zip/B09.fa
|||
|||
|||||||||||||||
||
||||||||||||
['L108N'] 191641_dna_seq_rep_8852898_zip/C11.fa
|||
['I4X'] 191641_dna_seq_rep_8852898_zip/D08.fa
|||
||||||
|||||||||||
['I4X'] 191641_dna_seq_rep_8852898_zip/E08.fa
['N220X'] 191641_dna_seq_rep_8852898_zip/E09.fa
|||||||||||||||||||||||||||||||||||||||||||||||
['W180H'] 191641_dna_seq_rep_8852898_zip/E11.fa
||||
['Q313E'] 191641_dna_seq_rep_8852898_zip/F07.fa
||
||||||||||||||||||||||||||||||
['N293K'] 191641_dna_seq_rep_8852898_zip/F10.fa
||
['H178N'] 191641_dna_seq_rep_8852898_zip/F12.fa
['V311D'] 191641_dna_seq_rep_8852898_zip/G08.fa
['N220X'] 191641_dna_seq_rep_8852898_zip/G09.fa
['D322A'] 191641_dna_seq_rep_8852898_zip/G11.fa
['H178N'] 191641_dna_seq_rep_8852898_zip/G12.fa
['Q313E'] 191641_dna_seq_rep_8852898_zip/H07.fa
['V311D'] 191641_dna_seq_rep_8852898_zip/H08.fa
|||||||
['N293K']

## Which provides better results? 