In [25]:
from Bio import SeqIO
from glob import glob 
from subprocess import call, check_output

In [26]:
# params and paths

read_dir = 'my_reads'
my_sample_names = [ 'alex{}'.format( i ) for i in range( 37 ) ]
wt_pep = 'order_peptide_sequences.fasta' 

In [27]:
# convert all the reads to FASTQ

for read in glob( '{}/*ab1'.format( read_dir ) ):
    SeqIO.convert( read, 'abi', read.replace( 'ab1', 'fastq' ), 'fastq' )

In [28]:
# pair and merge the reads 

expect = 0
for sample_name in my_sample_names:
    fwd = glob( '{}/{}_T7_*.fastq'.format( read_dir, sample_name ) ) 
    rev = glob( '{}/{}_T7term_*.fastq'.format( read_dir, sample_name ) )
    print fwd, rev
    if len( fwd ) == len( rev ) == 1:
        my_sample = '{}/{}'.format( read_dir, sample_name )
        cmd = [ 'pear', '-f', fwd[0], '-r', rev[0], '-o', my_sample ]
        call( cmd )
        expect += 1 

['my_reads/alex0_T7_Plate_Plate01_A01.fastq'] ['my_reads/alex0_T7term_Plate_Plate01_B01.fastq']
['my_reads/alex1_T7_Plate_Plate01_C01.fastq'] ['my_reads/alex1_T7term_Plate_Plate01_D01.fastq']
['my_reads/alex2_T7_Plate_Plate01_E01.fastq'] ['my_reads/alex2_T7term_Plate_Plate01_F01.fastq']
['my_reads/alex3_T7_Plate_Plate01_G01.fastq'] ['my_reads/alex3_T7term_Plate_Plate01_H01.fastq']
['my_reads/alex4_T7_Plate_Plate01_A02.fastq'] ['my_reads/alex4_T7term_Plate_Plate01_B02.fastq']
['my_reads/alex5_T7_Plate_Plate01_C02.fastq'] ['my_reads/alex5_T7term_Plate_Plate01_D02.fastq']
['my_reads/alex6_T7_Plate_Plate01_E02.fastq'] ['my_reads/alex6_T7term_Plate_Plate01_F02.fastq']
['my_reads/alex7_T7_Plate_Plate01_G02.fastq'] ['my_reads/alex7_T7term_Plate_Plate01_H02.fastq']
['my_reads/alex8_T7_Plate_Plate01_A03.fastq'] ['my_reads/alex8_T7term_Plate_Plate01_B03.fastq']
['my_reads/alex9_T7_Plate_Plate01_C03.fastq'] ['my_reads/alex9_T7term_Plate_Plate01_D03.fastq']
['my_reads/alex10_T7_Plate_Plate01_E03.f

In [30]:
# convert all the assemblies to FASTA

for assembly in glob( '{}/*.assembled.fastq'.format( read_dir ) ):
    SeqIO.convert( assembly, 'fastq', assembly.replace( 'fastq', 'fasta' ), 'fasta' )

# put in one file 
! cat my_reads/*assembled.fasta > my_reads.fa 

In [50]:
! blastx -help

USAGE
  blastx [-h] [-help] [-import_search_strategy filename]
    [-export_search_strategy filename] [-task task_name] [-db database_name]
    [-dbsize num_letters] [-gilist filename] [-seqidlist filename]
    [-negative_gilist filename] [-entrez_query entrez_query]
    [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]
    [-subject subject_input_file] [-subject_loc range] [-query input_file]
    [-out output_file] [-evalue evalue] [-word_size int_value]
    [-gapopen open_penalty] [-gapextend extend_penalty]
    [-qcov_hsp_perc float_value] [-max_hsps int_value]
    [-xdrop_ungap float_value] [-xdrop_gap float_value]
    [-xdrop_gap_final float_value] [-searchsp int_value]
    [-sum_stats bool_value] [-max_intron_length length] [-seg SEG_options]
    [-soft_masking soft_masking] [-matrix matrix_name]
    [-threshold float_value] [-culling_limit int_value]
    [-best_hit_overhang float_value] [-best_hit_score_edge float_value]
    [-window_size in

In [58]:
# return lengths of all perfect nucleotide matches 
! blastn -subject order_nucleotide_sequences.fasta -query my_reads.fa -perc_identity 99 -outfmt "6 length sseqid qseqid" | sort -rg 

842	1r5l_A	alex7_T7
842	1r5l_A	alex6_T7
835	1r5l_C	alex4_T7
776	1fcy_C_3Tyr	alex36_T7
776	1fcy_C	alex36_T7
769	1fcy_A	alex28_T7
654	1qv0_A	alex17_T7
653	1qv0_C	alex25_T7
653	1qv0_A	alex16_T7
653	1qv0_A	alex15_T7
623	1bvv_C	alex14_T7
623	1bvv_C	alex13_T7
623	1bvv_C	alex12_T7
623	1bvv_A	alex1_T7
623	1bvv_A	alex0_T7
165	1r5l_C	alex8_T7
165	1r5l_A	alex8_T7
161	1qv0_C	alex24_T7
161	1qv0_C	alex23_T7
72	1qv0_C	alex24_T7
72	1qv0_C	alex23_T7
55	1r5l_C	alex8_T7
55	1r5l_A	alex8_T7
43	1upv_C	alex8_T7
43	1upv_C	alex7_T7
43	1upv_C	alex6_T7
43	1upv_A	alex8_T7
43	1upv_A	alex7_T7
43	1upv_A	alex6_T7
43	1fcy_C_3Tyr	alex1_T7
43	1fcy_C_3Tyr	alex14_T7
43	1fcy_C_3Tyr	alex13_T7
43	1fcy_C_3Tyr	alex12_T7
43	1fcy_C_3Tyr	alex0_T7
43	1fcy_C	alex1_T7
43	1fcy_C	alex14_T7
43	1fcy_C	alex13_T7
43	1fcy_C	alex12_T7
43	1fcy_C	alex0_T7
43	1fcy_A	alex1_T7
43	1fcy_A	alex14_T7
43	1fcy_A	alex13_T7
43	1fcy_A	alex12_T7
43	1fcy_A	alex0_T7
42	1upv_C	alex25_T7
42	1upv_C	alex24_T7
42	1up

Sequence perfect at nucleotide level 

```
842	1r5l_A  
653	1qv0_C  
653	1qv0_A  
623	1bvv_C  
623	1bvv_C
623	1bvv_C
623	1bvv_A
623	1bvv_A
```

In [59]:
%%bash 

MYOUT="6 length pident mismatch sseqid qseqid"
echo $MYOUT
blastx -subject order_peptide_sequences.fasta -query my_reads.fa -outfmt "$MYOUT" | sort -rg -k2 

6 length pident mismatch sseqid qseqid
6	100.00	0	1ukb_C	alex20_T7
6	100.00	0	1ukb_A	alex20_T7
40	100.00	0	1qv0_C	alex24_T7
40	100.00	0	1qv0_C	alex23_T7
3	100.00	0	1r5l_C	alex24_T7
3	100.00	0	1r5l_C	alex23_T7
3	100.00	0	1r5l_A	alex24_T7
3	100.00	0	1r5l_A	alex23_T7
240	100.00	0	1r5l_C	alex4_T7
240	100.00	0	1r5l_A	alex7_T7
240	100.00	0	1r5l_A	alex6_T7
236	100.00	0	1fcy_C	alex36_T7
236	100.00	0	1fcy_A	alex28_T7
195	100.00	0	1qv0_C	alex25_T7
195	100.00	0	1qv0_A	alex16_T7
195	100.00	0	1qv0_A	alex15_T7
185	100.00	0	1bvv_C	alex14_T7
185	100.00	0	1bvv_C	alex13_T7
185	100.00	0	1bvv_C	alex12_T7
185	100.00	0	1bvv_A	alex1_T7
185	100.00	0	1bvv_A	alex0_T7
15	100.00	0	1qv0_C	alex24_T7
15	100.00	0	1qv0_C	alex23_T7
236	98.73	3	1fcy_C_3Tyr	alex36_T7
185	98.38	3	1bvv_C	alex1_T7
185	98.38	3	1bvv_C	alex0_T7
185	98.38	3	1bvv_A	alex14_T7
185	98.38	3	1bvv_A	alex13_T7
185	98.38	3	1bvv_A	alex12_T7
240	98.33	4	1r5l_C	alex7_T7
240	98.33	4	1r5l_C	alex6_T7
240	98.33	4	1r5l_A	alex4_T7
253	98.02	5	1upv_C	alex31_T7
40

### Sequencing results  

