In [1]:
from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
import pandas as pd
import time
import re
import numpy as np
import csv
from pathlib import Path

Read in some initial data:

In [2]:
Lib_Info = pd.read_csv('~/Dropbox/xenoMAGE/Libraries/Lib_Info.csv',index_col=2)
barcode_to_protein = dict(zip(Lib_Info.Barcode,Lib_Info.Protein))
protein_to_barcode = dict(zip(Lib_Info.Protein,Lib_Info.Barcode))

Samples 1  -  20 = SSAPs in pARC8 selections 1 to 5, replicates A-C and the toxicity control
Sample 129 = initial libraries 

In [6]:
libraries = ['EC-SSAP','EC-Dual','LL-SSAP','LL-Dual']
steplist = {'EC-SSAP':['initial','sort-1','sort-2','sort-3','sort-4','sort-5']}
samplelist = {'EC-SSAP':[1,20]}
initiallib = {'EC-SSAP':[129]}
replicates = ['1','2','3','-']
preSequence = {'SSAP':'CTTCCGATCTCT'}

barcodes = {}
barcodes2 = {}
barcode_pairs = {}

for l in libraries:
    steps = steplist[l]
    a = b = 0
    libtype = re.split('-',l)[1]
    
    samples = initiallib[l] + list(range(samplelist[l][0],samplelist[l][1]+1))
    for i in samples:

        sample_name = l + '_' + steps[a] + '_' + replicates[b]
        if a == 0:
            sample_name = l + '_' + steps[a]

        # clock the process time for each sample
        print('processing sample',sample_name,'..')
        t0 = time.clock()

        # initialize tracking variables
        count = 0
        junk = 0
        no_match = 0

        # read-in fastQ files using BioPython
        data = 'TW10/trimmed_TW10-%s.fastq' % str(i).zfill(2)
        trimmed = SeqIO.parse(data1,'fastq')

        # initialize tracking variable
        barcodes[sample_name] = []
        barcodes2[sample_name] = []
        barcode_pairs[sample_name] = []

        # iterate through reads
        for j in trimmed:
            count += 1
            match = ''
            match2 = ''

            # quality filter any reads shorter than 50-nt
            seq = str(j.seq)
            seqlen = len(seq)
            if seqlen > 60 and not re.search('[^ACTG]',seq):

                # match to a 12-nt sequence before the beginning of the varied region.
                match = seq.find(preSequence['SSAP'])

                # if the priming sequence(s) are found, record barcode(s), otherwise count as junk.
                if libtype == 'Dual':
                    match2 = seq.find(preSequence['SSB'])

                    if seqlen - 24 > match > 0 and seqlen - 24 > match2 > 0:
                        barcode = seq[match+12 : match+24]
                        barcodes[sample_name].append(barcode)
                        barcode2 = seq[match2+12 : match2+24]
                        barcodes2[sample_name].append(barcode2)
                        barcode_pair = barcode + '_' + barcode2
                        barcode_pairs[sample_name].append(barcode_pair)
                    else:
                        no_match +=1
                else:
                    if match > 0:
                        barcode = seq[match+12 : match+24]
                        barcodes[sample_name].append(barcode)
                    else:
                        no_match += 1
            else:
                junk += 1

        # report out time and statistics
        t1 = time.clock()
        print('took ',t1-t0,'seconds to process',sample_name,'which contains: \n',count,'forward reads',
              '\n',junk,'of which were binned as junk, and \n',no_match,'of which had no match.')

        # iterate counter
        b += 1
        if a == 0:
            a = 1
            b = 0
        if b == len(replicates):
            b = 0
            a += 1

processing sample EC-SSAP_initial ..
took  56.38775000000001 seconds to process EC-SSAP_initial which contains: 
 903233 forward reads 
 6951 of which were binned as junk, and 
 857962 of which had no match.
processing sample EC-SSAP_sort-1_1 ..
took  0.7578730000000178 seconds to process EC-SSAP_sort-1_1 which contains: 
 26001 forward reads 
 55 of which were binned as junk, and 
 646 of which had no match.
processing sample EC-SSAP_sort-1_2 ..
took  0.25842600000001426 seconds to process EC-SSAP_sort-1_2 which contains: 
 9109 forward reads 
 15 of which were binned as junk, and 
 148 of which had no match.
processing sample EC-SSAP_sort-1_3 ..
took  0.30096800000001167 seconds to process EC-SSAP_sort-1_3 which contains: 
 10649 forward reads 
 18 of which were binned as junk, and 
 214 of which had no match.
processing sample EC-SSAP_sort-1_- ..
took  0.3103270000000009 seconds to process EC-SSAP_sort-1_- which contains: 
 10465 forward reads 
 17 of which were binned as junk, and 

took  0.8246469999999988 seconds to process EC-Dual_sort-5_3 which contains: 
 29203 forward reads 
 78 of which were binned as junk, and 
 763 of which had no match.
processing sample EC-Dual_sort-5_- ..
took  0.6943699999999922 seconds to process EC-Dual_sort-5_- which contains: 
 24475 forward reads 
 57 of which were binned as junk, and 
 546 of which had no match.
processing sample EC-Dual_pre-6_1 ..
took  1.2032050000000254 seconds to process EC-Dual_pre-6_1 which contains: 
 43136 forward reads 
 40 of which were binned as junk, and 
 1283 of which had no match.
processing sample EC-Dual_pre-6_2 ..
took  15.661662000000007 seconds to process EC-Dual_pre-6_2 which contains: 
 615055 forward reads 
 1283 of which were binned as junk, and 
 497316 of which had no match.
processing sample EC-Dual_pre-6_3 ..
took  6.254505999999992 seconds to process EC-Dual_pre-6_3 which contains: 
 240913 forward reads 
 444 of which were binned as junk, and 
 212152 of which had no match.
processi

took  0.3348540000000071 seconds to process LL-SSAP_sort-3_2 which contains: 
 12407 forward reads 
 39 of which were binned as junk, and 
 266 of which had no match.
processing sample LL-SSAP_sort-3_3 ..
took  0.1603900000000067 seconds to process LL-SSAP_sort-3_3 which contains: 
 5963 forward reads 
 15 of which were binned as junk, and 
 116 of which had no match.
processing sample LL-SSAP_sort-3_- ..
took  0.3358880000000113 seconds to process LL-SSAP_sort-3_- which contains: 
 12647 forward reads 
 36 of which were binned as junk, and 
 253 of which had no match.
processing sample LL-SSAP_sort-4_1 ..
took  0.21928200000002107 seconds to process LL-SSAP_sort-4_1 which contains: 
 8158 forward reads 
 30 of which were binned as junk, and 
 118 of which had no match.
processing sample LL-SSAP_sort-4_2 ..
took  0.20395100000001776 seconds to process LL-SSAP_sort-4_2 which contains: 
 7650 forward reads 
 33 of which were binned as junk, and 
 132 of which had no match.
processing sam

took  1.0965320000000247 seconds to process LL-Dual_pre-3_1 which contains: 
 36036 forward reads 
 113 of which were binned as junk, and 
 976 of which had no match.
processing sample LL-Dual_pre-3_2 ..
took  2.0563940000000684 seconds to process LL-Dual_pre-3_2 which contains: 
 70203 forward reads 
 153 of which were binned as junk, and 
 1348 of which had no match.
processing sample LL-Dual_pre-3_3 ..
took  2.0373690000000124 seconds to process LL-Dual_pre-3_3 which contains: 
 68315 forward reads 
 166 of which were binned as junk, and 
 1685 of which had no match.
processing sample LL-Dual_pre-3_- ..
took  1.2144240000000082 seconds to process LL-Dual_pre-3_- which contains: 
 40102 forward reads 
 108 of which were binned as junk, and 
 961 of which had no match.
processing sample LL-Dual_sort-3_1 ..
took  2.1302830000000768 seconds to process LL-Dual_sort-3_1 which contains: 
 73582 forward reads 
 114 of which were binned as junk, and 
 21894 of which had no match.
processing 

took  1.496356999999989 seconds to process LL-Dual_sort-9_1 which contains: 
 55633 forward reads 
 102 of which were binned as junk, and 
 30234 of which had no match.
processing sample LL-Dual_sort-9_2 ..
took  1.42177700000002 seconds to process LL-Dual_sort-9_2 which contains: 
 52586 forward reads 
 61 of which were binned as junk, and 
 23832 of which had no match.
processing sample LL-Dual_sort-9_3 ..
took  2.1334920000000466 seconds to process LL-Dual_sort-9_3 which contains: 
 81833 forward reads 
 99 of which were binned as junk, and 
 43609 of which had no match.
processing sample LL-Dual_sort-9_- ..
took  2.398005000000012 seconds to process LL-Dual_sort-9_- which contains: 
 87810 forward reads 
 82 of which were binned as junk, and 
 26697 of which had no match.
processing sample LL-Dual_sort-10_1 ..
took  1.10646399999996 seconds to process LL-Dual_sort-10_1 which contains: 
 40998 forward reads 
 33 of which were binned as junk, and 
 16782 of which had no match.
proces

In [7]:
# combine barcodes into dataframe, count their occurrences, separate data by experiment, sort and filter data,
# and export data as an excel file.

n = 40 # filter out any single barcodes counted fewer than n times in aggregate

# First count the number of times each barcode is seen in every experiment

barcodes = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in barcodes.items() ]))
counts = pd.DataFrame()
for c in barcodes.columns:
    count = barcodes[c].value_counts()
    count = pd.DataFrame(count,columns=[c])
    counts = pd.concat([counts,count], axis=1)

sums = counts.sum(axis=1)
sums = sums[sums > n]
counts = counts[counts.index.isin(sums.index)]

# Then assign a library number to each barcode and sort dataframes by library number and then the initial count

counts.insert(loc=0, column='SSAP', value=[None]*counts.shape[0])
for i in counts.index.tolist():
    try:
        counts.loc[i,'SSAP'] = barcode_to_protein[i]
    except:
        continue
counts = counts.sort_values(by=['SSAP','EC-Dual_initial'],ascending=[True,False])


exp_order = ['EC-SSAP_initial','EC-SSAP_sort-1_-','EC-SSAP_sort-1_1','EC-SSAP_sort-1_2','EC-SSAP_sort-1_3',
             'EC-SSAP_sort-2_-','EC-SSAP_sort-2_1','EC-SSAP_sort-2_2','EC-SSAP_sort-2_3','EC-SSAP_sort-3_-',
             'EC-SSAP_sort-3_1','EC-SSAP_sort-3_2','EC-SSAP_sort-3_3','EC-SSAP_sort-4_-','EC-SSAP_sort-4_1',
             'EC-SSAP_sort-4_2','EC-SSAP_sort-4_3','EC-SSAP_sort-5_-','EC-SSAP_sort-5_1','EC-SSAP_sort-5_2',
             'EC-SSAP_sort-5_3']
counts = counts[['SSAP'] + exp_order]
 
# Distribute data between experiments

EC_SSAP = pd.concat([counts['SSAP'], counts.filter(regex='EC-SSAP')], axis=1)
EC_SSAP = EC_SSAP.sort_values(by=['SSAP','EC-SSAP_initial'],ascending=[True,False])

# Calculate last selection normalized read counts per thousand

d = {'EC-SSAP_sort-5_':EC_SSAP}
reps = ['1','2','3']
for sel,lib in d.items():
    df = pd.DataFrame()
    for rep in reps:
        df[sel+rep] = lib[sel+rep]/(lib[sel+rep].sum()*len(reps))*1000
    lib['LastSel_perThousand'] = df.filter(regex=sel).sum(axis=1)

# Write out to Excel

writer = pd.ExcelWriter('EC_selections.xlsx', engine='xlsxwriter')
EC_SSAP.to_excel(writer, sheet_name = 'EC_SSAP')
writer.save()