In [1]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import numpy as np
from seqwalk import design
from time import time, sleep
import subprocess
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

In [5]:
%%time

# generate random seqs
lib = ["".join(np.random.choice(["A", "C", "G", "T"], size=25)) for _ in range(5*10**5)]

# save as FASTA
seqs = [SeqRecord(Seq(lib[l]), id="seq%d"%l) for l in range(len(lib))]
SeqIO.write(seqs, "million.fasta", "fasta")

t1 = time()

cmd= 'blastn -query million.fasta -subject million.fasta -outfmt 6 -out res_million -dust no -word_size 11'
cmd = cmd.split()    
subprocess.Popen(cmd).wait()

df = pd.read_csv("res_million", sep='\t', header=None)
print("BLAST records: %d" %(len(df)))
trashbin = []
keeper = []
for row in df.iterrows():
    if row[1][3] < 13 and row[1][3] != 25: 
        query = row[1][0]
        target = row[1][1]
        if query not in keeper:
            if query not in trashbin:
                keeper.append(query)
                trashbin.append(target)
        else:
            trashbin.append(target)
            
print("Library size: %d" %(len(keeper)))

# only perfect identity
df = df[df[2] == 100]

print("BLAST records w perfect identity: %d" %(len(df)))

trashbin = set()
keeper = set()
for row in df.iterrows():
    if row[1][3] < 13 and row[1][3] != 25: 
        query = row[1][0]
        target = row[1][1]
        if query not in keeper:
            if query not in trashbin:
                keeper.add(query)
                trashbin.add(target)
        else:
            trashbin.add(target)

print("Library size: %d" %(len(keeper)))

BLAST records: 939674
Library size: 27935
BLAST records w perfect identity: 866459
Library size: 27935
CPU times: user 6min 57s, sys: 544 ms, total: 6min 57s
Wall time: 7min 46s
