In [1]:
import qcbc
from seqwalk import design
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

First, we make a seqwalk library that is analogous to the multi-seq library.

In [2]:
# Design the SeqWalk library. The positional arguments are presumably
# (number of sequences, sequence length) -- TODO confirm against seqwalk.
sw_lib = design.max_orthogonality(9, 8, alphabet="ACGT")  # GClims=(7,10) left disabled

# The designer reports 10 sequences; remove the entry at index 7.
# NOTE(review): the reason for choosing index 7 is not recorded here.
del sw_lib[7]

# One row per sequence, named seq_0, seq_1, ... by position in the library.
d = {
    "seq": sw_lib,
    "name": ["seq_%d" % idx for idx in range(len(sw_lib))],
}
df = pd.DataFrame(d)

Number of sequences: 10
SSM k value: 3


In [3]:
# Write the library as a tab-separated file with no header row and no index
# column; this file is the input to the qcbc commands below.
# header=False is the documented way to omit the header (None only works
# because it happens to be falsy).
df.to_csv("qcbc/sw_multi_analog.txt", sep="\t", index=False, header=False)

Then we do all the QCBC analysis.

In [4]:
# SeqWalk homopolymer

s = !qcbc homopolymer qcbc/sw_multi_analog.txt
s = [l + "\n" for l in s]

f = open("qcbc/SW_multi_homopolymer.txt", "w+")
f.writelines(s)
f.close()

# MultiSeq homopolymer

s = !qcbc homopolymer qcbc/multiseq.txt
s = [l + "\n" for l in s]

f = open("qcbc/multi_homopolymer.txt", "w+")
f.writelines(s)
f.close()

In [5]:
# Load both homopolymer reports (tab-separated, first row used as the header).
df_SW, df_M = (
    pd.read_csv(report_path, sep="\t")
    for report_path in (
        "qcbc/SW_multi_homopolymer.txt",
        "qcbc/multi_homopolymer.txt",
    )
)

In [6]:
# weighted homopolymer score per barcode: sum of (run length * prevalence); the per-library average is taken in the next cell

def weighted_homopolymer_score(df, bc_id, min_run_length=2):
    """Return the weighted homopolymer score of a single barcode.

    The ``homopolymer_length`` entry for ``bc_id`` is parsed as a
    comma-separated list of integers. Entry ``k`` (0-based) is multiplied by
    ``min_run_length + k`` and the products are summed.

    Assumes entry ``k`` is the number of homopolymer runs of length
    ``min_run_length + k`` -- TODO confirm against the qcbc output format.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``homopolymer_length`` column of comma-separated integers.
    bc_id : hashable
        Index label of the barcode row to score.
    min_run_length : int, optional
        Weight applied to the first entry; later entries get consecutive
        weights. Defaults to 2.

    Returns
    -------
    numpy integer
        Sum of ``entry * weight`` over all entries of the row.

    Raises
    ------
    ValueError
        If an entry of the row cannot be parsed as an integer.
    """
    counts = np.array(
        [int(field) for field in df.loc[bc_id, "homopolymer_length"].split(",")]
    )
    # Size the weights from the parsed row instead of assuming exactly 7
    # entries, so rows of any length are scored without a broadcasting error.
    weights = np.arange(len(counts)) + min_run_length
    return np.sum(counts * weights)

# Score rows 0..8 of each homopolymer table.
# NOTE(review): the 9 is hard-coded -- presumably the number of barcodes in
# each library; confirm it matches both tables.
h_scores_SW = [weighted_homopolymer_score(df_SW, bc_id) for bc_id in range(9)]
h_scores_M = [weighted_homopolymer_score(df_M, bc_id) for bc_id in range(9)]

In [7]:
# Report the mean per-barcode homopolymer score of each library, one per line.
print(
    "Seqwalk average homopolymer score: %f" % np.mean(h_scores_SW),
    "Multiseq average homopolymer score: %f" % np.mean(h_scores_M),
    sep="\n",
)

Seqwalk average homopolymer score: 3.333333
Multiseq average homopolymer score: 4.111111


In [8]:
# SeqWalk pdist

s = !qcbc pdist qcbc/sw_multi_analog.txt
s = [l + "\n" for l in s]

f = open("qcbc/SW_multi_pdist.txt", "w+")
f.writelines(s)
f.close()

# MultiSeq pdist

s = !qcbc pdist qcbc/multiseq.txt
s = [l + "\n" for l in s]

f = open("qcbc/multi_pdist.txt", "w+")
f.writelines(s)
f.close()

In [9]:
# Load both pdist reports. They are read without a header row, so columns
# are addressed by integer position.
df_SW, df_M = (
    pd.read_csv(report_path, sep="\t", header=None)
    for report_path in ("qcbc/SW_multi_pdist.txt", "qcbc/multi_pdist.txt")
)

In [10]:
# Column 4 of the headerless pdist table is averaged for each library.
# NOTE(review): presumably column 4 holds the pairwise distance -- confirm
# against the qcbc pdist output format.
print(
    "Seqwalk mean pairwise distance: %f" % np.mean(df_SW[4]),
    "Multiseq mean pairwise distance: %f" % np.mean(df_M[4]),
    sep="\n",
)

Seqwalk mean pairwise distance: 5.777778
Multiseq mean pairwise distance: 5.694444


In [11]:
s = !qcbc content qcbc/sw_multi_analog.txt -e
s = [l + "\n" for l in s]

f = open("qcbc/SW_multi_content.txt", "w+")
f.writelines(s)
f.close()

# MultiSeq nt entropy

s = !qcbc content qcbc/multiseq.txt -e
s = [l + "\n" for l in s]

f = open("qcbc/multi_content.txt", "w+")
f.writelines(s)
f.close()

In [12]:
# Load both content reports (tab-separated, first row used as the header).
df_SW, df_M = (
    pd.read_csv(report_path, sep="\t")
    for report_path in ("qcbc/SW_multi_content.txt", "qcbc/multi_content.txt")
)

In [13]:
# Average the "ent" column of each content report, one library per line.
print(
    "Seqwalk mean GC entropy score: %f" % np.mean(df_SW["ent"]),
    "Multiseq mean GC entropy score: %f" % np.mean(df_M["ent"]),
    sep="\n",
)

Seqwalk mean GC entropy score: 0.547778
Multiseq mean GC entropy score: 0.551111


In [14]:
n_ambiguous_SW = []
for x in range(2, 8):
    s = !qcbc ambiguous -l $x qcbc/sw_multi_analog.txt
    ambi_bcs = []
    for amb in s:
        ambi_bcs += amb.split('\t')[1].split(',')
    n_ambiguous_SW.append(len(set(ambi_bcs)))

# compute ambiguous for l=2 to l=7
# record number of barcodes with ambuities

n_ambiguous_M = []
for x in range(2, 8):
    s = !qcbc ambiguous -l $x qcbc/multiseq.txt
    ambi_bcs = []
    for amb in s:
        ambi_bcs += amb.split('\t')[1].split(',')
    n_ambiguous_M.append(len(set(ambi_bcs)))

In [15]:
# Print a caption, then display the per-length counts (ordered l=2 to l=7).
# The list is shown because the bare expression is the last statement.
print("Number of ambiguous SeqWalk barcodes for l=2 to l=7")
n_ambiguous_SW

Number of ambiguous SeqWalk barcodes for l=2 to l=7


[9, 0, 0, 0, 0, 0]

In [16]:
# Print a caption, then display the per-length counts (ordered l=2 to l=7).
# The list is shown because the bare expression is the last statement.
print("Number of ambiguous MultiSeq barcodes for l=2 to l=7")
n_ambiguous_M

Number of ambiguous MultiSeq barcodes for l=2 to l=7


[9, 9, 5, 2, 2, 2]