In [None]:
import os
import glob
import datetime
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
import pandas as pd
import scipy
from scipy.optimize import curve_fit

#from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment

import gsf_ims_fitness as fitness

import pickle

import random

import gzip

import seaborn as sns
sns.set()

#from sklearn.mixture import GaussianMixture
#from sklearn.mixture import BayesianGaussianMixture

%load_ext autoreload
%autoreload 2

%matplotlib inline

%autosave 0

In [None]:
notebook_directory = os.getcwd()
notebook_directory

In [None]:
# Directory holding the barcode clustering output (CSV files) for this experiment.
# Built with os.path.join so the notebook is not tied to Windows path separators.
data_directory = os.path.join(notebook_directory, "barcode_analysis")
os.chdir(data_directory)
os.getcwd()

In [None]:
glob.glob("*reverse_barcode*.csv")

In [None]:
reverse_barcode_map_file = glob.glob("*reverse_barcode*.csv")[0]
reverse_barcode_map_file

In [None]:
rev_barcode_clusterID_frame = pd.read_csv(reverse_barcode_map_file, skipinitialspace=True)
rev_barcode_clusterID_frame[:5]

In [None]:
reverse_barcode_center_file = glob.glob("*reverse_cluster*.csv")[0]
reverse_barcode_center_file

In [None]:
rev_barcode_center_frame = pd.read_csv(reverse_barcode_center_file, skipinitialspace=True)
rev_barcode_center_frame.rename(columns={"time_point_1": "HiSeq_count"}, inplace=True)
rev_barcode_center_frame[:5]

In [None]:
rev_barcode_center_frame.sort_values(by=['HiSeq_count'], ascending=False)[:5]

In [None]:
# Lookup table: reverse barcode sequence -> cluster ID.
# Start from every unique read, then add (or overwrite with) the cluster centers.
rev_barcode_clusterID_dict = dict(zip(rev_barcode_clusterID_frame["Unique.reads"],
                                      rev_barcode_clusterID_frame["Cluster.ID"]))
rev_barcode_clusterID_dict.update(zip(rev_barcode_center_frame["Center"],
                                      rev_barcode_center_frame["Cluster.ID"]))

In [None]:
forward_barcode_map_file = glob.glob("*forward_barcode*.csv")[0]
forward_barcode_map_file

In [None]:
for_barcode_clusterID_frame = pd.read_csv(forward_barcode_map_file, skipinitialspace=True)
for_barcode_clusterID_frame[:5]

In [None]:
forward_barcode_center_file = glob.glob("*forward_cluster*.csv")[0]
forward_barcode_center_file

In [None]:
for_barcode_center_frame = pd.read_csv(forward_barcode_center_file, skipinitialspace=True)
for_barcode_center_frame.rename(columns={"time_point_1": "HiSeq_count"}, inplace=True)
for_barcode_center_frame[:5]

In [None]:
for_barcode_center_frame.sort_values(by=['HiSeq_count'], ascending=False)[:5]

In [None]:
# Lookup table: forward barcode sequence -> cluster ID.
# Start from every unique read, then add (or overwrite with) the cluster centers.
for_barcode_clusterID_dict = dict(zip(for_barcode_clusterID_frame["Unique.reads"],
                                      for_barcode_clusterID_frame["Cluster.ID"]))
for_barcode_clusterID_dict.update(zip(for_barcode_center_frame["Center"],
                                      for_barcode_center_frame["Cluster.ID"]))

In [None]:
os.chdir(notebook_directory)
glob.glob("*pkl")

In [None]:
os.chdir(notebook_directory)
pickle_file = 'output_file_label_BarSeqFitnessFrame.pkl'
print(pickle_file)

# NOTE: pickle.load can execute arbitrary code; only load pickle files that were
# produced by this project's own pipeline.
# The context manager guarantees the file handle is closed after loading.
with open(pickle_file, 'rb') as pickle_handle:
    barcode_frame = pickle.load(pickle_handle)

# Table of barcode counts stored on the loaded object
hiseq_count_frame = barcode_frame.barcode_frame

In [None]:
experiment = barcode_frame.experiment
experiment

In [None]:
len(hiseq_count_frame)

In [None]:
# Translate each HiSeq barcode sequence into its cluster ID.
# Plain dict indexing is kept on purpose: an unknown barcode raises KeyError.
for id_column, bc_column, cluster_lookup in (
    ("for_BC_ID", "forward_BC", for_barcode_clusterID_dict),
    ("rev_BC_ID", "reverse_BC", rev_barcode_clusterID_dict),
):
    hiseq_count_frame[id_column] = [ cluster_lookup[bc] for bc in hiseq_count_frame[bc_column] ]

In [None]:
hiseq_count_frame[:5]

In [None]:
# Dual barcode ID = "<forward cluster ID>_<reverse cluster ID>".
# Iterate over the two ID columns directly (same approach as the PacBio dual-ID cell)
# instead of DataFrame.iterrows(), which builds a Series per row and can change dtypes.
hiseq_BC_pairs = [
    f"{f_bc}_{r_bc}"
    for f_bc, r_bc in zip(hiseq_count_frame["for_BC_ID"], hiseq_count_frame["rev_BC_ID"])
]

hiseq_count_frame["dual_BC_ID"] = hiseq_BC_pairs

In [None]:
len(hiseq_BC_pairs)

In [None]:
# Dataset no. 1:

In [None]:
os.chdir(notebook_directory)
os.getcwd()

In [None]:
# The PacBio data lives in a sibling tree of the "E-Coli" folder that contains this notebook.
e_coli_pos = notebook_directory.find("E-Coli")
if e_coli_pos < 0:
    # str.find returns -1 when absent; slicing with -1 would silently drop the last character.
    raise ValueError(f'"E-Coli" not found in notebook directory: {notebook_directory}')

# Everything before "E-Coli" is kept verbatim; the remainder is joined with the
# platform's separator so the path is not Windows-only.
pac_bio_dir_1 = notebook_directory[:e_coli_pos]
pac_bio_dir_1 += os.path.join("LacI_CCS_analysis", "engineering-bio-lacI-landscape",
                              "data_1", "processed", "targets")
os.chdir(pac_bio_dir_1)
os.getcwd()

In [None]:
glob.glob("*.tsv.gz")

In [None]:
# Dataset 1: read the two gzip-compressed, tab-separated barcode tables.
barcode_tables_1 = []
for tsv_name in ('barcode_1.tsv.gz', 'barcode_2.tsv.gz'):
    with gzip.open(tsv_name, 'rb') as f:
        barcode_tables_1.append(pd.read_csv(f, sep="\t", skipinitialspace=True))

bc1_frame_1, bc2_frame_1 = barcode_tables_1

In [None]:
# Dataset 1: read the gzip-compressed, tab-separated lacI table.
with gzip.open('lacI.tsv.gz', 'rb') as f:
    lacI_frame_1 = pd.read_csv(
        f,
        sep="\t",
        skipinitialspace=True,
    )

In [None]:
lacI_frame_1[:3]

In [None]:
# Dataset no. 2:

In [None]:
os.chdir(notebook_directory)
os.getcwd()

In [None]:
# The PacBio data lives in a sibling tree of the "E-Coli" folder that contains this notebook.
e_coli_pos = notebook_directory.find("E-Coli")
if e_coli_pos < 0:
    # str.find returns -1 when absent; slicing with -1 would silently drop the last character.
    raise ValueError(f'"E-Coli" not found in notebook directory: {notebook_directory}')

# Everything before "E-Coli" is kept verbatim; the remainder is joined with the
# platform's separator so the path is not Windows-only.
pac_bio_dir_2 = notebook_directory[:e_coli_pos]
pac_bio_dir_2 += os.path.join("LacI_CCS_analysis", "engineering-bio-lacI-landscape",
                              "data_2", "processed", "targets")
os.chdir(pac_bio_dir_2)
os.getcwd()

In [None]:
glob.glob("*.tsv.gz")

In [None]:
# Dataset 2: read the two gzip-compressed, tab-separated barcode tables.
barcode_tables_2 = []
for tsv_name in ('barcode_1.tsv.gz', 'barcode_2.tsv.gz'):
    with gzip.open(tsv_name, 'rb') as f:
        barcode_tables_2.append(pd.read_csv(f, sep="\t", skipinitialspace=True))

bc1_frame_2, bc2_frame_2 = barcode_tables_2

In [None]:
# Dataset 2: read the gzip-compressed, tab-separated lacI table.
with gzip.open('lacI.tsv.gz', 'rb') as f:
    lacI_frame_2 = pd.read_csv(
        f,
        sep="\t",
        skipinitialspace=True,
    )

In [None]:
lacI_frame_2[:3]

In [None]:
# Merge both PacBio datasets into one read-name -> sequence lookup per target.
# Dataset 2 is applied second, so it wins when a read name occurs in both datasets.
bc1_dict = dict(zip(bc1_frame_1["#name"], bc1_frame_1["seq"]))
bc1_dict.update(zip(bc1_frame_2["#name"], bc1_frame_2["seq"]))

bc2_dict = dict(zip(bc2_frame_1["#name"], bc2_frame_1["seq"]))
bc2_dict.update(zip(bc2_frame_2["#name"], bc2_frame_2["seq"]))

lacI_dict = dict(zip(lacI_frame_1["#name"], lacI_frame_1["seq"]))
lacI_dict.update(zip(lacI_frame_2["#name"], lacI_frame_2["seq"]))

In [None]:
all_keys = np.unique(np.array(list(bc1_dict.keys()) + list(bc2_dict.keys()) + list(lacI_dict.keys()) ))
len(all_keys)

In [None]:
%%time
# Keep only reads for which both barcodes and the lacI CDS were all extracted.
id_list, bc1_list, bc2_list, cds_list = [], [], [], []

for read_name in all_keys:
    if (read_name not in bc1_dict) or (read_name not in bc2_dict) or (read_name not in lacI_dict):
        continue
    # The read ID is the read name without its last three characters
    id_list.append(read_name[:-3])
    bc1_list.append(bc1_dict[read_name])
    bc2_list.append(bc2_dict[read_name])
    cds_list.append(lacI_dict[read_name])

In [None]:
wild_type_cds = 'TCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCCAGGGTGGTTTTTCTTTTCACCAGTGAGACTGGCAACAGCTGATTGCCCTTCACCGCCTGGCCCTGAGAGAGTTGCAGCAAGCGGTCCACGCTGGTTTGCCCCAGCAGGCGAAAATCCTGTTTGATGGTGGTTAACGGCGGGATATAACATGAGCTATCTTCGGTATCGTCGTATCCCACTACCGAGATATCCGCACCAACGCGCAGCCCGGACTCGGTAATGGCGCGCATTGCGCCCAGCGCCATCTGATCGTTGGCAACCAGCATCGCAGTGGGAACGATGCCCTCATTCAGCATTTGCATGGTTTGTTGAAAACCGGACATGGCACTCCAGTCGCCTTCCCGTTCCGCTATCGGCTGAATTTGATTGCGAGTGAGATATTTATGCCAGCCAGCCAGACGCAGACGCGCCGAGACAGAACTTAATGGGCCCGCTAACAGCGCGATTTGCTGGTGACCCAATGCGACCAGATGCTCCACGCCCAGTCGCGTACCGTCCTCATGGGAGAAAATAATACTGTTGATGGGTGTCTGGTCAGAGACATCAAGAAATAACGCCGGAACATTAGTGCAGGCAGCTTCCACAGCAATGGCATCCTGGTCATCCAGCGGATAGTTAATGATCAGCCCACTGACGCGTTGCGCGAGAAGATTGTGCACCGCCGCTTTACAGGCTTCGACGCCGCTTCGTTCTACCATCGACACCACCACGCTGGCACCCAGTTGATCGGCGCGAGATTTAATCGCCGCGACAATTTGCGACGGCGCGTGCAGGGCCAGACTGGAGGTGGCAACGCCAATCAGCAACGACTGTTTGCCCGCCAGTTGTTGTGCCACGCGGTTGGGAATGTAATTCAGCTCCGCCATCGCCGCTTCCACTTTTTCCCGCGTTTTCGCAGAAACGTGGCTGGCCTGGTTCACCACGCGGGAAACGGTCTGATAAGAGACACCGGCATACTCTGCGACATCGTATAACGTTACTGGTTTCAT'

In [None]:
print(cds_list[0][:25])
print(wild_type_cds[:25])

In [None]:
pacbio_frame = pd.DataFrame({"id":id_list, "cterm-bc":bc2_list, "nterm-bc":bc1_list, "cds":cds_list})

In [None]:
len(pacbio_frame)

In [None]:
# Length of every CDS read, used below to keep only reads of the modal length.
pacbio_frame["cds_length"] = pacbio_frame["cds"].map(len)

In [None]:
print(pacbio_frame["cds_length"].mode())

In [None]:
print(pacbio_frame["cds_length"].max())

In [None]:
print(pacbio_frame["cds_length"].min())

In [None]:
cds_length_mode = pacbio_frame["cds_length"].mode().values[0]
cds_length_mode

In [None]:
plt.rcParams["figure.figsize"] = [8, 6]
fig, axs = plt.subplots(1, 1)

# Window of +/-10 around the modal CDS length; bin edges sit on half-integers
# so that every integer length falls in the middle of its own bin.
length_window = range(cds_length_mode-10, cds_length_mode+10)
bins = [length + 0.5 for length in length_window]

axs.hist(pacbio_frame["cds_length"], bins=bins, alpha=0.7, label="lacI");
axs.set_yscale('log');
axs.set_xticks(list(length_window[::2]));
leg = axs.legend(loc='upper right', bbox_to_anchor= (0.97, 0.97), ncol=1, borderaxespad=0)

# Number and fraction of reads whose CDS has exactly the modal length
has_mode_length = pacbio_frame["cds_length"]==cds_length_mode
new_length = int(has_mode_length.sum())
print(new_length)
print(new_length/len(pacbio_frame))

In [None]:
pacbio_frame = pacbio_frame[pacbio_frame["cds_length"]==pacbio_frame["cds_length"].mode().values[0]].copy()

In [None]:
# Drop reads that are missing either barcode.
# .copy() gives an independent frame, so the column assignments made in later cells
# do not operate on a slice of the previous frame (SettingWithCopyWarning).
has_both_barcodes = pacbio_frame["nterm-bc"].notnull() & pacbio_frame["cterm-bc"].notnull()
pacbio_frame = pacbio_frame[has_both_barcodes].copy()

In [None]:
print(len(pacbio_frame))

In [None]:
%%time

# Assign HiSeq cluster IDs to the PacBio barcodes; -1 marks a barcode with no matching cluster.
for_barcodeID_list = []
rev_barcodeID_list = []
for_match_num = 0
rev_match_num = 0

for nterm_bc, cterm_bc in zip(pacbio_frame["nterm-bc"], pacbio_frame["cterm-bc"]):
    # The N-terminal barcode is looked up as read
    if nterm_bc in for_barcode_clusterID_dict:
        for_match_num += 1
        for_id = for_barcode_clusterID_dict[nterm_bc]
    else:
        for_id = -1
    for_barcodeID_list.append(for_id)

    # The C-terminal barcode is looked up as its reverse complement
    cterm_bc_rc = str(Seq(cterm_bc).reverse_complement())
    if cterm_bc_rc in rev_barcode_clusterID_dict:
        rev_match_num += 1
        rev_id = rev_barcode_clusterID_dict[cterm_bc_rc]
    else:
        rev_id = -1
    rev_barcodeID_list.append(rev_id)

pacbio_frame["for_BC_ID"] = for_barcodeID_list
pacbio_frame["rev_BC_ID"] = rev_barcodeID_list

print(for_match_num)
print(rev_match_num)

In [None]:
len(pacbio_frame) - for_match_num

In [None]:
pacbio_BC_pairs = []

for f_bc, r_bc in zip(pacbio_frame["for_BC_ID"], pacbio_frame["rev_BC_ID"]):
    pacbio_BC_pairs.append(f"{f_bc}_{r_bc}")

pacbio_frame["dual_BC_ID"] = pacbio_BC_pairs

In [None]:
len(pacbio_frame)

In [None]:
# fraction of PacBio reads that have one of the matching barcodes
print(len(pacbio_frame[pacbio_frame["for_BC_ID"]!=-1])/len(pacbio_frame))
print(len(pacbio_frame[pacbio_frame["rev_BC_ID"]!=-1])/len(pacbio_frame))

In [None]:
# fraction of PacBio reads that have both matching barcodes
print(len(pacbio_frame[(pacbio_frame["rev_BC_ID"]>-1) & (pacbio_frame["for_BC_ID"]>-1)])/len(pacbio_frame))

In [None]:
# fraction of PacBio reads that have dual barcode matching a dual barcode from the HiSeq data
print(len(pacbio_frame[pacbio_frame["dual_BC_ID"].isin(hiseq_BC_pairs)])/len(pacbio_frame))

In [None]:
len(pacbio_frame[~pacbio_frame["dual_BC_ID"].isin(hiseq_BC_pairs)])

In [None]:
hiseq_BC_pairs_series = pd.Series(hiseq_BC_pairs)
hiseq_BC_pairs_series_2 = hiseq_BC_pairs_series[hiseq_BC_pairs_series.isin(pacbio_frame["dual_BC_ID"])]

In [None]:
# fraction of HiSeq double barcodes that have matching barcodes in the PacBio dataset
print(len(hiseq_BC_pairs_series_2)/len(hiseq_BC_pairs))

In [None]:
# number of HiSeq double barcodes that have matching barcodes in the PacBio dataset
print(len(hiseq_BC_pairs_series_2))

In [None]:
# number of unique dual barcodes found in the PacBio data
#    (not necessarily dual barcode pairs that showed up in the HiSeq)
print(len(pacbio_frame[pacbio_frame["dual_BC_ID"].str.contains("-1")==False]["dual_BC_ID"].unique()))

In [None]:
#number of barcodes in HiSeq dataset
len(hiseq_count_frame)

In [None]:
def distance(str1, str2):
    """Return the mismatch count between two equal-length sequences.

    A position where the characters differ counts 1, or 0.5 when either
    character is 'X'. Positions where the characters are equal count 0.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(str1) != len(str2):
        raise ValueError("Strand lengths are not equal!")

    count = 0
    for a, b in zip(str1, str2):
        if a == b:
            continue
        # A mismatch involving 'X' is only half a mismatch
        count += 0.5 if 'X' in (a, b) else 1

    return count

In [None]:
def trim_errors(err_list):
    """Discard outlier error counts, keeping only the best-agreeing reads.

    With two or fewer values the input list is returned unchanged. With three
    or more, the values are sorted and only those less than or equal to the
    second-smallest value are kept: if two reads have a low error relative to
    the consensus, they are assumed to be the "good" reads.

    The caller's list is not modified.

    Args:
        err_list: list of per-read error counts.

    Returns:
        The input list itself (length <= 2), or a new sorted list of the
        retained values.
    """
    if len(err_list) <= 2:
        return err_list

    # sorted() rather than list.sort(), so the caller's list is left untouched
    ordered = sorted(err_list)
    cutoff = ordered[1]
    return [err for err in ordered if err <= cutoff]

In [None]:
cds_length = cds_length_mode
cds_length

In [None]:
%%time

#Calculate consensus cds for each dual barcode and cds read error rate relative to consensus

# Seed the RNG so the random-barcode control gives the same result after a kernel restart.
RANDOM_CONTROL_SEED = 0
random.seed(RANDOM_CONTROL_SEED)

# Pool of cds reads whose forward and reverse barcodes both matched a cluster;
# the random control draws from this pool.
dual_cds_list = list(pacbio_frame[(pacbio_frame["for_BC_ID"]!=-1) & (pacbio_frame["rev_BC_ID"]!=-1)]["cds"].values)

# Group the PacBio cds reads by dual barcode once, instead of re-scanning the whole
# frame for every HiSeq barcode. Row order within each group is preserved.
cds_reads_by_dual_bc = pacbio_frame.groupby("dual_BC_ID", sort=False)["cds"].apply(list).to_dict()


def _consensus_and_error_rate(consensus_reads, scored_reads):
    """Build a consensus from consensus_reads and score scored_reads against it.

    Returns a tuple (consensus sequence, mean trimmed distance per base).
    """
    # NOTE(review): Bio.Alphabet / consensus_alpha appear to have been removed in
    # Biopython 1.78 - confirm the Biopython version this notebook is pinned to.
    alignment = MultipleSeqAlignment([ SeqRecord(Seq(x)) for x in consensus_reads ])
    summary_align = AlignInfo.SummaryInfo(alignment)
    consensus = str(summary_align.dumb_consensus(threshold=0.2, consensus_alpha=generic_dna))
    errors = trim_errors([ distance(c, consensus) for c in scored_reads ])
    return consensus, sum(errors)/len(errors)/cds_length


dual_concensus_cds_list = []
dual_cds_err_rate = []
dual_cluster_size_list = []
dual_cds_err_rate_rand = []

for bc_id in hiseq_count_frame["dual_BC_ID"]:
    # Local name chosen so the global `cds_list` built earlier is not overwritten
    bc_cds_reads = cds_reads_by_dual_bc.get(bc_id, [])
    num_reads = len(bc_cds_reads)

    dual_cluster_size_list.append(num_reads)

    if num_reads > 1:
        #first do non-random matches
        concensus_cds, err_rate = _consensus_and_error_rate(bc_cds_reads, bc_cds_reads)
        dual_concensus_cds_list.append(concensus_cds)
        dual_cds_err_rate.append(err_rate)

        #then do the same thing, but for randomly chosen sets of cds
        # NOTE(review): the consensus comes from the random reads, but the reads scored
        # against it are this barcode's own reads, not the random ones - confirm intended.
        random_reads = random.sample(dual_cds_list, num_reads)
        _, err_rate_rand = _consensus_and_error_rate(random_reads, bc_cds_reads)
        dual_cds_err_rate_rand.append(err_rate_rand)
    else:
        # A single read is its own consensus; no reads gives an empty consensus
        dual_concensus_cds_list.append(bc_cds_reads[0] if num_reads == 1 else "")
        dual_cds_err_rate.append(0)
        dual_cds_err_rate_rand.append(0)

hiseq_count_frame["concensus_cds"] = dual_concensus_cds_list
hiseq_count_frame["cds_error_rate"] = dual_cds_err_rate
hiseq_count_frame["pacbio_count"] = dual_cluster_size_list
hiseq_count_frame["cds_error_rate_rand"] = dual_cds_err_rate_rand

In [None]:
len(hiseq_count_frame)

In [None]:
# How many cds assignments?
print(len(hiseq_count_frame[hiseq_count_frame["concensus_cds"]!=""]))

In [None]:
%%time

#Calculate mutation rate of consensus cds relative to wild type cds

# Per-base distance from wild type; -1 marks barcodes without a consensus cds
concensus_mutation_rate_list = [
    fitness.hamming_distance(consensus_cds, wild_type_cds)/cds_length if consensus_cds!="" else -1
    for consensus_cds in hiseq_count_frame["concensus_cds"]
]

hiseq_count_frame["concensus_cds_mutation_rate"] = concensus_mutation_rate_list

In [None]:
print(len(hiseq_count_frame[hiseq_count_frame["pacbio_count"]==1]))

In [None]:
print(len(hiseq_count_frame[hiseq_count_frame["pacbio_count"]>1]))

In [None]:
plt.rcParams["figure.figsize"] = [16,8]
fig, axs = plt.subplots(1, 2)
fig.suptitle('PacBio Barcode Reads', y=0.95, fontsize=24)

# Half-integer bin edges so each integer read count gets its own bin
bins = [n+0.5 for n in range(-1,100)]

dual_cluster_size_list = hiseq_count_frame["pacbio_count"]
# Only barcodes with at least one and fewer than 1000 PacBio reads are plotted
plotted_counts = dual_cluster_size_list[(dual_cluster_size_list>0) & (dual_cluster_size_list<1000)]

# Left panel: log-scale histogram plus the reverse-cumulative distribution
log_ax, zoom_ax = axs
log_ax.hist(plotted_counts, bins=bins, alpha=1);
log_ax.hist(plotted_counts, cumulative=-1, bins=bins, histtype='step', color=sns.color_palette()[0]);
log_ax.set_yscale('log');

# Right panel: linear-scale histogram, zoomed in on the low-count region
zoom_ax.hist(plotted_counts, bins=bins, alpha=1, label="new");
zoom_ax.set_xlim(-1,50);

#axs[0].set_ylim(0.7,120000);
for ax in axs.flatten():
    ax.set_xlabel('PacBio Read Count', size=20)
    ax.set_ylabel('Number of Barcodes', size=20)
    ax.tick_params(labelsize=16);
#leg = axs[1].legend(loc='upper right', bbox_to_anchor= (0.97, 0.97), ncol=1, borderaxespad=0)

In [None]:
plt.rcParams["figure.figsize"] = [10,10]
fig, axs = plt.subplots(1, 1)

# x: PacBio reads per barcode; y: HiSeq counts (all plates, and plate 2 only)
f_x = hiseq_count_frame["pacbio_count"]
f_y = hiseq_count_frame["total_counts"]
f_y2 = hiseq_count_frame["total_counts_plate_2"]

# Plate-2 counts are overlaid with smaller, semi-transparent markers
for y_values, marker_style in ((f_y, {}), (f_y2, {"alpha": 0.5, "ms": 3})):
    axs.plot(f_x, y_values, "o", **marker_style);

axs.set_yscale('log');
axs.set_xscale('log');

axs.set_xlabel('PacBio Count per Barcode', size=20)
axs.set_ylabel('HiSeq Count per Barcode', size=20);
axs.tick_params(labelsize=16);

In [None]:
plt.rcParams["figure.figsize"] = [16,8]
fig, axs = plt.subplots(1, 2)

# Minimum number of PacBio reads for a barcode to be included
count_thresh = 2
meets_thresh = hiseq_count_frame["pacbio_count"]>=count_thresh

dual_error_rate_list = hiseq_count_frame.loc[meets_thresh, "cds_error_rate"]
dual_error_rate_list_3 = hiseq_count_frame.loc[hiseq_count_frame["pacbio_count"]>=3, "cds_error_rate"]
dual_error_rate_list_rand = hiseq_count_frame.loc[meets_thresh, "cds_error_rate_rand"]

# One bin per integer number of mismatches, expressed as a per-base rate
bins = [(n+0.5)/cds_length for n in range(-1,20)]

# Left panel is log scale, right panel linear
axs[0].set_yscale('log');

histogram_series = [
    (dual_error_rate_list, "matching barcodes"),
    (dual_error_rate_list_rand, "random barcodes"),
    (dual_error_rate_list_3, "matching with > 2 reads"),
]

for ax in axs:
    for rates, series_label in histogram_series:
        ax.hist(rates, alpha=0.5, bins=bins, label=series_label);

    ax.set_xlabel('CDS Mismatch Rate', size=20)
    ax.set_ylabel('Number of Barcodes', size=20);
    ax.tick_params(labelsize=16);

    leg = ax.legend(loc='upper right', bbox_to_anchor= (0.975, 0.975), ncol=1, borderaxespad=0, frameon=True, fontsize=16)
    leg.get_frame().set_edgecolor('k');

In [None]:
plt.rcParams["figure.figsize"] = [16,8]
fig, axs = plt.subplots(1, 2)

# Minimum number of PacBio reads for a barcode to be included
count_thresh = 2
meets_thresh = hiseq_count_frame["pacbio_count"]>=count_thresh

# dual_error_rate_list is assigned here but not plotted in this cell
dual_error_rate_list = hiseq_count_frame.loc[meets_thresh, "cds_error_rate"]
mutation_rate_list = hiseq_count_frame.loc[meets_thresh, "concensus_cds_mutation_rate"]
dual_error_rate_list_rand = hiseq_count_frame.loc[meets_thresh, "cds_error_rate_rand"]

# One bin per integer number of mismatches, expressed as a per-base rate
bins = [(n+0.5)/cds_length for n in range(-1,20)]

# Left panel is log scale, right panel linear
axs[0].set_yscale('log');

histogram_series = [
    (mutation_rate_list, "mutation rate"),
    (dual_error_rate_list_rand, "random barcodes"),
]

for ax in axs:
    for rates, series_label in histogram_series:
        ax.hist(rates, alpha=0.5, bins=bins, label=series_label);

    ax.set_xlabel('CDS Mismatch Rate', size=20)
    ax.set_ylabel('Number of Barcodes', size=20);
    ax.tick_params(labelsize=16);

    leg = ax.legend(loc='upper right', bbox_to_anchor= (0.975, 0.975), ncol=1, borderaxespad=0, frameon=True, fontsize=16)
    leg.get_frame().set_edgecolor('k');

In [None]:
current_palette = sns.color_palette()
sns.palplot(current_palette)

In [None]:
plt.rcParams["figure.figsize"] = [16,8]
fig, axs = plt.subplots(1, 2)

# Minimum number of PacBio reads for a barcode to be included
count_thresh = 2
meets_thresh = hiseq_count_frame["pacbio_count"]>=count_thresh

dual_error_rate_list = hiseq_count_frame.loc[meets_thresh, "cds_error_rate"]
# dual_error_rate_list_3 is assigned here but not plotted in this cell
dual_error_rate_list_3 = hiseq_count_frame.loc[hiseq_count_frame["pacbio_count"]>=3, "cds_error_rate"]
dual_error_rate_list_rand = hiseq_count_frame.loc[meets_thresh, "cds_error_rate_rand"]

# The difference-from-wild-type series uses a stricter read-count cut (count_thresh+2)
mutation_rate_list = hiseq_count_frame.loc[hiseq_count_frame["pacbio_count"]>=count_thresh+2,
                                           "concensus_cds_mutation_rate"]

# One bin per integer count
bins = [(n+0.5) for n in range(-1,21)]

# Left panel is log scale, right panel linear
axs[0].set_yscale('log');

# Rates are multiplied by cds_length below, so the plotted values are per-sequence counts
histogram_series = [
    (dual_error_rate_list, "matching barcodes", current_palette[0]),
    (dual_error_rate_list_rand, "random barcodes", current_palette[1]),
    (mutation_rate_list, "difference from w-t", current_palette[2]),
]

for ax in axs:
    for rates, series_label, series_color in histogram_series:
        ax.hist(rates*cds_length, alpha=0.5, bins=bins, label=series_label, color=series_color);

    # NOTE(review): the label says "Rate" but the plotted values are counts - consider renaming
    ax.set_xlabel('CDS Error/Mutation Rate', size=20)
    ax.set_ylabel('Number of Barcodes', size=20);
    ax.tick_params(labelsize=16);

    leg = ax.legend(loc='upper right', bbox_to_anchor= (0.975, 0.975), ncol=1, borderaxespad=0, frameon=True, fontsize=16)
    leg.get_frame().set_edgecolor('k');

In [None]:
len(hiseq_count_frame[hiseq_count_frame["pacbio_count"]==1])

In [None]:
2/cds_length

In [None]:
len(hiseq_count_frame)

In [None]:
# Selection 1: barcodes with exactly one PacBio read
hiseq_selection_1 = hiseq_count_frame["pacbio_count"].eq(1)

# Selection 2: barcodes with two or more reads whose error rate is below 2 mismatches per cds
max_confident_error_rate = 2/cds_length
hiseq_selection_2 = (
    hiseq_count_frame["pacbio_count"].ge(2)
    & hiseq_count_frame["cds_error_rate"].lt(max_confident_error_rate)
)

In [None]:
hiseq_count_frame["hasConfidentCds"] = (hiseq_selection_1 | hiseq_selection_2)

In [None]:
hiseq_count_frame[:5][["pacbio_count", "hasConfidentCds", "cds_error_rate"]]

In [None]:
# How many variants are at the margin: pacbio_count==2 and cds_error_rate==1/cds_length?
# Barcodes with exactly two PacBio reads and an error rate of exactly 1/cds_length
is_two_reads = hiseq_count_frame["pacbio_count"]==2
is_one_mismatch = hiseq_count_frame["cds_error_rate"]==1/cds_length
frame = hiseq_count_frame[is_two_reads & is_one_mismatch]
len(frame)

In [None]:
notebook_directory

In [None]:
data_directory

In [None]:
barcode_frame.notebook_dir

In [None]:
barcode_frame.data_directory

In [None]:
# This might be necessary because the Stan fits were run on AWS.
#    notebook_dir and data_directory should be set to the appropriate directories on the JCloud
#barcode_frame.notebook_dir = notebook_directory
#barcode_frame.data_directory = data_directory

In [None]:
barcode_frame.save_as_pickle() # Use this version when running with data in place on JCloud
#barcode_frame.save_as_pickle(notebook_dir=notebook_directory) # Use this version when running with data on a local PC/HD

In [None]:
# Export view: only barcodes with a confident cds assignment, restricted to the summary columns
out_columns = ["forward_BC", "reverse_BC", "total_counts", "dual_BC_ID",
               "cds_error_rate", "pacbio_count", "concensus_cds"]
out_frame = hiseq_count_frame.loc[hiseq_count_frame["hasConfidentCds"], out_columns]

In [None]:
# number of barcode pairs for which we have a good reason to think we know the cds
len(out_frame)

In [None]:
len(out_frame)/len(hiseq_count_frame)

In [None]:
out_frame[:3]

In [None]:
out_frame[out_frame["total_counts"]==out_frame["total_counts"].min()]