In [None]:
import glob #filenames and pathnames utility
import os   #operating system utility

import matplotlib.pyplot as plt
from matplotlib import colors
#from matplotlib.backends.backend_pdf import PdfPages

import gsf_ims_fitness as fitness

import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import scipy
import scipy.special as sp
#from scipy import special
#from scipy import misc

from Bio.Seq import Seq

#import pystan
import pickle

import seaborn as sns
sns.set()

%load_ext autoreload
%autoreload 2

%matplotlib inline

%autosave 0

# Plot styling: select the seaborn "ticks" style with overrides so that ticks
# point inward and are also drawn on the top and right sides of each axes.
sns.set_style("white")
sns.set_style("ticks", {'xtick.direction':'in', 'xtick.top':True, 'ytick.direction':'in', 'ytick.right':True})

In [None]:
# Minimum total read count for a barcode to be used in the analysis.
# NOTE(review): this threshold is not referenced in the cells visible here - confirm where it is applied.
total_count_threshold = 3000 # Only use data with > this number of total counts

In [None]:
# Record the current working directory of the kernel and display it.
notebook_directory = os.getcwd()
notebook_directory

In [None]:
# List the files in the working directory whose names end in "BarSeqFitnessFrame.pkl".
glob.glob("*BarSeqFitnessFrame.pkl")

In [None]:
# Load the previously pickled experiment object and grab its per-barcode data frame.
pickle_file = '2019-10-16_IPTG_Select-DNA-5-plates_BarSeqFitnessFrame.pkl'

# Use a context manager so the file handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load pickle files from a trusted source.
with open(pickle_file, 'rb') as pickle_handle:
    HiSeq_data = pickle.load(pickle_handle)

# count_frame_16 refers to the frame stored on HiSeq_data; columns added to it below
# are added to that same object (assuming barcode_frame is a plain attribute - confirm).
count_frame_16 = HiSeq_data.barcode_frame

In [None]:
# Data cleaning for NN modeling with Hill equation fits;
#     add column with number of points where Hill equation fit is within 5%-95% confidence region of GP model fit
def hill_funct(x, low, high, mid, n):
    """Evaluate a Hill curve at ``x``.

    Computes ``low + (high - low) * x**n / (mid**n + x**n)``.

    Parameters
    ----------
    x : scalar or array
        Point(s) at which to evaluate the curve.
    low : scalar
        Value of the curve at ``x = 0`` (for ``n > 0``).
    high : scalar
        Value the curve approaches as ``x`` grows large (for ``n > 0``).
    mid : scalar
        Value of ``x`` at which the curve is halfway between ``low`` and ``high``.
    n : scalar
        Exponent applied to ``x`` and ``mid``.

    Returns
    -------
    Same shape as ``x``: the curve evaluated at each point.
    """
    x_pow_n = x**n
    return low + (high-low)*( x_pow_n )/( mid**n + x_pow_n )

def fit_funct(x, log_g_min, log_g_max, log_x_50, log_nx, *argv):
    """Evaluate the Hill curve from base-10 log parameters.

    Each of the four named parameters is raised as a power of 10 and passed to
    ``hill_funct`` as ``low``, ``high``, ``mid`` and ``n`` respectively.

    Any extra positional arguments (``*argv``) are accepted and ignored, so a
    longer parameter vector can be unpacked directly into this function.
    """
    g_min = 10**log_g_min
    g_max = 10**log_g_max
    x_50 = 10**log_x_50
    hill_n = 10**log_nx
    return hill_funct(x, g_min, g_max, x_50, hill_n)


frame = count_frame_16
x = np.array(HiSeq_data.inducer_conc_list)

hill_params = frame["sensor_params"]
gp_quantiles = frame["sensor_GP_g_quantiles"]

# For each row, count the inducer concentrations at which the Hill curve lies
# strictly between the first and last stored GP quantile curves.
# NOTE(review): gp[0] and gp[-1] are presumably the 5% and 95% quantiles in log10
# units, per the comment at the top of this cell - confirm.
num_points_list = []
for hill, gp in zip(hill_params, gp_quantiles):
    hill_has_nan = np.isnan(hill).any()
    gp_has_nan = np.isnan(gp).any()
    if hill_has_nan or gp_has_nan:
        # -1 marks rows where either fit contains NaN, so no comparison is possible
        num_points_list.append(-1)
        continue

    g_out = fit_funct(x, *hill)
    low_bound, high_bound = 10**gp[0], 10**gp[-1]
    inside_band = (g_out > low_bound) & (g_out < high_bound)
    num_points_list.append(len(g_out[inside_band]))

count_frame_16["good_hill_fit_points"] = num_points_list

In [None]:
# Histogram (log-scaled counts) of the number of "good" Hill-fit points per row,
# followed by the number of rows where that count equals 12.
plt.rcParams["figure.figsize"] = [6,6]
fig, axs = plt.subplots(1, 1)

good_points = count_frame_16["good_hill_fit_points"]

# Unit-width bins centred on the integers -1 .. 12 (edges at -1.5, -0.5, ..., 12.5)
bins = [edge - 0.5 for edge in range(-1, 14)]
axs.hist(good_points, bins=bins);
axs.set_yscale("log")

all_good_rows = count_frame_16[good_points == 12]
print(len(all_good_rows))

In [None]:
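# Add columns for amino acid sequence, amino acid mutation number, and mutation codes.
# The wild-type lacI DNA sequence below is the reference for those columns; note that it is
# reverse-complemented before being translated in the following cells.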
wild_type_cds = 'TCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCCAGGGTGGTTTTTCTTTTCACCAGTGAGACTGGCAACAGCTGATTGCCCTTCACCGCCTGGCCCTGAGAGAGTTGCAGCAAGCGGTCCACGCTGGTTTGCCCCAGCAGGCGAAAATCCTGTTTGATGGTGGTTAACGGCGGGATATAACATGAGCTATCTTCGGTATCGTCGTATCCCACTACCGAGATATCCGCACCAACGCGCAGCCCGGACTCGGTAATGGCGCGCATTGCGCCCAGCGCCATCTGATCGTTGGCAACCAGCATCGCAGTGGGAACGATGCCCTCATTCAGCATTTGCATGGTTTGTTGAAAACCGGACATGGCACTCCAGTCGCCTTCCCGTTCCGCTATCGGCTGAATTTGATTGCGAGTGAGATATTTATGCCAGCCAGCCAGACGCAGACGCGCCGAGACAGAACTTAATGGGCCCGCTAACAGCGCGATTTGCTGGTGACCCAATGCGACCAGATGCTCCACGCCCAGTCGCGTACCGTCCTCATGGGAGAAAATAATACTGTTGATGGGTGTCTGGTCAGAGACATCAAGAAATAACGCCGGAACATTAGTGCAGGCAGCTTCCACAGCAATGGCATCCTGGTCATCCAGCGGATAGTTAATGATCAGCCCACTGACGCGTTGCGCGAGAAGATTGTGCACCGCCGCTTTACAGGCTTCGACGCCGCTTCGTTCTACCATCGACACCACCACGCTGGCACCCAGTTGATCGGCGCGAGATTTAATCGCCGCGACAATTTGCGACGGCGCGTGCAGGGCCAGACTGGAGGTGGCAACGCCAATCAGCAACGACTGTTTGCCCGCCAGTTGTTGTGCCACGCGGTTGGGAATGTAATTCAGCTCCGCCATCGCCGCTTCCACTTTTTCCCGCGTTTTCGCAGAAACGTGGCTGGCCTGGTTCACCACGCGGGAAACGGTCTGATAAGAGACACCGGCATACTCTGCGACATCGTATAACGTTACTGGTTTCAT'


In [None]:
# Display the length (in nucleotides) of the wild-type sequence string.
len(wild_type_cds)

In [None]:
# Wild-type protein sequence as a plain string: the stored DNA is
# reverse-complemented first and then translated.
wild_type_aminos = str(Seq(wild_type_cds).reverse_complement().translate())

In [None]:
# Display the last six characters of the wild-type translation
# (presumably a sanity check on how the translated sequence ends).
wild_type_aminos[-6:]

In [None]:
%%time
# Add column for amino acid sequence and distance from wild-type
#
# For every row flagged with a confident CDS, the consensus CDS is
# reverse-complemented and translated, and its distance to the wild-type protein
# is computed with fitness.hamming_distance. Rows without a confident CDS, and
# rows where translation or the distance computation fails, get an empty
# sequence and a distance of -1.

lacI_amino_seq = []
amino_distance = []
num_failed_translations = 0

for index, row in count_frame_16.iterrows():
    # Sentinel values used unless translation and comparison both succeed
    trans_this = ""
    dist = -1
    if row["hasConfidentCds"]:
        try:
            trans_this = str(Seq(row["concensus_cds"]).reverse_complement().translate())
            dist = fitness.hamming_distance(wild_type_aminos, trans_this)
        except Exception:
            # Best-effort: keep going on bad sequences, but do not swallow
            # KeyboardInterrupt/SystemExit (which a bare `except:` would),
            # and count the failures so they are visible.
            trans_this = ""
            dist = -1
            num_failed_translations += 1

    lacI_amino_seq.append(trans_this)
    amino_distance.append(dist)
    # Progress indicator
    if index%5000 == 0:
        print(index)

print(f"{num_failed_translations} rows with a confident CDS could not be translated/compared")

count_frame_16["lacI_amino_seq"] = lacI_amino_seq
count_frame_16["lacI_amino_mutations"] = amino_distance

In [None]:
%%time

# For each variant, build the list of amino acid substitutions relative to wild-type.
# Each code is "<wild-type residue><1-based position><variant residue>".
aminos_list = list(count_frame_16["lacI_amino_seq"])
amino_distance = list(count_frame_16["lacI_amino_mutations"])

mutations_lists = []

for amino, dist in zip(aminos_list, amino_distance):
    if 0 <= dist <= 12:
        # All positions are compared. zip() stops at the shorter of the two
        # sequences, so residues beyond the shorter length are not reported.
        # (An earlier variant of this cell compared only the first 324 residues,
        #  i.e. amino[:324] vs wild_type_aminos[:324], to exclude the tetramerization domains.)
        mutations = [
            f"{wt_res}{position}{var_res}"
            for position, (var_res, wt_res) in enumerate(zip(amino, wild_type_aminos), start=1)
            if var_res != wt_res
        ]
    else:
        # Variants with more than 12 differences, and rows carrying the -1
        # sentinel distance, get an empty list.
        mutations = []
    mutations_lists.append(mutations)

count_frame_16["mutation_codes"] = mutations_lists

In [None]:

# Unpack the per-row fit results into separate columns: for each named parameter,
# one column for the fitted value, one for its error (square root of the matching
# diagonal entry of the covariance matrix), and one for its samples.
fit_params_list = []
fit_errs_list = []
fit_samples_list = []

for row_label, row in count_frame_16.iterrows():
    params = row["sensor_params"]
    param_errs = np.sqrt(np.diagonal(row["sensor_params_cov"]))
    param_samples = row["sensor_stan_samples"]
    if len(params) < 7:
        # Rows with fewer than 7 parameters get NaN placeholders so that every
        # row contributes arrays of the same shape (7 values, 7 x 32 samples).
        params = np.full((7), np.nan)
        param_errs = np.full((7), np.nan)
        param_samples = np.full((7, 32), np.nan)
    fit_params_list.append(params)
    fit_errs_list.append(param_errs)
    fit_samples_list.append(param_samples)

# Re-orient so the first axis indexes the parameter rather than the row
fit_params_arr = np.array(fit_params_list).T
fit_errs_arr = np.array(fit_errs_list).T
fit_samples_list = np.array(fit_samples_list).transpose((1, 0, 2))

param_names = ["log_low_level", "log_high_level", "log_ic50", "log_n", "log_high_low_ratio"]

# zip() stops after the five names above, so any parameters beyond the fifth
# are not written to columns.
# NOTE(review): confirm that leaving the remaining parameters out is intended.
for name, params_y, err_y, samples in zip(param_names, fit_params_arr, fit_errs_arr, fit_samples_list):
    count_frame_16[f'{name}'] = params_y
    count_frame_16[f'{name} error'] = err_y
    # One array of samples per row
    count_frame_16[f'{name} samples'] = list(samples)

In [None]:
# Re-pickle and save to hdf

In [None]:
# Display the experiment attribute of the loaded object.
HiSeq_data.experiment

In [None]:
# Display the notebook directory stored on the loaded object
# (presumably where save_as_pickle writes by default - confirm).
HiSeq_data.notebook_dir

In [None]:
# Re-pickle the experiment object using its own save method with default arguments.
HiSeq_data.save_as_pickle()
# Alternative: pass the current working directory explicitly as the notebook directory.
#HiSeq_data.save_as_pickle(notebook_dir=os.getcwd())

In [None]:
# Display the current working directory (relative output paths below resolve against it).
os.getcwd()

In [None]:
# Save the data frame, including the columns added above, to an HDF5 file under the key "count_frame_16".
# NOTE(review): the file name repeats the stem of pickle_file by hand - keep the two in sync.
count_frame_16.to_hdf('2019-10-16_IPTG_Select-DNA-5-plates_BarSeqFitnessFrame.hdf', key="count_frame_16")

In [None]:
# Histograms of the log_high_level error (left panel) and log_low_level error
# (right panel). Each panel overlays the full library with the subset of rows
# having log_low_level > 4.5, using the same bin edges for both.

plt.rcParams["figure.figsize"] = [12,6]
fig, axs = plt.subplots(1, 2)

error_columns = ["log_high_level error", "log_low_level error"]

for ax, error_col in zip(axs, error_columns):
    # Full library (NaN errors removed); this call also fixes the bin edges
    x = np.array(count_frame_16[error_col])
    x = x[~np.isnan(x)]
    n, bins, p = ax.hist(x, bins=50, density=False, alpha=0.7)

    # Subset with log_low_level > 4.5, drawn on the same bins
    plot_frame = count_frame_16[count_frame_16["log_low_level"]>4.5]
    x = np.array(plot_frame[error_col])
    x = x[~np.isnan(x)]
    ax.hist(x, bins=bins, density=False, alpha=0.7)

# Left panel only: vertical marker at an error of 0.7. The y-limits are read back
# and re-applied first so that drawing the line does not change the axis range.
axs[0].set_yscale("log");
ylim = axs[0].get_ylim()
axs[0].set_ylim(ylim)
axs[0].plot([0.7, 0.7], ylim, color='k');

# Log-scale the counts on both panels
for ax in axs:
    ax.set_yscale("log");

In [None]:
# Plot the Hill parameters of the variants whose fit errors exceed the thresholds:
# one group for large log_high_level error, one for large log_low_level error.
plot_frame = count_frame_16  # optional restriction, currently disabled: [count_frame_16["log_low_level"]>4.5]

in_labels = ["log_high_level error>0.7", "log_low_level error>0.64"]
in_colors = ["indigo", "firebrick"]

high_level_err_mask = plot_frame["log_high_level error"]>0.7
low_level_err_mask = plot_frame["log_low_level error"]>0.64
plot_frame1 = plot_frame[high_level_err_mask]
plot_frame2 = plot_frame[low_level_err_mask]

HiSeq_data.plot_hill_params(
    [plot_frame1, plot_frame2],
    in_labels=in_labels,
    in_colors=in_colors,
    error_bars=False,
);

In [None]:
# Report how many rows are in plot_frame, then draw the fitness and difference
# curves (with the GP shown) for the first 20 index values, one at a time.
print(len(plot_frame))
first_indices = plot_frame.index[:20]
for ind in first_indices:
    HiSeq_data.plot_fitness_and_difference_curves(
        plot_range=[ind, ind],
        include_ref_seqs=False,
        show_GP=True,
    )