In [1]:
import numpy as np
import pandas as pd
import warnings
# NOTE: this silences every Python warning for the rest of the notebook, so
# pandas/numpy warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Variants we wanted to synthesize (includes nt constructs). Only the columns
# relevant to this analysis are kept, and only the first 244000 rows of the
# file could fit on the chip.
attempted_synthesis = (
    pd.read_csv("../Data/data_for_synthesis_pipeline/chip_df.csv")
    .loc[:, ["category", "coding"]]
    .iloc[:244000]
)

# First sequencing run; the file's first column is used as the row index.
run1_counts = (
    pd.read_csv("../Data/raw_counts_nextSeq_run1.txt", index_col=0)
    .rename(columns={'Unnamed: 0.1': 'nt_seq'})
)

# Second sequencing run (higher depth)
run2_counts = (
    pd.read_csv("../Data/raw_counts_nextSeq_run2.txt")
    .rename(columns={'Unnamed: 0': 'nt_seq'})
)

In [3]:
# Preview the first two rows of the run-1 counts table.
run1_counts.head(2)

Unnamed: 0,nt_seq,EK266_GAS1_p1_rep1a_plasmid,EK266_GAS1_p1_rep1b_plasmid,EK269_GAS1_p1_rep1a_plasmid,EK269_GAS1_p1_rep1b_plasmid,EK269_GAS1_p1_rep1c_plasmid,EK269_GAS1_p1_rep1d_plasmid,EK269_GAS1_v3_rep1a_virus,EK269_GAS1_v3_rep1b_virus,EK269_GAS1_v3_rep1c_virus,...,category,chip,control,is_wt_aa,is_wt_nt,mask,mut,rep_i,rep_original,rep_total
0,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,4,7,6,8,8,8,16,17,16,...,rnn_designed_plus_rand_train_walked,1,0,0,0,__D_____________________Dn_n_d_,5,1,1,1
1,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,23,11,11,12,13,13,36,47,34,...,rnn_designed_plus_rand_train_walked,1,0,0,0,__D______________A______Gd__d_,5,1,1,1


In [4]:
# Preview the first two rows of the run-2 counts table.
run2_counts.head(2)

Unnamed: 0,nt_seq,EK269_GAS1_p1_rep1a_plasmid,EK269_GAS1_p1_rep1b_plasmid,EK269_GAS1_p1_rep1c_plasmid,EK269_GAS1_p1_rep1d_plasmid,EK269_GAS1_v3_rep1a_virus,EK269_GAS1_v3_rep1b_virus,EK269_GAS1_v3_rep1c_virus,EK269_GAS1_v3_rep1d_virus,EK269_GAS1_v4_rep2a_virus,...,category,chip,control,is_wt_aa,is_wt_nt,mask,mut,rep_i,rep_original,rep_total
0,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,13,89,51,47,176,47,53,211,78,...,rnn_designed_plus_rand_train_walked,1,0,0,0,__D_____________________Dn_n_d_,5,1,1,1
1,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,29,129,100,63,439,131,130,563,269,...,rnn_designed_plus_rand_train_walked,1,0,0,0,__D______________A______Gd__d_,5,1,1,1


Keep only the nucleotide sequences that were actually attempted for synthesis.

In [5]:
# Drop run-1 rows whose nt_seq is not among the attempted coding sequences,
# reporting the row count before and after.
print(f'Before filter:{len(run1_counts)}')
run1_counts = run1_counts.loc[
    run1_counts["nt_seq"].isin(attempted_synthesis["coding"])
]
print(f'After filter:{len(run1_counts)}')

Before filter:485749
After filter:243481


Merge the runs together

In [6]:
# Left-join the run-2 counts onto the filtered run-1 rows, matching on the
# sequence and its replicate index. Other columns shared by both frames get
# pandas' default suffixes: `_x` for run 1 and `_y` for run 2.
runs_combined_df = pd.merge(
    run1_counts,
    run2_counts,
    how="left",
    on=["nt_seq", "rep_i"],
)

In [7]:
# Re-apply the attempted-sequence filter to the merged table and report the
# row count before and after.
print(f'Before filter:{len(runs_combined_df)}')
runs_combined_df = runs_combined_df.loc[
    runs_combined_df["nt_seq"].isin(attempted_synthesis["coding"])
]
print(f'After filter:{len(runs_combined_df)}')

Before filter:243481
After filter:243481


In [8]:
# Sanity check: (rows with a non-null nt_seq, total rows, number of distinct attempted coding sequences)
len(runs_combined_df['nt_seq'].dropna()),len(runs_combined_df),len(set(attempted_synthesis.coding))

(243481, 243481, 243483)

In [9]:
# Inspect the last five rows of the merged table.
runs_combined_df.tail(5)

Unnamed: 0,nt_seq,EK266_GAS1_p1_rep1a_plasmid,EK266_GAS1_p1_rep1b_plasmid,EK269_GAS1_p1_rep1a_plasmid_x,EK269_GAS1_p1_rep1b_plasmid_x,EK269_GAS1_p1_rep1c_plasmid_x,EK269_GAS1_p1_rep1d_plasmid_x,EK269_GAS1_v3_rep1a_virus_x,EK269_GAS1_v3_rep1b_virus_x,EK269_GAS1_v3_rep1c_virus_x,...,aa_y,category_y,chip_y,control_y,is_wt_aa_y,is_wt_nt_y,mask_y,mut_y,rep_original_y,rep_total_y
243476,GACGAAGAGGAAATCAGGACAACCAATCCCGTGGCTACGGAGCCAT...,13,12,16,11,23,17,5,2,1,...,DEEEIRTTNPVATEPYGSVSTNLQRPNR,random_doubles,1,0,0,0,______________P__________P__,2,1,1
243477,GACGAAGAGGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,7,19,7,10,19,10,0,0,0,...,DEEEIRTTNPVATEQYGShVSTNAQRGNR,random_doubles,1,0,0,0,__________________h____A_____,2,1,1
243478,GACGAAGAGGAATTCGAAACAACCAATCCCGTGGCTACGGAGCAGT...,13,7,15,16,19,13,1,1,1,...,DEEEFETTNPVATEQYGSVSTNLQRGNR,random_doubles,1,0,0,0,____FE______________________,2,1,1
243479,GACGAAGAGGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,7,6,16,11,15,17,278,345,168,...,DEEEIRTTNPVATEQYGSVSDNLQRaGNR,random_doubles,1,0,0,0,____________________D____a___,2,1,1
243480,GACGAACGAGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,12,14,17,9,20,16,3,7,2,...,DEREIRTTNPVATEQYGSVSTNPQRGNR,random_doubles,1,0,0,0,__R___________________P_____,2,1,1


In [10]:
# helper function for splitting columns in different ways
def select_col(data,key,exp,complete=True,subset=None):
    """Return a list of column names from ``data`` chosen by substring rules.

    Columns whose name starts with ``"EK"`` are kept only when the name
    contains both ``key`` and ``exp``. All other columns are kept only when
    ``complete`` is truthy. If ``subset`` is given (and non-empty), the
    resulting list is further narrowed to names containing ``subset``;
    this narrowing applies to every kept column, not just the ``"EK"`` ones.

    Parameters
    ----------
    data : object with a ``columns`` iterable of strings (e.g. a DataFrame)
    key : str
        Substring required in ``"EK"``-prefixed column names.
    exp : str
        Second substring required in ``"EK"``-prefixed column names.
    complete : bool, default True
        Whether to keep columns that do not start with ``"EK"``.
    subset : str or None, default None
        Optional substring that every returned name must contain.

    Returns
    -------
    list of str
        Matching column names, in the order they appear in ``data.columns``.
    """
    picked = [
        name
        for name in data.columns
        if ((key in name and exp in name) if name.startswith("EK") else complete)
    ]
    if subset:
        picked = [name for name in picked if subset in name]
    return picked

Compute the total plasmid count for each variant based on the counts for all replicates in the two runs.

In [11]:
# Per-variant plasmid total: row-wise sum over every EK269 / GAS1 column whose
# name contains "plasmid" (the suffixed columns from both runs are included).
runs_combined_df["GAS1_plasmid_N"] = runs_combined_df.loc[
    :,
    select_col(runs_combined_df, "EK269", "GAS1", complete=False, subset="plasmid"),
].sum(axis=1)

Compute the total virus count for each variant based on the counts for all replicates in the two runs

In [12]:
# Per-variant virus total: row-wise sum over every EK269 / GAS1 column whose
# name contains "virus" (the suffixed columns from both runs are included).
runs_combined_df["GAS1_virus_N"] = runs_combined_df.loc[
    :,
    select_col(runs_combined_df, "EK269", "GAS1", complete=False, subset="virus"),
].sum(axis=1)

Compute the frequency of each variant in the plasmid library : $f_{plasmid}$

In [13]:
# f_plasmid: each variant's share of the summed plasmid counts.
runs_combined_df["GAS1_plasmid_F"] = runs_combined_df["GAS1_plasmid_N"].div(
    runs_combined_df["GAS1_plasmid_N"].sum()
)

Compute the frequency of each variant in the virus library:  $f_{virus}$

In [14]:
# f_virus: each variant's share of the summed virus counts.
runs_combined_df["GAS1_virus_F"] = runs_combined_df["GAS1_virus_N"].div(
    runs_combined_df["GAS1_virus_N"].sum()
)

Compute selection as $S_i= \log_2 \big(\frac{f_{virus}}{f_{plasmid}}\big)$

In [15]:
# Selection score S = log2(f_virus / f_plasmid).
# NOTE: no pseudocount is applied, so a variant with zero virus counts gets
# -inf, and a variant with zero plasmid counts gets inf (or NaN when both are
# zero).
runs_combined_df["GAS1_virus_S"] = np.log2(
    runs_combined_df["GAS1_virus_F"].div(runs_combined_df["GAS1_plasmid_F"])
)

In [16]:
# Preview the first rows, including the newly added count, frequency and selection columns.
runs_combined_df.head()

Unnamed: 0,nt_seq,EK266_GAS1_p1_rep1a_plasmid,EK266_GAS1_p1_rep1b_plasmid,EK269_GAS1_p1_rep1a_plasmid_x,EK269_GAS1_p1_rep1b_plasmid_x,EK269_GAS1_p1_rep1c_plasmid_x,EK269_GAS1_p1_rep1d_plasmid_x,EK269_GAS1_v3_rep1a_virus_x,EK269_GAS1_v3_rep1b_virus_x,EK269_GAS1_v3_rep1c_virus_x,...,is_wt_nt_y,mask_y,mut_y,rep_original_y,rep_total_y,GAS1_plasmid_N,GAS1_virus_N,GAS1_plasmid_F,GAS1_virus_F,GAS1_virus_S
0,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,4,7,6,8,8,8,16,17,16,...,0,__D_____________________Dn_n_d_,5,1,1,230,1143,3e-06,5e-06,0.572274
1,GACGAGGACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,23,11,11,12,13,13,36,47,34,...,0,__D______________A______Gd__d_,5,1,1,370,3042,6e-06,1.4e-05,1.298578
2,GACGAAGAGGAAATCGCTACAACCAATCCCGTGGCTACGGAGCAGT...,10,16,12,9,14,11,14,12,8,...,0,_____A__________________Hd_De_,5,1,1,350,2307,5e-06,1e-05,0.979746
3,GACGAACACGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,27,20,12,11,37,32,76,64,63,...,0,__H______________N______Gg_d__,5,1,1,879,6243,1.3e-05,2.8e-05,1.087459
4,GACGAACATGAAATCAGGACAACCAATCCCGTGGCTACGGAGCAGT...,19,14,15,17,10,10,50,59,22,...,0,__H_____________________pG__Dg,5,1,1,479,4126,7e-06,1.9e-05,1.365801


Save output

In [17]:
# Write the merged counts and selection scores to disk. With to_csv's default
# index=True, the row index is also written as the first (unnamed) column.
runs_combined_df.to_csv("../Data/library_w_selection_scores.csv")