Copyright 2021 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Overview

This notebook summarizes the number of aptamers that appear to be enriched in the positive pools of particular particle display experiments.  These values are turned into Venn diagrams and pie charts in Figure 2. 

The inputs are csvs, where each row is an aptamer and columns indicate the sequencing counts within each particle display subexperiment.



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd    

# Parameters used in Manuscript



In [None]:
# Required coverage level for analysis.  This is in units of number of aptamer 
# particles (beads). This is used to minimize potential contamination. 
# For example, a tolerated bead fraction of 0.2 means that if, based on read 
# depth and number of beads, there are 100 reads expected per bead, then 
# sequences with fewer than 20 reads would be excluded from analysis.
# The filter is applied as a strict '>' comparison on each sequence's share of
# the pool's reads (see generate_cutoffs_via_PD_stats).
TOLERATED_BEAD_FRAC = 0.2 

# Ratio cutoff between positive and negative pools to count as being real.
# The ratio is pos_frac / (pos_frac + neg_frac), where each frac is the
# sequence's share of the reads in that pool, i.e. it is normalized by read
# depth. A ratio of 0.5 therefore means the sequence has equal normalized read
# depth in the positive and negative pools. As a toy example, if the 
# positive pool had 100 reads total and the negative pool had 200 reads total,
# then a sequence with 5 reads in the positive pool and 10 reads in the 
# negative pool would have a ratio of 0.5.
# The filter keeps sequences whose ratio is strictly greater than the cutoff,
# so a sequence sitting exactly at 0.5 is not counted as positive.
POS_NEG_RATIO_CUTOFF = 0.5

# Minimum required reads (when 0 it uses only the above filters). A sequence
# must have strictly more raw reads than this value to be kept.
MIN_READ_THRESH = 0

# Load in data

##  Load in experimental conditions for Particle Display experiments

The `pd_param_df` and `mlpd_param_df` dataframes defined below contain the experimental information for the PD and MLPD experiments, respectively.

Parameters are:
* apt_collected: The number of aptamer bead particles collected during the FACS experiment of particle display.
* apt_screened: The number of aptamer bead particles screened in order to get the apt_collected beads. 
* seq_input: The estimated number of unique sequences in the input sequence library during bead construction.

In [None]:
#@title Original PD Data Parameters

# The parameter table is tiny, so it is embedded directly in the colab instead
# of being loaded from a file. Every list holds one entry per sort stringency,
# in the order High, Medium, Low.
stringency = ['High', 'Medium', 'Low']

# Names of the positive-pool read-count columns in the PD results dataframe.
conditions = [
    'round2_high_no_serum_positive',
    'round2_medium_no_serum_positive',
    'round2_low_no_serum_positive',
]

# Names of the boolean columns that build_seq_sets_from_df writes per condition.
flags = [
    'round2_high_no_serum_flag',
    'round2_medium_no_serum_flag',
    'round2_low_no_serum_flag',
]

# Bead counts screened / collected for each sort, and the library size.
apt_screened_list = [2.4*10**6, 2.4*10**6, 1.24*10**6]
apt_collected_list = [3.5 * 10**4, 8.5 * 10**4, 8 * 10**4]
seq_input = [10**5] * len(conditions)

# One row per stringency level; column order matches the displayed table.
pd_param_df = pd.DataFrame({
    'apt_screened': apt_screened_list,
    'apt_collected': apt_collected_list,
    'seq_input': seq_input,
    'condition': conditions,
    'condition_flag': flags,
    'stringency': stringency,
})

pd_param_df

Unnamed: 0,apt_screened,apt_collected,seq_input,condition,condition_flag,stringency
0,2400000.0,35000.0,100000,round2_high_no_serum_positive,round2_high_no_serum_flag,High
1,2400000.0,85000.0,100000,round2_medium_no_serum_positive,round2_medium_no_serum_flag,Medium
2,1240000.0,80000.0,100000,round2_low_no_serum_positive,round2_low_no_serum_flag,Low


In [None]:
#@title MLPD Data Parameters

# Every list holds one entry per sort stringency, in the order
# Very High, High, Medium, Low.
stringency = ['Very High', 'High', 'Medium', 'Low']

# Names of the positive-pool read-count columns in the MLPD results dataframe.
conditions = [
    'round1_very_positive',
    'round1_high_positive',
    'round1_medium_positive',
    'round1_low_positive',
]

# Names of the boolean columns that build_seq_sets_from_df writes per condition.
flags = [
    'round1_very_flag',
    'round1_high_flag',
    'round1_medium_flag',
    'round1_low_flag',
]

# Bead counts screened / collected for each sort, and the library size.
apt_screened_list = [3283890.016, 6628573.952, 5801469.696, 3508412.512]
apt_collected_list = [12204, 50353, 153845, 201255]
seq_input = [200000] * len(conditions)

# One row per stringency level; column order matches the displayed table.
mlpd_param_df = pd.DataFrame({
    'apt_screened': apt_screened_list,
    'apt_collected': apt_collected_list,
    'seq_input': seq_input,
    'condition': conditions,
    'condition_flag': flags,
    'stringency': stringency,
})


mlpd_param_df

Unnamed: 0,apt_screened,apt_collected,seq_input,condition,condition_flag,stringency
0,3283890.016,12204,200000,round1_very_positive,round1_very_flag,Very High
1,6628573.952,50353,200000,round1_high_positive,round1_high_flag,High
2,5801469.696,153845,200000,round1_medium_positive,round1_medium_flag,Medium
3,3508412.512,201255,200000,round1_low_positive,round1_low_flag,Low


## Load CSVs

In [None]:
# PD and MLPD sequencing counts across experiments
# Upload pd_clustered_input_data_manuscript.csv and mlpd_input_data_manuscript.csv
# NOTE: google.colab is only importable inside a Colab runtime, so this cell
# will fail in a plain Jupyter kernel. The loading cells open the two CSVs by
# relative path, so outside Colab the files can instead be placed in the
# working directory and this cell skipped.
from google.colab import files

# Prompt for the files; the result is indexed by file name below.
uploaded = files.upload()

# Echo each uploaded file name and the length of its content as a sanity check.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving mlpd_input_data_manuscript.csv to mlpd_input_data_manuscript.csv
Saving pd_clustered_input_data_manuscript.csv to pd_clustered_input_data_manuscript.csv
User uploaded file "mlpd_input_data_manuscript.csv" with length 24198752 bytes
User uploaded file "pd_clustered_input_data_manuscript.csv" with length 64368063 bytes


In [None]:
# Load PD Data
# The CSV is read from the working directory (relative path). Later cells use
# the 'sequence' column and the round2_{high,medium,low}_no_serum_positive /
# _negative read-count columns of this dataframe.
with open('pd_clustered_input_data_manuscript.csv') as f:
  pd_input_df = pd.read_csv(f)

In [None]:
# Load MLPD data
# The CSV is read from the working directory (relative path). Later cells use
# the 'sequence' column and the round1_{very,high,medium,low}_positive /
# _negative read-count columns of this dataframe.
with open('mlpd_input_data_manuscript.csv') as f:
  mlpd_input_df = pd.read_csv(f)

# Helper functions




In [None]:
def generate_cutoffs_via_PD_stats(df, col, apt_screened, apt_collected, seq_input,
                                  tolerated_bead_frac, min_read_thresh):
  """Selects the sequences whose read depth clears the bead-coverage filter.

  The experimental parameters are converted into a minimum fraction of the
  sequenced pool. A sequence is kept only when its share of the reads in `col`
  is strictly above that fraction and its raw read count is strictly above
  `min_read_thresh`.

  Args:
    df: Pandas dataframe with experiment results. Must contain the read-count
      column named by `col` and a column 'sequence'.
    col: The string name of the column in `df` holding the read counts.
    apt_screened: The number of aptamer particles screened, from the
      experiment parameters.
    apt_collected: The number of aptamer particles collected, from the
      experiment parameters.
    seq_input: The number of unique sequences in the sequence library used to
      construct the aptamer particles.
    tolerated_bead_frac: The float tolerated bead fraction threshold, i.e. the
      sequencing depth required to keep a sequence, expressed as a fraction of
      the average expected read depth of one bead.
    min_read_thresh: The minimum raw read count; a sequence needs strictly
      more reads than this to be kept.

  Returns:
    Pandas series with the 'sequence' values of the rows that pass both
    filters (original row index preserved).
  """
  # Expected number of screened beads per unique library sequence.
  beads_per_sequence = apt_screened / seq_input
  # Scale down to the fraction of that coverage we are willing to accept.
  min_beads = beads_per_sequence * tolerated_bead_frac
  # Express the requirement as a fraction of the collected pool.
  min_pool_frac = (1. / apt_collected) * min_beads

  reads = df[col]
  total_reads = reads.sum()

  # Two independent filters: share of the pool, and raw read count.
  clears_pool_frac = (reads / total_reads) > min_pool_frac
  clears_raw_count = reads > min_read_thresh

  return df[clears_pool_frac & clears_raw_count].sequence


def generate_pos_neg_normalized_ratio(df, col_prefix):
  """Adds pool-fraction and pos/neg ratio columns for one sort condition.

  The dataframe is modified in place and also returned.

  Args:
    df: Pandas dataframe, expected to have columns [col_prefix]_positive and
      [col_prefix]_negative holding the read counts for the positive and
      negative selection pools, respectively.
    col_prefix: String prefix shared by the two count columns, without the
      '_positive' / '_negative' suffix. For example 'round1_very'.

  Returns:
    The same dataframe object with three new columns:
    [col_prefix]_positive_frac: the sequence's share of all reads in the
      positive pool.
    [col_prefix]_negative_frac: the sequence's share of all reads in the
      negative pool.
    [col_prefix]_pos_neg_ratio: positive_frac / (positive_frac +
      negative_frac), i.e. the read-depth normalized fraction of the sequence
      that ended up in the positive pool.
  """
  # Normalize each pool's counts by that pool's total read depth.
  frac_col_by_pool = {}
  for pool in ('positive', 'negative'):
    count_col = col_prefix + '_' + pool
    frac_col_by_pool[pool] = count_col + '_frac'
    df[frac_col_by_pool[pool]] = df[count_col] / df[count_col].sum()

  pos_frac = df[frac_col_by_pool['positive']]
  neg_frac = df[frac_col_by_pool['negative']]
  df[col_prefix + '_pos_neg_ratio'] = pos_frac / (pos_frac + neg_frac)
  return df


def build_seq_sets_from_df(input_param_df, input_df, tolerated_bead_frac,
                           pos_neg_ratio, min_read_thresh):
  """Flags the sequences that clear each stringency level of an experiment.

  For every row of input_param_df (one row per stringency level) a boolean
  column, named by that row's 'condition_flag', is added to input_df. The flag
  is True for sequences that pass both the bead-coverage / read-count filter
  on the positive pool and the pos/neg ratio cutoff for that condition.

  Args:
    input_param_df: Pandas dataframe with experimental parameters. Expected
      to have one row per stringency level in the experiment and
      columns 'apt_screened', 'apt_collected', 'seq_input', 'condition', and
      'condition_flag'.
    input_df: Pandas dataframe with the experimental results (counts per
      sequence) for the experiment covered in the input_param_df. Expected
      to have a 'sequence' column, the read-count column named by each
      'condition', and a [col_prefix]_pos_neg_ratio column for each row of
      the input_param_df (i.e. each stringency level), where [col_prefix] is
      the condition name up to '_positive'.
    tolerated_bead_frac: Float representing the minimum sequence depth, in
      units of expected beads, for a sequence to be used in analysis.
    pos_neg_ratio: The threshold for the pos_neg_ratio column; a sequence
      needs a ratio strictly above it to be used in the analysis.
    min_read_thresh: The integer minimum number of reads for a sequence to
      be used in the analysis (not normalized, a straight count.)

  Returns:
    Nothing. input_df is modified in place.
  """
  for _, params in input_param_df.iterrows():
    condition = params['condition']

    # Sequences with enough depth in the positive pool for this condition.
    depth_ok_seqs = generate_cutoffs_via_PD_stats(
        input_df, condition, params['apt_screened'], params['apt_collected'],
        params['seq_input'], tolerated_bead_frac, min_read_thresh)

    # Sequences enriched in the positive pool relative to the negative pool.
    # The ratio column is named after the condition with '_positive' removed.
    ratio_col = '%s_pos_neg_ratio' % (condition.partition('_positive')[0])
    enriched_seqs = input_df[input_df[ratio_col] > pos_neg_ratio].sequence

    # A sequence is flagged only when it clears both filters.
    passing = set(depth_ok_seqs).intersection(enriched_seqs)
    input_df[params['condition_flag']] = input_df.sequence.isin(passing)

# Data Analysis

In [None]:
#@title Add positive_frac / (positive_frac + negative_frac) col to df

# Column prefixes (condition names without the '_positive' / '_negative'
# suffix) for each sort in the two experiments.
mlpd_col_prefixes = ['round1_very', 'round1_high', 'round1_medium', 'round1_low']
pd_col_prefixes = ['round2_high_no_serum', 'round2_medium_no_serum', 'round2_low_no_serum']

# MLPD: add the *_frac and *_pos_neg_ratio columns for every stringency.
for col_prefix in mlpd_col_prefixes:
  mlpd_input_df = generate_pos_neg_normalized_ratio(mlpd_input_df, col_prefix)

# PD: same for the round 2 no-serum sorts.
for col_prefix in pd_col_prefixes:
  pd_input_df = generate_pos_neg_normalized_ratio(pd_input_df, col_prefix)

In [None]:
#@title Measure consistency of particle display data when increasing stringency thresholds within each experimental set (i.e PD and MLPD)

# Adds one boolean *_flag column per stringency level to the PD results.
build_seq_sets_from_df(
    input_param_df=pd_param_df,
    input_df=pd_input_df,
    tolerated_bead_frac=TOLERATED_BEAD_FRAC,
    pos_neg_ratio=POS_NEG_RATIO_CUTOFF,
    min_read_thresh=MIN_READ_THRESH)

# Same for the MLPD results, using identical thresholds.
build_seq_sets_from_df(
    input_param_df=mlpd_param_df,
    input_df=mlpd_input_df,
    tolerated_bead_frac=TOLERATED_BEAD_FRAC,
    pos_neg_ratio=POS_NEG_RATIO_CUTOFF,
    min_read_thresh=MIN_READ_THRESH)

# Generate Figure Data
Here, we generate the raw data used to build Venn diagrams.  The final figures were rendered in Figma.

In [None]:
#@title  Figure 2B Raw Data

# Number of sequences in every combination of the three PD stringency flags.
# The 'sequence' column is selected before aggregating so that only that one
# column is counted, and the grouping columns are listed explicitly rather
# than parsed from a tab-separated string.
pd_input_df.groupby(['round2_low_no_serum_flag',
                     'round2_medium_no_serum_flag',
                     'round2_high_no_serum_flag'])[['sequence']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sequence
round2_low_no_serum_flag,round2_medium_no_serum_flag,round2_high_no_serum_flag,Unnamed: 3_level_1
False,False,False,908587
False,False,True,44
False,True,False,27
False,True,True,3
True,False,False,687
True,False,True,6
True,True,False,702
True,True,True,385


In [None]:
#@title Figure 2C Raw Data

# To build venn (green), sum preceding True flags to get consistent sets
# 512 nM  = 5426+3 = 5429
# 512 & 128 nM = 2360+15 = 2375
# 512 & 128 & 32nM (including 8 nM) = 276+84 = 360
# To build venn (grey) Inconsistent flags are summed (ignoring 8nM)
# 128 nM only = 185 + 1 = 186
# 128 nM & 32 nM = 12+1 = 13
# 32 nM only = 2
# 32 nM and 512 nM only = 22+1 = 23
#  
# To build pie, look at all round1_very_flags = True
# Green = 84
# Grey = 15+1+3+1+1 = 21
#
# NOTE(review): the sums above imply low = 512 nM, medium = 128 nM,
# high = 32 nM and very = 8 nM -- confirm against the manuscript.
#
# Number of sequences in every combination of the four MLPD stringency flags.
# The 'sequence' column is selected before aggregating so that only that one
# column is counted, and the grouping columns are listed explicitly rather
# than parsed from a whitespace-separated string.
mlpd_input_df.groupby(['round1_low_flag',
                       'round1_medium_flag',
                       'round1_high_flag',
                       'round1_very_flag'])[['sequence']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sequence
round1_low_flag,round1_medium_flag,round1_high_flag,round1_very_flag,Unnamed: 4_level_1
False,False,False,False,179161
False,False,True,False,2
False,True,False,False,185
False,True,False,True,1
False,True,True,False,12
False,True,True,True,1
True,False,False,False,5426
True,False,False,True,3
True,False,True,False,22
True,False,True,True,1
