Copyright 2021 Google LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

In [None]:
import pandas as pd
import math
import numpy as np

# Load in Data

In [None]:
# PD sequencing counts across experiments
from google.colab import files

# Ask for the input file(s); the result is keyed by uploaded file name.
uploaded = files.upload()

# Echo every uploaded file together with the size of its contents.
for fn, contents in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(contents)))

In [None]:
# Load PD Data
# Parse the input CSV from the working directory into the dataframe that the
# later cells annotate with bin labels.
with open('pd_clustered_input_data_manuscript.csv') as csv_file:
  pd_input_df = pd.read_csv(csv_file)

# Helper Functions

In [None]:
def generate_pos_neg_normalized_ratio(df, col_prefix):
  """Adds pool-fraction columns and the normalized pos/neg ratio to df.

  The dataframe is modified in place (the new columns are written onto the
  object that was passed in) and the same object is returned.

  Args:
    df: (pd.DataFrame) Must contain the columns [col_prefix]_positive and
      [col_prefix]_negative holding read counts for the positive and negative
      selection conditions, respectively.
    col_prefix: (str) Prefix shared by the two read-count columns. For
      example, 'round1_very_positive'.

  Returns:
    (pd.DataFrame) The input dataframe with three new columns:
    [col_prefix]_positive_frac: this sequence's share of all reads in the
      positive pool.
    [col_prefix]_negative_frac: this sequence's share of all reads in the
      negative pool.
    [col_prefix]_pos_neg_ratio: positive_frac / (positive_frac +
      negative_frac), i.e. the read-depth normalized fraction of the sequence
      that ended in the positive pool. The value is NaN when both fractions
      are zero.
  """
  pos_col = col_prefix + '_positive'
  neg_col = col_prefix + '_negative'
  pos_frac_col = pos_col + '_frac'
  neg_frac_col = neg_col + '_frac'

  # Normalize each pool by its own total read count so that pools sequenced
  # at different depths become comparable.
  column_pairs = ((pos_col, pos_frac_col), (neg_col, neg_frac_col))
  for count_col, frac_col in column_pairs:
    counts = df[count_col]
    df[frac_col] = counts / counts.sum()

  pos_frac = df[pos_frac_col]
  neg_frac = df[neg_frac_col]
  df[col_prefix + '_pos_neg_ratio'] = pos_frac / (pos_frac + neg_frac)
  return df

In [None]:
def fraction_to_3bins(frac, min_bin=0.1, max_bin=0.9):
  """Converts a positive / (positive + negative) fraction to a ternary bin.

  Args:
    frac: (float) positive / (positive + negative) fraction. May be NaN.
    min_bin: (float) Cutoff between bin 0 and bin 1; fractions strictly below
      it fall in bin 0.
    max_bin: (float) Cutoff between bin 1 and bin 2; fractions strictly above
      it fall in bin 2.

  Returns:
    (int) 0 if frac is NaN or below min_bin, 2 if frac is above max_bin, and
    1 otherwise (both cutoff values themselves belong to bin 1).
  """
  # NaN compares False against any cutoff, so without the explicit check it
  # would fall through to the middle bin.
  if math.isnan(frac) or frac < min_bin:
    return 0
  return 2 if frac > max_bin else 1

# SuperBin label for every accepted (low, medium, high) combination of bins.
# Any combination that is not listed tells an ambiguous story and is excluded.
_SUPER_BIN_BY_BINS = {
    # All three bins are 0.
    (0, 0, 0): 0,
    # Borderline low stringency.
    (1, 0, 0): 1,
    # Unambiguous low stringency.
    (2, 0, 0): 2,
    # Medium and low are both borderline; the added support is treated as
    # similar to low being 2.
    (1, 1, 0): 2,
    # Borderline medium stringency.
    (2, 1, 0): 3,
    # Unambiguous medium stringency.
    (2, 2, 0): 4,
    # Medium and high are both borderline; treated as similar to medium
    # being 2. Anything with a borderline high bin must pass low stringency.
    (2, 1, 1): 4,
    # Borderline high stringency.
    (2, 2, 1): 5,
    # Unambiguous high stringency.
    (2, 2, 2): 6,
}


def bins_to_super_bins(low, medium, high):
  """Collapses the three per-stringency bins into a single SuperBin label.

  Args:
    low: (int) Bin for low stringency.
    medium: (int) Bin for medium stringency.
    high: (int) Bin for high stringency.

  Returns:
    (int) SuperBin in the range 0..6, or -1 when the combination of bins is
    ambiguous and the sequence should be excluded.
  """
  return _SUPER_BIN_BY_BINS.get((low, medium, high), -1)

# Create Binned and SuperBin Labels

In [None]:
# Generate Binned and SuperBin labels as additional columns in dataframe
# Binned cols: low_3bins, med_3bins, high_3bins
# SuperBin col: super_bin
#
# NOTE(review): the 'round2_high_*' counts feed the 'low' stringency label and
# the 'round2_low_*' counts feed the 'high' one. Presumably the column names
# refer to an experimental quantity that runs opposite to stringency; confirm
# that this inversion is intended.
prefix_and_stringency = (
    ('round2_high_no_serum', 'low'),
    ('round2_medium_no_serum', 'med'),
    ('round2_low_no_serum', 'high'),
)
for col_prefix, stringency_level in prefix_and_stringency:
  # Add the normalized pos/neg ratio for this condition, then bin it.
  pd_input_df = generate_pos_neg_normalized_ratio(pd_input_df, col_prefix)
  ratio_col = col_prefix + '_pos_neg_ratio'
  pd_input_df['%s_3bins' % stringency_level] = (
      pd_input_df[ratio_col].apply(fraction_to_3bins))

# Combine the three per-stringency bins of every row into one SuperBin label.
pd_input_df['super_bin'] = pd_input_df.apply(
    lambda row: bins_to_super_bins(
        row.low_3bins, row.med_3bins, row.high_3bins),
    axis=1)