# Analysis of BarSeq Reads

Generate read counts for PKR variants by mapping barcode counts to variants

**Input:** Bartender extracted barcodes and counts  

**Output:**
- pkr-replicate-barcode-reads_\<date>.csv = Two barseq replicate experiments
- pkr-barcode-reads_\<date>.csv = Combine reads from two replicates, filter to known PKR barcodes
- pkr-variant-reads_\<date>.csv = Group reads by PKR variant, used for subsequent analysis

**Samples:** \<number>_\<k3_condition>_\<timepoint>_\<sample_number>  
e.g. "1_1_0_S1" is tube #1, Replicate 1 of PKR variants, 0 hr timepoint, sample S1

- K3 Conditions:
    - 1: PKR variants, Replicate 1
    - 2: PKR variants, Replicate 2
    - 3: PKR variants + WT K3, Replicate 1
    - 4: PKR variants + WT K3, Replicate 2
    - 5: PKR variants + K3-H47R, Replicate 1
    - 6: PKR variants + K3-H47R, Replicate 2
- Timepoints: 0, 12, 16, and 20 hours

In [1]:
from datetime import datetime
import time
import os
import pickle

import pandas as pd
import numpy as np
from Bio.Seq import Seq
import seaborn as sns
import matplotlib as plt

# custom functions
from function_pkr_functional_score import pkr_functional_score
from function_pkr_metadata import pkr_metadata

In [2]:
date = datetime.now().strftime("%y%m%d")

In [None]:
# input directory
input_dir = '../../results/bartender'

# save output directory
save_dir = "../../results/barseq"
os.makedirs(save_dir, exist_ok=True)

In [3]:
# map barcodes to PKR variants: 
pkr_variant_table = '../../results/alignparse/pkr.codon_variant_table.csv'
var_df = pd.read_csv(pkr_variant_table)
var_df = var_df.query('n_aa_substitutions < 2')
var_df.loc[var_df.n_aa_substitutions == 0, "aa_substitutions"] = "WT"
var_df.aa_substitutions = 'PKR-' + var_df.aa_substitutions
pkr_dict = dict(zip(var_df.barcode, var_df.aa_substitutions))

In [4]:
samples = [
  '1_1_0_S1',
  '2_1_12_S2',
  '3_1_16_S3',
  '4_1_20_S4',
  '5_2_0_S5',
  '6_2_12_S6',
  '7_2_16_S7',
  '8_2_20_S8',
  '9_3_0_S9',
  '10_3_12_S10',
  '11_3_16_S11',
  '12_3_20_S12',
  '13_4_0_S13',
  '14_4_12_S14',
  '15_4_16_S15',
  '16_4_20_S16',
  '17_5_0_S17',
  '18_5_12_S18',
  '19_5_16_S19',
  '20_5_20_S20',
  '21_6_0_S21',
  '22_6_12_S22',
  '23_6_16_S23',
  '24_6_20_S24',
  '25_7_0_S25',
  '26_7_12_S26',
  '27_7_16_S27',
  '28_7_20_S28',
  '29_8_0_S29',
  '30_8_12_S30',
  '31_8_16_S31',
  '32_8_20_S32'
]
experiment = [samples[i:i+4] for i in range(0, len(samples), 4)]
conditions_list = ['K3L-Null', 'K3L-Null', 'K3L-WT', 'K3L-WT', 'K3L-H47R', 'K3L-H47R', 'SUI2', 'SUI2']
rep_list = [1,2,1,2,1,2,1,2]

In [5]:
# 1 - generate replicate dataframes (all-reads)

# merge timepoints into single df
df_list = []
for samples,cond,rep in zip(experiment, conditions_list,rep_list):
    for sample in samples:
        barcode_file = f'{sample}.pkr_barcode.txt'
        input_file = os.path.join(input_dir, barcode_file)
        temp_df = pd.read_csv(input_file, names=['pkr_bc', 'line'])
        temp_df = temp_df['pkr_bc'].value_counts().rename_axis('pkr_bc').reset_index(name=sample)
        if sample == samples[0]:
            r_df = temp_df
        else:
            r_df = pd.merge(r_df, temp_df, on='pkr_bc',how='outer')

    # now cleanup r_df before appending to list
    new_cols = ['pkr_bc','0hr','12hr','16hr','20hr']
    r_df.columns = new_cols

    # replicate
    r_df['replicate'] = f'Replicate {rep}'
    
    # k3l variant
    k3_value = f"K3L-{samples[0].split('_')[0]}"
    r_df['k3'] = cond

    # map pkr bc to variant
    r_df['pkr'] = r_df['pkr_bc'].map(pkr_dict)

    # calculations: normalize reads, -log2(fold change), and auc
    r_df = pkr_functional_score(r_df)

    df_list.append(r_df)

# merge all the replicate dataframes
df = pd.concat(df_list, ignore_index=True)

# add metadata
df = pkr_metadata(df)

# save dataframe
output_name = f'pkr-replicate-barcode-reads_{date}.csv'
output_file = os.path.join(save_dir, output_name)
df.to_csv(output_file, index=False)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

In [6]:
# 2 - combine reads (all-reads and variant-reads)

# input
input_name = f'pkr-replicate-barcode-reads_{date}.csv'
input_file = os.path.join(save_dir, input_name)
df = pd.read_csv(input_file)

# output
#output_all_reads = f'../../results/barseq/combined_all-reads_{date}.csv' # all reads
output_name = f'pkr-barcode-reads_{date}.csv'
output_file = os.path.join(save_dir, output_name) # filtered to PKR variants

# merge all the replicate dataframes
grp_cols = ['k3','pkr','pkr_bc']
cols = ['0hr','12hr','16hr','20hr']
df = df.groupby(grp_cols)[cols].sum().reset_index()

# split datafrmae into k3s and make calculations, then concat
cond_list = ['K3L-Null','K3L-WT','K3L-H47R','SUI2']

df_list = []
for cond in cond_list:
    r_df = df.query('k3 == @cond')

    # map pkr bc to variant
    r_df['pkr'] = r_df['pkr_bc'].map(pkr_dict)

    # normalize reads, -log2(fold change), and auc
    r_df = pkr_functional_score(r_df)

    df_list.append(r_df)

df = pd.concat(df_list)

# save all reads with unidentified
#df.to_csv(output_all_reads, index=False)

# drop missing pkr barcodes
df.dropna(subset=['pkr'], inplace=True)

# apply read threshold
threshold = 50
df = df[df['0hr'] >= threshold]

# add metadata
df = pkr_metadata(df)

# purify to designed pkr variants
input_file = '../../data/dms_primers/pkr_variants_list.pkl'
with open(input_file, 'rb') as f:
    designed_variant_list = pickle.load(f)
designed_variant_list = ["PKR-" + variant for variant in designed_variant_list]
designed_variant_list.append("PKR-WT")
df = df[df['pkr'].isin(designed_variant_list)]

# save dataframe
df.to_csv(output_file, index=False)

  df = pd.read_csv(input_file)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_df['pkr'] = r_df['pkr_bc'].map(pkr_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[timepoints] = df[timepoints].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{tp}_norm'] = df[tp]/df[tp].sum()
A value is trying to be set on a copy of a slice f

In [7]:
# 3 - group barcodes by pkr variants and condense the dataframe by replicate

# input
input_name = f'pkr-barcode-reads_{date}.csv'
input_file = os.path.join(save_dir, input_name)
df = pd.read_csv(input_file)

# output
output_name = f'pkr-variant-reads_{date}.csv'
output_file = os.path.join(save_dir, output_name) # filtered to PKR variants

result_df = df.groupby(['k3', 'pkr']).agg({
    'auc': ['mean', 'std', 'sem'],
    'pkr_bc': 'nunique',
    '0hr': 'sum'
}).reset_index()

result_df.columns = ['k3', 'pkr', 'auc_mean', 'auc_std', 'auc_sem', 'pkr_bc_nunique', '0hr_total_reads']

# purify to designed pkr variants
input_file = '../../data/dms_primers/pkr_variants_list.pkl'
with open(input_file, 'rb') as f:
    designed_variant_list = pickle.load(f)
designed_variant_list = ["PKR-" + variant for variant in designed_variant_list]
designed_variant_list.append("PKR-WT")
df = df[df['pkr'].isin(designed_variant_list)]

df = pkr_metadata(result_df)

# downselect to K3 conditions
k3 = df.query('k3 != "SUI2"')
k3.to_csv(output_file, index=False)

# SUI2 condition
#sui2 = df.query('k3 == "SUI2"')
#output_file  = f'../../results/barseq/combined_grouped-barcodes_sui2_{date}.csv'
#sui2.to_csv(output_file, index=False)