In [1]:
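# Analyze new barcode count data: filter by count, merge with PacBio variant calls, and write per-sample counts files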

In [8]:
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

import os
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from Bio import SeqIO
from Bio.Seq import Seq

import warnings
warnings.simplefilter('ignore')

import itertools
import numpy as np

import tqdm
import gc

import scipy as sp

In [9]:
# Load the combined per-sample barcode count table from disk and preview it.
all_data_df = pd.read_csv(filepath_or_buffer='data/all_data_df.csv')
all_data_df

Unnamed: 0,virus_background,replicate,selection,sample,barcode,bc_count
0,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAACACGCACAAGCTAATACGAAACGA,1
1,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAACGACTGCTAGCTCCTAACCGTGGT,1
2,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAACTCACTTAAGCTCGGCCCAAGGAT,1
3,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAAGGTGTCTTAGCTTGTTCATTATCC,1
4,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAAGTTGACATAGCTGGTCAATATTAC,1
...,...,...,...,...,...,...
10193604,SYD21,R2,P1,SYD21_R2_P1,TTTTGCTAGGGTAGCTTGTATCACGTGG,1
10193605,SYD21,R2,P1,SYD21_R2_P1,TTTTGGGACTACAGCTATCTCATCTCAG,1
10193606,SYD21,R2,P1,SYD21_R2_P1,TTTTTGACCTCATGCTACCTTGTATTCA,1
10193607,SYD21,R2,P1,SYD21_R2_P1,TTTTTTATTGTTAGCTTATAAACTGGTA,1


In [10]:
# Trim the data to keep only barcodes whose bc_count is AT LEAST the minimum
# configured for their selection (rows below the minimum are removed).
# Note that here bc_count refers to the number of barcode reads that are
# associated with unique UMIs.
bc_count_min_pre=1
bc_count_min_post=1

# Pre-selection rows are the 'Plasmid' samples, post-selection rows are 'P1'.
# Rows whose selection is neither of these are dropped as well and are
# therefore included in the "removed" totals reported below.
all_data_df_pre_selection_trimmed = all_data_df[(all_data_df['selection']=='Plasmid') & (all_data_df['bc_count']>=bc_count_min_pre)]
all_data_df_post_selection_trimmed = all_data_df[(all_data_df['selection']=='P1') & (all_data_df['bc_count']>=bc_count_min_post)]

all_data_df_trimmed=pd.concat([all_data_df_pre_selection_trimmed,all_data_df_post_selection_trimmed])

counts_before_filter = len(all_data_df)
counts_after_filter = len(all_data_df_trimmed)
counts_removed = counts_before_filter - counts_after_filter

print(f'''Minimum pre-selection barcode count has been set to: {bc_count_min_pre}
Minimum post-selection barcode count has been set to: {bc_count_min_post}
{counts_removed} barcodes have been removed, {counts_after_filter} unique barcodes remain.
Here is a breakdown of the unique barcode reads found in each sample:''')

# Count rows per sample with one pass over each frame instead of re-filtering
# the full table once per sample. Reindexing on `uniques` keeps the samples in
# order of first appearance and reports 0 for samples with no remaining rows.
uniques=all_data_df['sample'].unique()
rows_per_sample_before = all_data_df['sample'].value_counts().reindex(uniques, fill_value=0)
rows_per_sample_after = all_data_df_trimmed['sample'].value_counts().reindex(uniques, fill_value=0)

filtered_stats = pd.DataFrame({
    'sample': uniques,
    'unique_bcs_before_filter': rows_per_sample_before.to_numpy(),
    'unique_bcs_after_filter': rows_per_sample_after.to_numpy(),
})
filtered_stats['unique_bcs_removed'] = filtered_stats['unique_bcs_before_filter'] - filtered_stats['unique_bcs_after_filter']

# Plain-list views of the same statistics, kept under their original names.
sample_list = list(uniques)
unique_bcs_before_filter_list = filtered_stats['unique_bcs_before_filter'].tolist()
unique_bcs_after_filter_list = filtered_stats['unique_bcs_after_filter'].tolist()

# Show every row of the per-sample summary (this changes the global display option).
pd.set_option('display.max_rows',None)
filtered_stats

Minimum pre-selection barcode count has been set to: 1
Minimum post-selection barcode count has been set to: 1
0 barcodes have been removed, 10193609 unique barcodes remain.
Here is a breakdown of the unique barcode reads found in each sample:


100%|██████████| 16/16 [00:11<00:00,  1.38it/s]


Unnamed: 0,sample,unique_bcs_before_filter,unique_bcs_after_filter,unique_bcs_removed
0,USSR77_R1_Plasmid,752302,752302,0
1,USSR77_R2_Plasmid,631383,631383,0
2,USSR77_R1_P1,605651,605651,0
3,USSR77_R2_P1,450066,450066,0
4,SN89_R1_Plasmid,852054,852054,0
5,SN89_R2_Plasmid,855466,855466,0
6,SN89_R1_P1,486023,486023,0
7,SN89_R2_P1,455744,455744,0
8,SI06_R1_Plasmid,796591,796591,0
9,SI06_R2_Plasmid,981196,981196,0


In [11]:
# Load the PacBio barcode -> variant tables (one CSV per virus background)
# and stack them into a single lookup dataframe.
background_list=['USSR77','SN89','SI06','SYD21']
geno_to_pheno_list=[f'data/codon_variant_tables/codon_variant_table_{bkg}.csv' for bkg in background_list]

# Read every table first and concatenate once: growing a dataframe with
# pd.concat inside the loop re-copies all previously loaded rows on each
# iteration. Each table's original row index is preserved, as before.
geno_to_pheno_df=pd.concat([pd.read_csv(path) for path in tqdm.tqdm(geno_to_pheno_list)])

# Use the same key column names as the Illumina count table so the two can be merged.
geno_to_pheno_df=geno_to_pheno_df.rename(columns={'target':'virus_background'})
# The replicate label is 'R' plus the last character of the library name
# (e.g. 'rep1' -> 'R1'); assumes single-digit library numbers -- TODO confirm.
geno_to_pheno_df['replicate']='R'+geno_to_pheno_df['library'].str[-1:]

print('PacBio data has been loaded\nHere is a sample of the PacBio dataframe:')
pd.set_option('display.max_rows',10)
geno_to_pheno_df

100%|██████████| 4/4 [00:00<00:00,  4.60it/s]


PacBio data has been loaded
Here is a sample of the PacBio dataframe:


Unnamed: 0,virus_background,library,barcode,variant_call_support,codon_substitutions,aa_substitutions,n_codon_substitutions,n_aa_substitutions,replicate
0,USSR77,rep1,ACAACAGAGTGCAGCTAGACGACCAACC,1,CCA176ACA,P176T,1,1,R1
1,USSR77,rep1,ACAACTGTAAGTAGCTAGGCTGTGCGGC,4,GTA125CAA,V125Q,1,1,R1
2,USSR77,rep1,ACAAGATCGGTCAGCTCAGTATCTGAAT,4,GTG216GAT,V216D,1,1,R1
3,USSR77,rep1,ACAAGGATTTACAGCTCGGTTCACCCAG,3,AGC179TTC,S179F,1,1,R1
4,USSR77,rep1,ACAAGGATTTACAGCTTGGTTGCTACTC,1,AGC179TTC,S179F,1,1,R1
...,...,...,...,...,...,...,...,...,...
407607,SYD21,rep2,TTGCGGTCCATTAGCTGCGATCGCCCCC,1,AGG130CAT,R130H,1,1,R2
407608,SYD21,rep2,TTGCGTACATCCAGCTCGGATGGTGCCT,2,AAT142GGA,N142G,1,1,R2
407609,SYD21,rep2,TTGCTAAGTACCAGCTCTACCAAAATGC,1,GGA110AAT,G110N,1,1,R2
407610,SYD21,rep2,TTGCTCACGCTCAGCTAGATATACAATC,5,GGA80TGC,G80C,1,1,R2


In [12]:
# Limit dataframe previews to 10 rows, then show the filtered count table.
pd.options.display.max_rows = 10
all_data_df_trimmed

Unnamed: 0,virus_background,replicate,selection,sample,barcode,bc_count
0,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAACACGCACAAGCTAATACGAAACGA,1
1,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAACGACTGCTAGCTCCTAACCGTGGT,1
2,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAACTCACTTAAGCTCGGCCCAAGGAT,1
3,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAAGGTGTCTTAGCTTGTTCATTATCC,1
4,USSR77,R1,Plasmid,USSR77_R1_Plasmid,AAAAGTTGACATAGCTGGTCAATATTAC,1
...,...,...,...,...,...,...
10193604,SYD21,R2,P1,SYD21_R2_P1,TTTTGCTAGGGTAGCTTGTATCACGTGG,1
10193605,SYD21,R2,P1,SYD21_R2_P1,TTTTGGGACTACAGCTATCTCATCTCAG,1
10193606,SYD21,R2,P1,SYD21_R2_P1,TTTTTGACCTCATGCTACCTTGTATTCA,1
10193607,SYD21,R2,P1,SYD21_R2_P1,TTTTTTATTGTTAGCTTATAAACTGGTA,1


In [13]:
# Merge the Illumina barcode counts with the PacBio barcode -> variant table.
# This is an inner join, so Illumina barcodes without a PacBio entry for the
# same virus background and replicate are not carried into the merged table.
all_data_df_merged=all_data_df_trimmed.merge(
    geno_to_pheno_df,
    how='inner',
    on=['virus_background','replicate','barcode'],
)

# Collect the Illumina rows that found no PacBio match: outer-join the trimmed
# table against the merged one (on all shared columns) and keep only the rows
# that are present on the left side.
all_data_dropped = (
    all_data_df_trimmed
    .merge(all_data_df_merged, how='outer', indicator=True)
    .loc[lambda joined: joined['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)

# The previous message was copied from the PacBio loading cell; report the merge instead.
print('Illumina data has been merged with PacBio data\nHere is a sample of the merged dataframe:')
all_data_df_merged

PacBio data has been loaded
Here is a sample of the merged dataframe:


Unnamed: 0,virus_background,replicate,selection,sample,barcode,bc_count,library,variant_call_support,codon_substitutions,aa_substitutions,n_codon_substitutions,n_aa_substitutions
0,USSR77,R1,Plasmid,USSR77_R1_Plasmid,ACAACAGAGTGCAGCTAGACGACCAACC,10,rep1,1,CCA176ACA,P176T,1,1
1,USSR77,R1,Plasmid,USSR77_R1_Plasmid,ACAACTGTAAGTAGCTAGGCTGTGCGGC,55,rep1,4,GTA125CAA,V125Q,1,1
2,USSR77,R1,Plasmid,USSR77_R1_Plasmid,ACAAGATCGGTCAGCTCAGTATCTGAAT,31,rep1,4,GTG216GAT,V216D,1,1
3,USSR77,R1,Plasmid,USSR77_R1_Plasmid,ACAAGGATTTACAGCTCGGTTCACCCAG,146,rep1,3,AGC179TTC,S179F,1,1
4,USSR77,R1,Plasmid,USSR77_R1_Plasmid,ACAAGGATTTACAGCTTGGTTGCTACTC,92,rep1,1,AGC179TTC,S179F,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1366816,SYD21,R1,P1,SYD21_R1_P1,ATCTTAGCACGCAGCTGGTCACATCAAC,1,rep1,2,,,0,0
1366817,SYD21,R2,P1,SYD21_R2_P1,ATCTAGACTTAGAGCTGTGACTTTTCAT,1,rep2,3,ACC181CAA,T181Q,1,1
1366818,SYD21,R2,P1,SYD21_R2_P1,ATCTCTATCAACAGCTTGGTCACGGGGC,25,rep2,1,AAG136GAT,K136D,1,1
1366819,SYD21,R2,P1,SYD21_R2_P1,ATCTGGTCCATTAGCTAATACGCCCTTG,1,rep2,4,AGG130AAT,R130N,1,1


In [14]:
# Output one counts file per (virus background, replicate, selection) sample
# for loading into the analyze_func_scores notebook.

replicate_list=['R1','R2']
cols=['barcode','count','codon_substitutions','aa_substitutions','variant_call_support']
background_list=['USSR77','SN89','SI06','SYD21']
selection_list=['Plasmid','P1']

# Make sure the destination exists so to_csv does not fail on a fresh checkout.
counts_dir='data/counts_files'
os.makedirs(counts_dir, exist_ok=True)

# The counts files name the count column 'count' rather than 'bc_count'.
counts_df = all_data_df_merged.rename(columns={'bc_count':'count'})

for bkg in background_list:
    for rep in replicate_list:
        for selection in selection_list:
            in_sample = (
                (counts_df['virus_background'] == bkg)
                & (counts_df['replicate'] == rep)
                & (counts_df['selection'] == selection)
            )
            sample_counts = counts_df.loc[in_sample, cols].reset_index(drop=True)
            # The fresh 0..n-1 row index is written as the first, unnamed CSV column.
            sample_counts.to_csv(f'{counts_dir}/{bkg}_{rep}_{selection}.csv')
