In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
from scipy.stats import spearmanr, pearsonr, ranksums, linregress
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import matplotlib
import itertools
import statannot
from statsmodels.stats.multitest import fdrcorrection

from deeplift.visualization import viz_sequence


matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.sans-serif'] = 'Helvetica'


# HEPG2_COL = 'log2FoldChange_HepG2_DNA'
# K562_COL = 'log2FoldChange_K562_DNA'
HEPG2_COL = 'log2FoldChange_HepG2'
K562_COL = 'log2FoldChange_K562'
H2K_COL = 'log2FoldChange_H2K'

HEPG2_SE_COL = 'lfcSE_HepG2'
K562_SE_COL = 'lfcSE_K562'
H2K_SE_COL = 'lfcSE_H2K'

D1_HEPG2_COL = 'log2FoldChange_HEPG2_DNA'
D1_K562_COL = 'log2FoldChange_K562_DNA'
D1_HEPG2_SE_COL = 'lfcSE_HEPG2_DNA'
D1_K562_SE_COL = 'lfcSE_K562_DNA'

D2_HEPG2_COL = 'log2FoldChange_HEPG2'
D2_K562_COL = 'log2FoldChange_K562'
D2_HEPG2_SE_COL = 'lfcSE_HEPG2'
D2_K562_SE_COL = 'lfcSE_K562'

D3_HEPG2_COL = 'log2FoldChange_HepG2_DNA'
D3_K562_COL = 'log2FoldChange_K562_DNA'
D3_HEPG2_SE_COL = 'lfcSE_HepG2_DNA'
D3_K562_SE_COL = 'lfcSE_K562_DNA'

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# ### Load D1, D2, and D3 data ###

# d2_deseq_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/d2_deseq_data/chris_log2fc_df_clean2.csv')

# d1_final_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/fimo_motif_scanning/saved_processed_motif_files/d1_final_df_qthresh05_v2.csv')
# d2_final_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/fimo_motif_scanning/saved_processed_motif_files/d2_final_df_qthresh05_v2.csv')
# d3_final_df = pd.read_csv('../designed_seqs/d3_tot_final_df_qthresh0.05.csv')

# d1_deseq_plus_cluster_cnts_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/fimo_motif_scanning/saved_processed_motif_files/d1_deseq_plus_cluster_cnts_df_qthresh05_v2_plus_cell_type.csv')
# d2_deseq_plus_cluster_cnts_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/fimo_motif_scanning/saved_processed_motif_files/d2_deseq_plus_cluster_cnts_df_qthresh05_v2.csv')

# # no longer need to process everytime loading this, I saved the processed + dna count thresholded version
# # d3_deseq_df = pd.read_csv('illumina_processing/log2fc/log2fc_enhancer.csv')
# # # rename 'Unnamed: 0' colum to 'sequence_name'
# # d3_deseq_df.rename(columns={'Unnamed: 0': 'sequence_name'}, inplace=True)
# # cols_to_merge = ['log2FoldChange_HepG2_DNA',
# #                  'log2FoldChange_K562_DNA']

# # d3_deseq_df['sequence_name'] = d3_deseq_df['sequence_name'].apply(lambda x: '_'.join(x.split('_')[1:]))

# d3_seq_dir = '../designed_seqs'
# # d3_seq_df = pd.read_csv(f'{d3_seq_dir}/d3_seq_df.csv')

# # # add the cols_to_merge to the d3_seq_df, via sequence_name
# # d3_seq_df = d3_seq_df.merge(d3_deseq_df[['sequence_name'] + cols_to_merge], on='sequence_name')
# # d3_seq_df['log2FoldChange_H2K'] = d3_seq_df['log2FoldChange_HepG2_DNA'] - d3_seq_df['log2FoldChange_K562_DNA']
# d3_seq_df = pd.read_csv(f'{d3_seq_dir}/d3_seq_df_thresh.csv')


# d3_deseq_df_plus_cluster_cnts = pd.read_csv('../designed_seqs/d3_seq_df_plus_cluster_counts.csv')
# cluster_cols = [col for col in d3_deseq_df_plus_cluster_cnts.columns if 'cluster' in col]
# cluster_cols = ['n_motifs'] + cluster_cols

# # merge cluster cols from d3_deseq_df_plus_cluster_cnts onto d3_seq_df, on sequence_name
# d3_deseq_plus_cluster_cnts_df = d3_seq_df.merge(d3_deseq_df_plus_cluster_cnts[['sequence_name'] + cluster_cols], on='sequence_name')

In [23]:
# load in the dhs data
dhs_deseq_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis//fimo_motif_scanning/saved_processed_motif_files/dw_deseq_plus_cluster_cnts_df_qthresh05_v2.csv')

dhs_deseq_df.rename(columns={D2_HEPG2_COL:HEPG2_COL,
                            D2_K562_COL:K562_COL,
                            D2_HEPG2_SE_COL:HEPG2_SE_COL,
                            D2_K562_SE_COL:K562_SE_COL},inplace=True)

# load in the combined data
comb_deseq_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/d2_deseq_data/combined_log2fc.csv')
# rename Unnamed: 0 to sequence
comb_deseq_df.rename(columns={'Unnamed: 0': 'enhancer'}, inplace=True)
# drop column X
comb_deseq_df.drop(columns=['X'], inplace=True)
comb_deseq_df.rename(columns={'log2FoldChange_HEPG2_DNA':f'{HEPG2_COL}_comb',
                              'log2FoldChange_K562_DNA':f'{K562_COL}_comb',
                              'lfcSE_HEPG2_DNA':f'{HEPG2_SE_COL}_comb',
                              'lfcSE_K562_DNA':f'{K562_SE_COL}_comb'},inplace=True)

# merge in the newly processed deseq values into the dhs_deseq_df
dhs_deseq_df = dhs_deseq_df.merge(comb_deseq_df[['enhancer',f'{HEPG2_COL}_comb',f'{K562_COL}_comb',f'{HEPG2_SE_COL}_comb',f'{K562_SE_COL}_comb']],on='enhancer',how='left')
dhs_deseq_df[HEPG2_COL] = dhs_deseq_df[f'{HEPG2_COL}_comb']
dhs_deseq_df[K562_COL] = dhs_deseq_df[f'{K562_COL}_comb']
dhs_deseq_df[HEPG2_SE_COL] = dhs_deseq_df[f'{HEPG2_SE_COL}_comb']
dhs_deseq_df[K562_SE_COL] = dhs_deseq_df[f'{K562_SE_COL}_comb']

dhs_deseq_df.drop(columns=[f'{HEPG2_COL}_comb',f'{K562_COL}_comb',f'{HEPG2_SE_COL}_comb',f'{K562_SE_COL}_comb'],inplace=True)

dhs_deseq_df[H2K_COL] = dhs_deseq_df[HEPG2_COL] - dhs_deseq_df[K562_COL]
dhs_deseq_df[H2K_SE_COL] = np.sqrt(dhs_deseq_df[HEPG2_SE_COL]**2 + dhs_deseq_df[K562_SE_COL]**2)

In [6]:
# do batch correction first!!!!

### Load D1, D2, and D3 data ###

# d1_deseq_df # I guess I don't need this, I can just use the motif df...
d3_seq_dir = '../designed_seqs'
d2_deseq_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/d2_deseq_data/chris_log2fc_df_clean2.csv')
d2_deseq_df['sequence_name'] = d2_deseq_df.index
d2_deseq_df['sequence_name'] = d2_deseq_df['sequence_name'].apply(lambda x: f'd2_seq_{x}')
d3_seq_df = pd.read_csv(f'{d3_seq_dir}/d3_seq_df_thresh.csv')

# add SE cols to d3_seq_df
# first, load in the full deseq2 output dataframe
d3_deseq_df = pd.read_csv('illumina_processing/log2fc/log2fc_enhancer.csv')
# rename first column to sequence_name
d3_deseq_df.rename(columns={'Unnamed: 0':'sequence_name'},inplace=True)
# remove the 'design_' prefix from the sequence_name column
d3_deseq_df['sequence_name'] = d3_deseq_df['sequence_name'].apply(lambda x: x[7:])
# merge in lfcSE_HepG2_DNA and lfcSE_K562_DNA columns from d2_deseq_plus_cluster_cnts_df into top_enhancer_df by sequence_name
d3_seq_df = d3_seq_df.merge(d3_deseq_df[['sequence_name','lfcSE_HepG2_DNA','lfcSE_K562_DNA']],on='sequence_name',how='left')
# rename lfcSE_HepG2_DNA and lfcSE_K562_DNA to lfcSE_HepG2 and lfcSE_K562
d3_seq_df.rename(columns={'lfcSE_HepG2_DNA':'lfcSE_HEPG2','lfcSE_K562_DNA':'lfcSE_K562'},inplace=True)
d3_seq_df['lfcSE_H2K'] = np.sqrt(d3_seq_df['lfcSE_HEPG2']**2 + d3_seq_df['lfcSE_K562']**2)

d1_deseq_plus_cluster_cnts_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/fimo_motif_scanning/saved_processed_motif_files/d1_deseq_plus_cluster_cnts_df_qthresh05_v2_plus_cell_type.csv')
d1_deseq_plus_cluster_cnts_df['d1_sequence_name'] = d1_deseq_plus_cluster_cnts_df.index
d1_deseq_plus_cluster_cnts_df['d1_sequence_name'] = d1_deseq_plus_cluster_cnts_df['d1_sequence_name'].apply(lambda x: f'd1_seq_{x}')

d2_deseq_plus_cluster_cnts_df = pd.read_csv('../../../aws/for_aws/sequencing_analysis/fimo_motif_scanning/saved_processed_motif_files/d2_deseq_plus_cluster_cnts_df_qthresh05_v2.csv')
d2_deseq_plus_cluster_cnts_df['sequence_name'] = d2_deseq_plus_cluster_cnts_df.index
d2_deseq_plus_cluster_cnts_df['sequence_name'] = d2_deseq_plus_cluster_cnts_df['sequence_name'].apply(lambda x: f'd2_seq_{x}')

d3_deseq_df_plus_cluster_cnts = pd.read_csv('../designed_seqs/d3_seq_df_plus_cluster_counts.csv')
cluster_cols = [col for col in d3_deseq_df_plus_cluster_cnts.columns if 'cluster' in col]
cluster_cols = ['n_motifs'] + cluster_cols
d3_deseq_plus_cluster_cnts_df = d3_seq_df.merge(d3_deseq_df_plus_cluster_cnts[['sequence_name'] + cluster_cols], on='sequence_name')

### replace d2_deseq_df measurements with the reprocessed values from the comb_dhs df; also do this for d2_deseq_plus_cluster_cnts_df

# replace the columns in d2_deseq_df with the columns from comb_deseq_df
d2_deseq_df = d2_deseq_df.merge(comb_deseq_df[['enhancer',f'{HEPG2_COL}_comb',f'{K562_COL}_comb',f'{HEPG2_SE_COL}_comb',f'{K562_SE_COL}_comb']],on='enhancer',how='left')
# replace values in HEPG2_COL with HEPG2_COL_comb, drop HEPG2_COL_comb
d2_deseq_df[D2_HEPG2_COL] = d2_deseq_df[f'{HEPG2_COL}_comb']
d2_deseq_df[D2_K562_COL] = d2_deseq_df[f'{K562_COL}_comb']
d2_deseq_df[D2_HEPG2_SE_COL] = d2_deseq_df[f'{HEPG2_SE_COL}_comb']
d2_deseq_df[D2_K562_SE_COL] = d2_deseq_df[f'{K562_SE_COL}_comb']
d2_deseq_df.drop(columns=[f'{HEPG2_COL}_comb',f'{K562_COL}_comb',f'{HEPG2_SE_COL}_comb',f'{K562_SE_COL}_comb'],inplace=True)
# recalculate H2K_COL and H2K_SE_COL
d2_deseq_df[H2K_COL] = d2_deseq_df[D2_HEPG2_COL] - d2_deseq_df[D2_K562_COL]
d2_deseq_df[H2K_SE_COL] = np.sqrt(d2_deseq_df[D2_HEPG2_SE_COL]**2 + d2_deseq_df[D2_K562_SE_COL]**2)

# repeat with d2_deseq_plus_cluster_cnts_df
d2_deseq_plus_cluster_cnts_df = d2_deseq_plus_cluster_cnts_df.merge(comb_deseq_df[['enhancer',f'{HEPG2_COL}_comb',f'{K562_COL}_comb',f'{HEPG2_SE_COL}_comb',f'{K562_SE_COL}_comb']],on='enhancer',how='left')
d2_deseq_plus_cluster_cnts_df[D2_HEPG2_COL] = d2_deseq_plus_cluster_cnts_df[f'{HEPG2_COL}_comb']
d2_deseq_plus_cluster_cnts_df[D2_K562_COL] = d2_deseq_plus_cluster_cnts_df[f'{K562_COL}_comb']
d2_deseq_plus_cluster_cnts_df[D2_HEPG2_SE_COL] = d2_deseq_plus_cluster_cnts_df[f'{HEPG2_SE_COL}_comb']
d2_deseq_plus_cluster_cnts_df[D2_K562_SE_COL] = d2_deseq_plus_cluster_cnts_df[f'{K562_SE_COL}_comb']
d2_deseq_plus_cluster_cnts_df.drop(columns=[f'{HEPG2_COL}_comb',f'{K562_COL}_comb',f'{HEPG2_SE_COL}_comb',f'{K562_SE_COL}_comb'],inplace=True)
d2_deseq_plus_cluster_cnts_df['log2FoldChange_H2K'] = d2_deseq_plus_cluster_cnts_df[D2_HEPG2_COL] - d2_deseq_plus_cluster_cnts_df[D2_K562_COL]
d2_deseq_plus_cluster_cnts_df['lfcSE_H2K'] = np.sqrt(d2_deseq_plus_cluster_cnts_df[D2_HEPG2_SE_COL]**2 + d2_deseq_plus_cluster_cnts_df[D2_K562_SE_COL]**2)

# prior to merging, perform all the batch correction steps. Ctrl sequence values will be overwritten with the avg values already stored in the ctrl_dfs
# add model_type col to d2_deseq_plus_cluster_cnts_df
def extract_model_type(x):
    return 'ensemble' if '-' in x else 'single' if 'wide' in x else 'boot' if 'boot' in x else 'motif repeat' if 'crafted' in x else x
d2_deseq_plus_cluster_cnts_df['model_type'] = d2_deseq_plus_cluster_cnts_df['model'].apply(extract_model_type)
# rename generator to design_type
d2_deseq_plus_cluster_cnts_df.rename(columns={'generator': 'design_type'}, inplace=True)

# load linreg slopes from npy file
batch_correction_dir = 'batch_correction_weights/comb_dhs'
suffix = '_weighted_pf'
m_hepg2_d12,b_hepg2_d12,m_k562_d12,b_k562_d12 = np.load(f'{batch_correction_dir}/d1_to_d2_linregs{suffix}.npy')[:4]
m_hepg2_d23,b_hepg2_d23,m_k562_d23,b_k562_d23 = np.load(f'{batch_correction_dir}/d2_to_d3_linregs{suffix}.npy')[:4]

m_hepg2_d13 = m_hepg2_d12 * m_hepg2_d23
b_hepg2_d13 = m_hepg2_d23 * b_hepg2_d12 + b_hepg2_d23 #  m_d23 * b_d12 + b_d23
m_k562_d13 = m_k562_d12 * m_k562_d23
b_k562_d13 = m_k562_d23 * b_k562_d12 + b_k562_d23

# rescale d1 cols to D3
d1_deseq_plus_cluster_cnts_df['log2FoldChange_HEPG2_DNA'] = d1_deseq_plus_cluster_cnts_df['log2FoldChange_HEPG2_DNA'] * m_hepg2_d13 + b_hepg2_d13
d1_deseq_plus_cluster_cnts_df['log2FoldChange_K562_DNA'] = d1_deseq_plus_cluster_cnts_df['log2FoldChange_K562_DNA'] * m_k562_d13 + b_k562_d13
d1_deseq_plus_cluster_cnts_df[H2K_COL] = d1_deseq_plus_cluster_cnts_df[f'log2FoldChange_HEPG2_DNA'] - d1_deseq_plus_cluster_cnts_df[f'log2FoldChange_K562_DNA']

# se rescale
d1_deseq_plus_cluster_cnts_df['lfcSE_HEPG2_DNA'] = d1_deseq_plus_cluster_cnts_df['lfcSE_HEPG2_DNA'] * m_hepg2_d13
d1_deseq_plus_cluster_cnts_df['lfcSE_K562_DNA'] = d1_deseq_plus_cluster_cnts_df['lfcSE_K562_DNA'] * m_k562_d13
d1_deseq_plus_cluster_cnts_df['lfcSE_H2K'] = np.sqrt(d1_deseq_plus_cluster_cnts_df['lfcSE_HEPG2_DNA']**2 + d1_deseq_plus_cluster_cnts_df['lfcSE_K562_DNA']**2)

# rescale d2 cols to D3
d2_deseq_df['log2FoldChange_HEPG2'] = d2_deseq_df['log2FoldChange_HEPG2'] * m_hepg2_d23 + b_hepg2_d23
d2_deseq_df['log2FoldChange_K562'] = d2_deseq_df['log2FoldChange_K562'] * m_k562_d23 + b_k562_d23
d2_deseq_df['log2FoldChange_H2K'] = d2_deseq_df['log2FoldChange_HEPG2'] - d2_deseq_df['log2FoldChange_K562']

# se rescale
d2_deseq_df['lfcSE_HEPG2'] = d2_deseq_df['lfcSE_HEPG2'] * m_hepg2_d23
d2_deseq_df['lfcSE_K562'] = d2_deseq_df['lfcSE_K562'] * m_k562_d23
d2_deseq_df['lfcSE_H2K'] = np.sqrt(d2_deseq_df['lfcSE_HEPG2']**2 + d2_deseq_df['lfcSE_K562']**2)

d2_deseq_plus_cluster_cnts_df['log2FoldChange_HEPG2'] = d2_deseq_plus_cluster_cnts_df['log2FoldChange_HEPG2'] * m_hepg2_d23 + b_hepg2_d23
d2_deseq_plus_cluster_cnts_df['log2FoldChange_K562'] = d2_deseq_plus_cluster_cnts_df['log2FoldChange_K562'] * m_k562_d23 + b_k562_d23
d2_deseq_plus_cluster_cnts_df['log2FoldChange_H2K'] = d2_deseq_plus_cluster_cnts_df['log2FoldChange_HEPG2'] - d2_deseq_plus_cluster_cnts_df['log2FoldChange_K562']

d2_deseq_plus_cluster_cnts_df['lfcSE_HEPG2'] = d2_deseq_df['lfcSE_HEPG2'].copy()
d2_deseq_plus_cluster_cnts_df['lfcSE_K562'] = d2_deseq_df['lfcSE_K562'].copy()
d2_deseq_plus_cluster_cnts_df['lfcSE_H2K'] = d2_deseq_df['lfcSE_H2K'].copy()

In [24]:
# do dhs batch correction using the d2 to d3 scaling

dhs_deseq_df[HEPG2_COL] = dhs_deseq_df[HEPG2_COL] * m_hepg2_d23 + b_hepg2_d23
dhs_deseq_df[K562_COL] = dhs_deseq_df[K562_COL] * m_k562_d23 + b_k562_d23
dhs_deseq_df[H2K_COL] = dhs_deseq_df[HEPG2_COL] - dhs_deseq_df[K562_COL]

dhs_deseq_df[HEPG2_SE_COL] = dhs_deseq_df[HEPG2_SE_COL] * m_hepg2_d23
dhs_deseq_df[K562_SE_COL] = dhs_deseq_df[K562_SE_COL] * m_k562_d23
dhs_deseq_df[H2K_SE_COL] = np.sqrt(dhs_deseq_df[HEPG2_SE_COL]**2 + dhs_deseq_df[K562_SE_COL]**2)

In [None]:
# # rename the columns to be consistent
# d1_deseq_plus_cluster_cnts_df.rename(columns={D1_HEPG2_COL:HEPG2_COL,
#                                                 D1_K562_COL:K562_COL,
#                                                 D1_HEPG2_SE_COL:HEPG2_SE_COL,
#                                                 D1_K562_SE_COL:K562_SE_COL},inplace=True)

# d2_deseq_df.rename(columns={D2_HEPG2_COL:HEPG2_COL,
#                             D2_K562_COL:K562_COL,
#                             D2_HEPG2_SE_COL:HEPG2_SE_COL,
#                             D2_K562_SE_COL:K562_SE_COL},inplace=True)

# d3_seq_df.rename(columns={D3_HEPG2_COL:HEPG2_COL,
#                           D3_K562_COL:K562_COL,
#                           D3_HEPG2_SE_COL:HEPG2_SE_COL,
#                           D3_K562_SE_COL:K562_SE_COL},inplace=True)


# dhs_deseq_df.rename(columns={D2_HEPG2_COL:HEPG2_COL,
#                             D2_K562_COL:K562_COL,
#                             D2_HEPG2_SE_COL:HEPG2_SE_COL,
#                             D2_K562_SE_COL:K562_SE_COL},inplace=True)

### R2 vs R1 seq merging ###

In [26]:
# # first, load in the full deseq2 output dataframe
# d3_deseq_df = pd.read_csv('illumina_processing/log2fc/log2fc_enhancer.csv')
# # rename first column to sequence_name
# d3_deseq_df.rename(columns={'Unnamed: 0':'sequence_name'},inplace=True)
# # remove the 'design_' prefix from the sequence_name column
# d3_deseq_df['sequence_name'] = d3_deseq_df['sequence_name'].apply(lambda x: x[7:])

# # merge in lfcSE_HepG2_DNA and lfcSE_K562_DNA columns from d2_deseq_plus_cluster_cnts_df into top_enhancer_df by sequence_name
# d3_seq_df = d3_seq_df.merge(d3_deseq_df[['sequence_name','lfcSE_HepG2_DNA','lfcSE_K562_DNA']],on='sequence_name',how='left')

In [9]:
d3_ctrl_df = d3_seq_df[d3_seq_df['model_type']=='d2_meas'].copy()
d3_ctrl_df['og_seq_idx'] = d3_ctrl_df['sequence_name'].apply(lambda x: int(x.split('_')[-1]))
d3_ctrl_inds = d3_ctrl_df['og_seq_idx'].values

d2_ctrl_cols = ['log2FoldChange_HEPG2','lfcSE_HEPG2','log2FoldChange_K562','lfcSE_K562']
d3_ctrl_cols = ['log2FoldChange_HepG2_DNA','lfcSE_HEPG2','log2FoldChange_K562_DNA','lfcSE_K562','cell_type','sequence_name','og_seq_idx']

# create ctrl_df with d2_ctrl_cols from d2_deseq_df and d3_ctrl_cols from d3_ctrl_df

d2_ctrl_df = d2_deseq_df.iloc[d3_ctrl_inds][d2_ctrl_cols]
d3_ctrl_df = d3_ctrl_df[d3_ctrl_cols]

d2_ctrl_df = d2_ctrl_df.reset_index(drop=True)
d3_ctrl_df = d3_ctrl_df.reset_index(drop=True)

# rename cols in d2_ctrl_df to be f'{HEPG2_COL}_d2' and f'{K562_COL}_d2'
d2_ctrl_df.rename(columns={'log2FoldChange_HEPG2':f'{HEPG2_COL}_d2',
                           'lfcSE_HEPG2':f'lfcSE_HepG2_d2',
                           'log2FoldChange_K562':f'{K562_COL}_d2',
                           'lfcSE_K562':f'lfcSE_K562_d2'},inplace=True)
# rename cols in d3_ctrl_df to be f'{HEPG2_COL}_d3' and f'{K562_COL}_d3'
d3_ctrl_df.rename(columns={'log2FoldChange_HepG2_DNA':f'{HEPG2_COL}_d3',
                           'lfcSE_HEPG2':f'lfcSE_HepG2_d3',
                           'log2FoldChange_K562_DNA':f'{K562_COL}_d3',
                           'lfcSE_K562':f'lfcSE_K562_d3'},inplace=True)

d23_ctrl_df = pd.concat([d2_ctrl_df,d3_ctrl_df],axis=1)

# calculate inverse variance weighted average of log2FoldChange_HepG2_DNA and log2FoldChange_K562_DNA
hepg2_d2 = d23_ctrl_df[f'{HEPG2_COL}_d2']
hepg2_d3 = d23_ctrl_df[f'{HEPG2_COL}_d3']
hepg2_d2_se = d23_ctrl_df[f'lfcSE_HepG2_d2']
hepg2_d3_se = d23_ctrl_df[f'lfcSE_HepG2_d3']
d23_ctrl_df[f'{HEPG2_COL}_avg'] = (hepg2_d2/hepg2_d2_se**2 + hepg2_d3/hepg2_d3_se**2) / (1/hepg2_d2_se**2 + 1/hepg2_d3_se**2)
d23_ctrl_df['lfcSE_HepG2_avg'] = 1 / np.sqrt(1/hepg2_d2_se**2 + 1/hepg2_d3_se**2)

k562_d2 = d23_ctrl_df[f'{K562_COL}_d2']
k562_d3 = d23_ctrl_df[f'{K562_COL}_d3']
k562_d2_se = d23_ctrl_df[f'lfcSE_K562_d2']
k562_d3_se = d23_ctrl_df[f'lfcSE_K562_d3']
d23_ctrl_df[f'{K562_COL}_avg'] = (k562_d2/k562_d2_se**2 + k562_d3/k562_d3_se**2) / (1/k562_d2_se**2 + 1/k562_d3_se**2)
d23_ctrl_df['lfcSE_K562_avg'] = 1 / np.sqrt(1/k562_d2_se**2 + 1/k562_d3_se**2)


d23_ctrl_df[f'{H2K_COL}_avg'] = d23_ctrl_df[f'{HEPG2_COL}_avg'] - d23_ctrl_df[f'{K562_COL}_avg']
d23_ctrl_df[f'lfcSE_H2K_avg'] = d23_ctrl_df[[f'lfcSE_HepG2_avg',f'lfcSE_K562_avg']].apply(lambda x: np.sqrt(x[0]**2 + x[1]**2),axis=1)

hepg2_cols = [f'{HEPG2_COL}_d2',f'{HEPG2_COL}_d3',f'{HEPG2_COL}_avg',f'lfcSE_HepG2_d2',f'lfcSE_HepG2_d3',f'lfcSE_HepG2_avg']
k562_cols = [f'{K562_COL}_d2',f'{K562_COL}_d3',f'{K562_COL}_avg',f'lfcSE_K562_d2',f'lfcSE_K562_d3',f'lfcSE_K562_avg']
d23_ctrl_df[k562_cols]

  d23_ctrl_df[f'lfcSE_H2K_avg'] = d23_ctrl_df[[f'lfcSE_HepG2_avg',f'lfcSE_K562_avg']].apply(lambda x: np.sqrt(x[0]**2 + x[1]**2),axis=1)


Unnamed: 0,log2FoldChange_K562_DNA_d2,log2FoldChange_K562_DNA_d3,log2FoldChange_K562_DNA_avg,lfcSE_K562_d2,lfcSE_K562_d3,lfcSE_K562_avg
0,-2.367292,-2.674587,-2.572556,0.156129,0.110076,0.089965
1,-2.081859,-2.079870,-2.080832,0.152769,0.147779,0.106216
2,-2.583716,-0.585209,-1.762816,0.171279,0.205144,0.131478
3,-2.080197,-1.373441,-1.607332,0.163828,0.115219,0.094245
4,-2.005341,-0.725604,-1.759763,0.240597,0.493729,0.216284
...,...,...,...,...,...,...
203,2.778211,2.585729,2.642354,0.172691,0.111489,0.093665
204,1.861281,1.840041,1.843822,0.157359,0.073221,0.066386
205,1.678118,2.023670,1.943762,0.174418,0.095661,0.083874
206,-1.969111,-2.252198,-2.120750,0.181171,0.168678,0.123454


In [10]:
d23_ctrl_df['log2FoldChange_H2K_d2'] = d23_ctrl_df['log2FoldChange_HepG2_DNA_d2'] - d23_ctrl_df['log2FoldChange_K562_DNA_d2']
d23_ctrl_df['lfcSE_H2K_d2'] = np.sqrt(d23_ctrl_df['lfcSE_HepG2_d2']**2 + d23_ctrl_df['lfcSE_K562_d2']**2)
d23_ctrl_df['log2FoldChange_H2K_d3'] = d23_ctrl_df['log2FoldChange_HepG2_DNA_d3'] - d23_ctrl_df['log2FoldChange_K562_DNA_d3']
d23_ctrl_df['lfcSE_H2K_d3'] = np.sqrt(d23_ctrl_df['lfcSE_HepG2_d3']**2 + d23_ctrl_df['lfcSE_K562_d3']**2)

# k562_cols = [f'{K562_COL}_d3',f'{K562_COL}_d3',f'{K562_COL}_avg',f'lfcSE_K562_d3',f'lfcSE_K562_d3',f'lfcSE_K562_avg']
d23_ctrl_df[d23_ctrl_df['sequence_name']=='d2_seq_1099']

Unnamed: 0,log2FoldChange_HepG2_DNA_d2,lfcSE_HepG2_d2,log2FoldChange_K562_DNA_d2,lfcSE_K562_d2,log2FoldChange_HepG2_DNA_d3,lfcSE_HepG2_d3,log2FoldChange_K562_DNA_d3,lfcSE_K562_d3,cell_type,sequence_name,...,log2FoldChange_HepG2_DNA_avg,lfcSE_HepG2_avg,log2FoldChange_K562_DNA_avg,lfcSE_K562_avg,log2FoldChange_H2K_avg,lfcSE_H2K_avg,log2FoldChange_H2K_d2,lfcSE_H2K_d2,log2FoldChange_H2K_d3,lfcSE_H2K_d3
9,-0.805705,0.19568,3.891485,0.192511,-1.740064,0.076163,3.846755,0.054986,K562,d2_seq_1099,...,-1.617137,0.070976,3.850129,0.052871,-5.467265,0.088504,-4.69719,0.274501,-5.586819,0.093938


### R1 vs R0 seq merging ###

In [11]:
d2_cols = ['enhancer','log2FoldChange_HEPG2','lfcSE_HEPG2','log2FoldChange_K562','lfcSE_K562','sequence_name']
d2_ctrl_df = d2_deseq_df[d2_deseq_df['model']=='control_f'][d2_cols].copy()
# reset index
d2_ctrl_df.reset_index(inplace=True,drop=True)
# rename cols to D2 HEPG2, D2 K562, D2 H2K
d2_ctrl_df.rename(columns={'log2FoldChange_HEPG2': 'log2FoldChange_HEPG2_d2',
                           'lfcSE_HEPG2': 'lfcSE_HepG2_d2',
                           'log2FoldChange_K562': 'log2FoldChange_K562_d2',
                           'lfcSE_K562': 'lfcSE_K562_d2'}, inplace=True)


d1_cols = ['log2FoldChange_HEPG2_DNA','log2FoldChange_K562_DNA','lfcSE_HEPG2_DNA','lfcSE_K562_DNA','d1_sequence_name']
# get rows of d1_deseq_plus_cluster_cnts_df where seq is in d2_ctrl_df['enhancer]
d1_ctrl_df = d1_deseq_plus_cluster_cnts_df[d1_deseq_plus_cluster_cnts_df['seq'].isin(d2_ctrl_df['enhancer'])][d1_cols].copy()
# reset index
d1_ctrl_df.reset_index(inplace=True,drop=True)
# rename cols to D1 HEPG2, D1 K562, D1 H2K
d1_ctrl_df.rename(columns={'log2FoldChange_HEPG2_DNA': 'log2FoldChange_HEPG2_d1',
                            'lfcSE_HEPG2_DNA': 'lfcSE_HepG2_d1',
                           'log2FoldChange_K562_DNA': 'log2FoldChange_K562_d1',
                           'lfcSE_K562_DNA': 'lfcSE_K562_d1'}, inplace=True)

# concatenate d1_ctrl_df and d2_ctrl_df axis=1
d12_ctrl_df = pd.concat([d1_ctrl_df,d2_ctrl_df],axis=1)
# drop enhancer column
d12_ctrl_df.drop(columns=['enhancer'],inplace=True)

# calculate inverse variance weighted average of log2FoldChange_HepG2_DNA and log2FoldChange_K562_DNA
hepg2_d2 = d12_ctrl_df[f'log2FoldChange_HEPG2_d2']
hepg2_d1 = d12_ctrl_df[f'log2FoldChange_HEPG2_d1']
hepg2_d2_se = d12_ctrl_df[f'lfcSE_HepG2_d2']
hepg2_d1_se = d12_ctrl_df[f'lfcSE_HepG2_d1']
d12_ctrl_df[f'log2FoldChange_HEPG2_avg'] = (hepg2_d2/hepg2_d2_se**2 + hepg2_d1/hepg2_d1_se**2) / (1/hepg2_d2_se**2 + 1/hepg2_d1_se**2)
d12_ctrl_df['lfcSE_HepG2_avg'] = 1 / np.sqrt(1/hepg2_d2_se**2 + 1/hepg2_d1_se**2)

k562_d2 = d12_ctrl_df[f'log2FoldChange_K562_d2']
k562_d1 = d12_ctrl_df[f'log2FoldChange_K562_d1']
k562_d2_se = d12_ctrl_df[f'lfcSE_K562_d2']
k562_d1_se = d12_ctrl_df[f'lfcSE_K562_d1']
d12_ctrl_df[f'log2FoldChange_K562_avg'] = (k562_d2/k562_d2_se**2 + k562_d1/k562_d1_se**2) / (1/k562_d2_se**2 + 1/k562_d1_se**2)
d12_ctrl_df['lfcSE_K562_avg'] = 1 / np.sqrt(1/k562_d2_se**2 + 1/k562_d1_se**2)


d12_ctrl_df[f'{H2K_COL}_avg'] = d12_ctrl_df[f'log2FoldChange_HEPG2_avg'] - d12_ctrl_df[f'log2FoldChange_K562_avg']
d12_ctrl_df[f'lfcSE_H2K_avg'] = d12_ctrl_df[[f'lfcSE_HepG2_avg',f'lfcSE_K562_avg']].apply(lambda x: np.sqrt(x[0]**2 + x[1]**2),axis=1)

# # remove duplicate sequence name column
# d12_cols = list(d12_ctrl_df.columns.values)
# d12_cols[9]
# d12_cols = d12_cols[:9] + d12_cols[10:]
# d12_ctrl_df = d12_ctrl_df[d12_cols]

hepg2_cols = [f'log2FoldChange_HEPG2_d2',f'log2FoldChange_HEPG2_d1',f'log2FoldChange_HEPG2_avg',f'lfcSE_HepG2_d2',f'lfcSE_HepG2_d1',f'lfcSE_HepG2_avg',f'{H2K_COL}_avg','lfcSE_H2K_avg']
k562_cols = [f'log2FoldChange_K562_d2',f'log2FoldChange_K562_d1',f'log2FoldChange_K562_avg',f'lfcSE_K562_d2',f'lfcSE_K562_d1',f'lfcSE_K562_avg',f'{H2K_COL}_avg','lfcSE_H2K_avg']
d12_ctrl_df[k562_cols]

  d12_ctrl_df[f'lfcSE_H2K_avg'] = d12_ctrl_df[[f'lfcSE_HepG2_avg',f'lfcSE_K562_avg']].apply(lambda x: np.sqrt(x[0]**2 + x[1]**2),axis=1)


Unnamed: 0,log2FoldChange_K562_d2,log2FoldChange_K562_d1,log2FoldChange_K562_avg,lfcSE_K562_d2,lfcSE_K562_d1,lfcSE_K562_avg,log2FoldChange_H2K_avg,lfcSE_H2K_avg
0,-1.509961,-2.050335,-1.594855,0.216566,0.501634,0.198828,0.285849,0.275628
1,-2.616645,-2.008728,-2.562977,0.156711,0.503606,0.149633,3.542549,0.202855
2,0.728436,0.790577,0.744345,0.204686,0.348919,0.176550,-3.446691,0.258091
3,-0.872718,0.114669,-0.748628,0.169843,0.447981,0.158812,-0.668416,0.228454
4,-0.460773,-1.968087,-0.649224,0.197168,0.521600,0.184432,1.419995,0.247030
...,...,...,...,...,...,...,...,...
95,-1.152191,-0.097923,-1.007124,0.201888,0.505422,0.187484,-1.237211,0.266134
96,-2.017257,-1.724995,-2.002584,0.160521,0.698200,0.156439,2.327300,0.212518
97,-1.486992,-0.437138,-1.433547,0.149011,0.643407,0.145169,4.228476,0.197083
98,-2.176198,-1.970288,-2.156666,0.170776,0.527538,0.162475,-0.326383,0.227146


In [12]:
d12_ctrl_df['log2FoldChange_H2K_d1'] = d12_ctrl_df['log2FoldChange_HEPG2_d1'] - d12_ctrl_df['log2FoldChange_K562_d1']
d12_ctrl_df['lfcSE_H2K_d1'] = np.sqrt(d12_ctrl_df['lfcSE_HepG2_d1']**2 + d12_ctrl_df['lfcSE_K562_d1']**2)
d12_ctrl_df['log2FoldChange_H2K_d2'] = d12_ctrl_df['log2FoldChange_HEPG2_d2'] - d12_ctrl_df['log2FoldChange_K562_d2']
d12_ctrl_df['lfcSE_H2K_d2'] = np.sqrt(d12_ctrl_df['lfcSE_HepG2_d2']**2 + d12_ctrl_df['lfcSE_K562_d2']**2)

In [13]:
h2k_cols = ['sequence_name','d1_sequence_name','log2FoldChange_H2K_d1','log2FoldChange_H2K_d2','log2FoldChange_H2K_avg','lfcSE_H2K_d1','lfcSE_H2K_d2','lfcSE_H2K_avg']
d12_ctrl_df.sort_values(by='log2FoldChange_H2K_d2',ascending=False)[h2k_cols]

Unnamed: 0,sequence_name,d1_sequence_name,log2FoldChange_H2K_d1,log2FoldChange_H2K_d2,log2FoldChange_H2K_avg,lfcSE_H2K_d1,lfcSE_H2K_d2,lfcSE_H2K_avg
11,d2_seq_138,d1_seq_2814,3.901893,5.132355,4.873838,0.595395,0.217265,0.199655
29,d2_seq_319,d1_seq_8446,2.615251,4.655773,4.285953,0.717311,0.229001,0.212017
97,d2_seq_1326,d1_seq_29308,2.293851,4.523128,4.228476,0.707708,0.211045,0.197083
59,d2_seq_722,d1_seq_16438,2.035098,3.921838,3.650769,0.597387,0.213762,0.199306
15,d2_seq_165,d1_seq_3758,3.464242,3.750278,3.547521,0.724303,0.215172,0.199057
...,...,...,...,...,...,...,...,...
40,d2_seq_597,d1_seq_13664,-3.618304,-3.658980,-3.673204,0.438236,0.230574,0.200243
38,d2_seq_569,d1_seq_12799,-2.983199,-3.817074,-3.603342,0.452452,0.269825,0.230494
28,d2_seq_294,d1_seq_7666,-3.818336,-4.001691,-3.839654,0.404729,0.318062,0.243811
74,d2_seq_1034,d1_seq_22158,-3.248725,-4.464074,-4.122312,0.486197,0.302559,0.255024


In [14]:
d12_ctrl_df[d12_ctrl_df['d1_sequence_name']=='d1_seq_2814']

Unnamed: 0,log2FoldChange_HEPG2_d1,log2FoldChange_K562_d1,lfcSE_HepG2_d1,lfcSE_K562_d1,d1_sequence_name,log2FoldChange_HEPG2_d2,lfcSE_HepG2_d2,log2FoldChange_K562_d2,lfcSE_K562_d2,sequence_name,log2FoldChange_HEPG2_avg,lfcSE_HepG2_avg,log2FoldChange_K562_avg,lfcSE_K562_avg,log2FoldChange_H2K_avg,lfcSE_H2K_avg,log2FoldChange_H2K_d1,lfcSE_H2K_d1,log2FoldChange_H2K_d2,lfcSE_H2K_d2
11,2.053913,-1.84798,0.275109,0.528025,d1_seq_2814,3.090904,0.152265,-2.041452,0.154982,d2_seq_138,2.847732,0.133221,-2.026106,0.148709,4.873838,0.199655,3.901893,0.595395,5.132355,0.217265


In [66]:
d12_ctrl_df[d12_ctrl_df['sequence_name']=='d2_seq_1041']['log2FoldChange_HEPG2_d2'] - d12_ctrl_df[d12_ctrl_df['sequence_name']=='d2_seq_1041']['log2FoldChange_K562_d2']

75   -5.299993
dtype: float64

In [15]:
# get intersection of sequence_name column in d12_ctrl_df and d23_ctrl_df
ctrl_seq_names = list(set(d12_ctrl_df['sequence_name']).intersection(set(d23_ctrl_df['sequence_name'])))
print((ctrl_seq_names))

['d2_seq_1041']


In [16]:
# okay there is only 1 sequence that appears in all 3 datasets, seq 1041. For this sequence I should calculate the average of the 3 log2FoldChange_H2K values
d1_ctrl_df['sequence_name'] = d2_ctrl_df['sequence_name']

seq_1041_d3 = d3_ctrl_df[d3_ctrl_df['sequence_name']=='d2_seq_1041']
seq_1041_d2 = d2_ctrl_df[d2_ctrl_df['sequence_name']=='d2_seq_1041']
seq_1041_d1 = d1_ctrl_df[d1_ctrl_df['sequence_name']=='d2_seq_1041']

hepg2_d1 = seq_1041_d1['log2FoldChange_HEPG2_d1'].values[0]
hepg2_d2 = seq_1041_d2['log2FoldChange_HEPG2_d2'].values[0]
hepg2_d3 = seq_1041_d3['log2FoldChange_HepG2_DNA_d3'].values[0]
hepg2_d1_se = seq_1041_d1['lfcSE_HepG2_d1'].values[0]
hepg2_d2_se = seq_1041_d2['lfcSE_HepG2_d2'].values[0]
hepg2_d3_se = seq_1041_d3['lfcSE_HepG2_d3'].values[0]

k562_d1 = seq_1041_d1['log2FoldChange_K562_d1'].values[0]
k562_d2 = seq_1041_d2['log2FoldChange_K562_d2'].values[0]
k562_d3 = seq_1041_d3['log2FoldChange_K562_DNA_d3'].values[0]
k562_d1_se = seq_1041_d1['lfcSE_K562_d1'].values[0]
k562_d2_se = seq_1041_d2['lfcSE_K562_d2'].values[0]
k562_d3_se = seq_1041_d3['lfcSE_K562_d3'].values[0]

hepg2_avg = (hepg2_d1/hepg2_d1_se**2 + hepg2_d2/hepg2_d2_se**2 + hepg2_d3/hepg2_d3_se**2) / (1/hepg2_d1_se**2 + 1/hepg2_d2_se**2 + 1/hepg2_d3_se**2)
hepg2_avg_se = 1 / np.sqrt(1/hepg2_d1_se**2 + 1/hepg2_d2_se**2 + 1/hepg2_d3_se**2)

k562_avg = (k562_d1/k562_d1_se**2 + k562_d2/k562_d2_se**2 + k562_d3/k562_d3_se**2) / (1/k562_d1_se**2 + 1/k562_d2_se**2 + 1/k562_d3_se**2)
k562_avg_se = 1 / np.sqrt(1/k562_d1_se**2 + 1/k562_d2_se**2 + 1/k562_d3_se**2)

h2k_avg = hepg2_avg - k562_avg
h2k_avg_se = np.sqrt(hepg2_avg_se**2 + k562_avg_se**2)

print(hepg2_avg,hepg2_avg_se)
print(k562_avg,k562_avg_se)
print(h2k_avg,h2k_avg_se)

-1.93573361465433 0.11359565929084797
3.094342785555943 0.07536347660222159
-5.030076400210273 0.13632177894707806


In [17]:
# columns in seq_dfs that will need to be updated with values from ctrl_cols
d3_cols = ['log2FoldChange_HepG2_DNA','log2FoldChange_K562_DNA','log2FoldChange_H2K','lfcSE_HEPG2','lfcSE_K562','lfcSE_H2K']
d2_cols = ['log2FoldChange_HEPG2','log2FoldChange_K562','log2FoldChange_H2K','lfcSE_HEPG2','lfcSE_K562','lfcSE_H2K']
d1_cols = ['log2FoldChange_HEPG2_DNA','log2FoldChange_K562_DNA','log2FoldChange_H2K','lfcSE_HEPG2_DNA','lfcSE_K562_DNA','lfcSE_H2K']

d23_ctrl_cols = ['log2FoldChange_HepG2_DNA_avg','log2FoldChange_K562_DNA_avg','log2FoldChange_H2K_avg','lfcSE_HepG2_avg','lfcSE_K562_avg','lfcSE_H2K_avg']
d12_ctrl_cols = ['log2FoldChange_HEPG2_avg','log2FoldChange_K562_avg','log2FoldChange_H2K_avg','lfcSE_HepG2_avg','lfcSE_K562_avg','lfcSE_H2K_avg']

# merge d23_ctrl_df into d3_seq_df on sequence_name
# for each column in d23_ctrl_cols, update the corresponding column in d3_seq_df, using sequence_name

for _,row in d23_ctrl_df.iterrows():
    seqname = row['sequence_name']
    for col1,col2 in zip(d3_cols,d23_ctrl_cols):
        d3_seq_df.loc[d3_seq_df['sequence_name']==row['sequence_name'],col1] = row[col2]
        d3_deseq_plus_cluster_cnts_df.loc[d3_deseq_plus_cluster_cnts_df['sequence_name']==row['sequence_name'],col1] = row[col2]

for _,row in d23_ctrl_df.iterrows():
    seqname = row['sequence_name']
    for col1,col2 in zip(d2_cols,d23_ctrl_cols):
        d2_deseq_df.loc[d2_deseq_df['sequence_name']==row['sequence_name'],col1] = row[col2]
        d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']==row['sequence_name'],col1] = row[col2]

for _,row in d12_ctrl_df.iterrows():
    seqname = row['sequence_name']
    for col1,col2 in zip(d2_cols,d12_ctrl_cols):
        d2_deseq_df.loc[d2_deseq_df['sequence_name']==row['sequence_name'],col1] = row[col2]
        d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']==row['sequence_name'],col1] = row[col2]

for _,row in d12_ctrl_df.iterrows():
    seqname = row['d1_sequence_name']
    if seqname not in d1_deseq_plus_cluster_cnts_df['d1_sequence_name'].values:
        print(seqname)
        continue
    for col1,col2 in zip(d1_cols,d12_ctrl_cols):
        d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['d1_sequence_name']==seqname,col1] = row[col2]


# lastly, update the 1041 sequence in all 3 dataframes with the averaged values
d3_seq_df.loc[d3_seq_df['sequence_name']=='d2_seq_1041','log2FoldChange_HepG2_DNA'] = hepg2_avg
d3_seq_df.loc[d3_seq_df['sequence_name']=='d2_seq_1041','log2FoldChange_K562_DNA'] = k562_avg
d3_seq_df.loc[d3_seq_df['sequence_name']=='d2_seq_1041','log2FoldChange_H2K'] = h2k_avg
d3_seq_df.loc[d3_seq_df['sequence_name']=='d2_seq_1041','lfcSE_HEPG2'] = hepg2_avg_se
d3_seq_df.loc[d3_seq_df['sequence_name']=='d2_seq_1041','lfcSE_K562'] = k562_avg_se
d3_seq_df.loc[d3_seq_df['sequence_name']=='d2_seq_1041','lfcSE_H2K'] = h2k_avg_se

d3_deseq_plus_cluster_cnts_df.loc[d3_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','log2FoldChange_HepG2_DNA'] = hepg2_avg
d3_deseq_plus_cluster_cnts_df.loc[d3_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','log2FoldChange_K562_DNA'] = k562_avg
d3_deseq_plus_cluster_cnts_df.loc[d3_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','log2FoldChange_H2K'] = h2k_avg
d3_deseq_plus_cluster_cnts_df.loc[d3_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','lfcSE_HepG2'] = hepg2_avg_se
d3_deseq_plus_cluster_cnts_df.loc[d3_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','lfcSE_K562'] = k562_avg_se
d3_deseq_plus_cluster_cnts_df.loc[d3_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','lfcSE_H2K'] = h2k_avg_se

d2_deseq_df.loc[d2_deseq_df['sequence_name']=='d2_seq_1041','log2FoldChange_HEPG2'] = hepg2_avg
d2_deseq_df.loc[d2_deseq_df['sequence_name']=='d2_seq_1041','log2FoldChange_K562'] = k562_avg
d2_deseq_df.loc[d2_deseq_df['sequence_name']=='d2_seq_1041','log2FoldChange_H2K'] = h2k_avg
d2_deseq_df.loc[d2_deseq_df['sequence_name']=='d2_seq_1041','lfcSE_HEPG2'] = hepg2_avg_se
d2_deseq_df.loc[d2_deseq_df['sequence_name']=='d2_seq_1041','lfcSE_K562'] = k562_avg_se
d2_deseq_df.loc[d2_deseq_df['sequence_name']=='d2_seq_1041','lfcSE_H2K'] = h2k_avg_se

d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','log2FoldChange_HEPG2'] = hepg2_avg
d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','log2FoldChange_K562'] = k562_avg
d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','log2FoldChange_H2K'] = h2k_avg
d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','lfcSE_HEPG2'] = hepg2_avg_se
d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','lfcSE_K562'] = k562_avg_se
d2_deseq_plus_cluster_cnts_df.loc[d2_deseq_plus_cluster_cnts_df['sequence_name']=='d2_seq_1041','lfcSE_H2K'] = h2k_avg_se

seq_1041 = d2_deseq_df[d2_deseq_df['sequence_name']=='d2_seq_1041']['enhancer'].values[0]
d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['seq']==seq_1041,'log2FoldChange_HEPG2_DNA'] = hepg2_avg
d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['seq']==seq_1041,'log2FoldChange_K562_DNA'] = k562_avg
d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['seq']==seq_1041,'log2FoldChange_H2K'] = h2k_avg
d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['seq']==seq_1041,'lfcSE_HEPG2_DNA'] = hepg2_avg_se
d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['seq']==seq_1041,'lfcSE_K562_DNA'] = k562_avg_se
d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['seq']==seq_1041,'lfcSE_H2K'] = h2k_avg_se

In [142]:
# okay! I think this is finally done. I need to doublecheck some rows and then I can save these dataframes to csvs, and update the figure notebooks to use these new dataframes
# well, maybe I should do the batch correction in this notebook too, otherwise it will be annoying to avoid overwriting the control seqs

In [16]:
for seqname in ['d2_seq_1041']:#d23_ctrl_df['sequence_name'].values[0:1]:
    print(seqname)
    print(d3_seq_df[d3_seq_df['sequence_name']==seqname][['log2FoldChange_HepG2_DNA','log2FoldChange_K562_DNA','log2FoldChange_H2K','lfcSE_HepG2','lfcSE_K562','lfcSE_H2K']])
    print('---')
    # print(d3_deseq_plus_cluster_cnts_df[d3_deseq_plus_cluster_cnts_df['sequence_name']==seqname][['log2FoldChange_HepG2_DNA','log2FoldChange_K562_DNA','log2FoldChange_H2K','lfcSE_HepG2_DNA','lfcSE_K562_DNA','lfcSE_H2K']])
    # print('---')
    # print(d2_deseq_df[d2_deseq_df['sequence_name']==seqname][['log2FoldChange_HEPG2','log2FoldChange_K562','log2FoldChange_H2K','lfcSE_HEPG2','lfcSE_K562','lfcSE_H2K']])
    # print('---')
    # print(d2_deseq_plus_cluster_cnts_df[d2_deseq_plus_cluster_cnts_df['sequence_name']==seqname][['log2FoldChange_HEPG2','log2FoldChange_K562','log2FoldChange_H2K','lfcSE_HEPG2','lfcSE_K562','lfcSE_H2K']])
    # print('---')
    print(d1_deseq_plus_cluster_cnts_df[d1_deseq_plus_cluster_cnts_df['seq']==seq_1041][['log2FoldChange_HEPG2_DNA','log2FoldChange_K562_DNA','log2FoldChange_H2K','lfcSE_HEPG2_DNA','lfcSE_K562_DNA','lfcSE_H2K']])

d2_seq_1041
   log2FoldChange_HepG2_DNA  log2FoldChange_K562_DNA  log2FoldChange_H2K  \
6                 -1.935855                 3.094874           -5.030729   

   lfcSE_HepG2  lfcSE_K562  lfcSE_H2K  
6     0.109482    0.074024   0.132159  
---
       log2FoldChange_HEPG2_DNA  log2FoldChange_K562_DNA  log2FoldChange_H2K  \
22263                 -1.935855                 3.094874           -5.030729   

       lfcSE_HEPG2_DNA  lfcSE_K562_DNA  lfcSE_H2K  
22263         0.109482        0.074024   0.132159  


In [17]:
d1_deseq_plus_cluster_cnts_df.loc[d1_deseq_plus_cluster_cnts_df['d1_sequence_name']=='d1_seq_2814',d1_cols[2]] = d12_ctrl_df.loc[d12_ctrl_df['d1_sequence_name']=='d1_seq_2814',d12_ctrl_cols[2]].values[0]

In [70]:
d12_ctrl_df.loc[d12_ctrl_df['d1_sequence_name']=='d1_seq_2814',d12_ctrl_cols[2]].values[0]

4.799710970150846

In [71]:
d1_deseq_plus_cluster_cnts_df[d1_deseq_plus_cluster_cnts_df['d1_sequence_name']=='d1_seq_2814']['log2FoldChange_H2K']

2814    4.799711
Name: log2FoldChange_H2K, dtype: float64

In [19]:
# rename columns to match across dataframes, otherwise gets really annoying downstream

D2_HEPG2_COL = 'log2FoldChange_HEPG2'
D2_K562_COL = 'log2FoldChange_K562'

HEPG2_SE_COL = 'lfcSE_HEPG2'
K562_SE_COL = 'lfcSE_K562'
H2K_SE_COL = 'lfcSE_H2K'

D3_HEPG2_SE_COL = 'lfcSE_HepG2'
D3_K562_SE_COL = 'lfcSE_K562_DNA'

D1_HEPG2_SE_COL = 'lfcSE_HEPG2_DNA'
D1_K562_SE_COL = 'lfcSE_K562_DNA'

HEPG2_COL = 'log2FoldChange_HepG2'
K562_COL = 'log2FoldChange_K562'

# rename D1_HEPG2_SE_COL to HEPG2_SE_COL, D1_K562_SE_COL to K562_SE_COL
d1_deseq_plus_cluster_cnts_df.rename(columns={D1_HEPG2_SE_COL: HEPG2_SE_COL, D1_K562_SE_COL: K562_SE_COL}, inplace=True)
d1_deseq_plus_cluster_cnts_df.rename(columns={'log2FoldChange_HEPG2_DNA':'log2FoldChange_HepG2','log2FoldChange_K562_DNA':'log2FoldChange_K562'},inplace=True)

d2_deseq_df.rename(columns={'log2FoldChange_HEPG2':HEPG2_COL},inplace=True)
d2_deseq_plus_cluster_cnts_df.rename(columns={'log2FoldChange_HepG2':HEPG2_COL,D2_K562_COL:K562_COL},inplace=True)
# rename D3_HEPG2_SE_COL to HEPG2_SE_COL, D3_K562_SE_COL to K562_SE_COL in d3_design_df - no longer necessary, already updated
# d3_seq_df.rename(columns={D3_HEPG2_SE_COL:HEPG2_SE_COL,D3_K562_SE_COL:K562_SE_COL},inplace=True)
d3_seq_df.rename(columns={'log2FoldChange_HepG2_DNA':HEPG2_COL,'log2FoldChange_K562_DNA':'log2FoldChange_K562'},inplace=True)
# do the same for d3_deseq_plus_cluster_cnts_df
d3_deseq_plus_cluster_cnts_df.rename(columns={D3_HEPG2_SE_COL:HEPG2_SE_COL,D3_K562_SE_COL:K562_SE_COL},inplace=True)
d3_deseq_plus_cluster_cnts_df.rename(columns={'log2FoldChange_HepG2_DNA':HEPG2_COL,'log2FoldChange_K562_DNA':'log2FoldChange_K562'},inplace=True)

In [73]:
# K562_COL = 'log2FoldChange_K562'
# d2_deseq_plus_cluster_cnts_df.rename(columns={'log2FoldChange_K562_DNA':K562_COL},inplace=True)

In [20]:
d2_deseq_plus_cluster_cnts_df.columns

Index(['enhancer', 'model', 'design_type', 'indexes', 'cell_type',
       'log2FoldChange_HEPG2', 'log2FoldChange_K562',
       'log2FoldChange_H2K_deseq', 'pred_h2k_score', 'padj_HEPG2', 'padj_K562',
       'n_motifs', 'cluster_50', 'cluster_1', 'cluster_62', 'cluster_8',
       'cluster_27', 'cluster_112', 'cluster_39', 'cluster_9', 'cluster_2',
       'cluster_5', 'cluster_4', 'cluster_135', 'cluster_45', 'cluster_78',
       'cluster_59', 'cluster_18', 'cluster_99', 'cluster_43', 'cluster_7',
       'cluster_90', 'cluster_58', 'cluster_21', 'cluster_100', 'cluster_47',
       'cluster_76', 'cluster_84', 'cluster_10', 'cluster_56', 'cluster_42',
       'cluster_82', 'cluster_40', 'cluster_11', 'cluster_49', 'cluster_69',
       'cluster_67', 'cluster_53', 'cluster_120', 'cluster_55', 'cluster_65',
       'cluster_31', 'cluster_41', 'cluster_77', 'cluster_103', 'cluster_61',
       'cluster_6', 'cluster_32', 'n_unique_clusters', 'sequence_name',
       'lfcSE_HEPG2', 'lfcSE_K562', 'l

In [34]:
# finally, save these processed dfs

save_dir = 'bc_deseq_dfs/weighted/comb_dhs'
# if it doesn't exist, make it
if not os.path.exists(save_dir):
    os.makedirs(save_dir)


# rename d1_sequence_name col to sequence_name
d1_deseq_plus_cluster_cnts_df.rename(columns={'d1_sequence_name':'sequence_name'},inplace=True)
d1_deseq_plus_cluster_cnts_df.to_csv(f'{save_dir}/d1_deseq_plus_cluster_cnts_df.csv',index=False)

d2_deseq_df.to_csv(f'{save_dir}/d2_deseq_df.csv',index=False)
d2_deseq_plus_cluster_cnts_df.to_csv(f'{save_dir}/d2_deseq_plus_cluster_cnts_df.csv',index=False)

d3_seq_df.to_csv(f'{save_dir}/d3_seq_df.csv',index=False)
d3_deseq_plus_cluster_cnts_df.to_csv(f'{save_dir}/d3_deseq_plus_cluster_cnts_df.csv',index=False)

dhs_deseq_df.to_csv(f'{save_dir}/dhs_deseq_df.csv',index=False) # this includes cluster counts