In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import itertools
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as mticker
import seaborn as sns
from scipy import stats
# Chromosome labels iterated by the copy-number helpers: chr1..chr22, then chrX.
chrom_list = [f'chr{n}' for n in range(1, 23)] + ['chrX']

# Default figure size and font for every plot in this notebook.
plt.rcParams.update({"figure.figsize": (6, 4), "font.family": "Arial"})

# Loading the data


In [2]:
# Raw ATAC fragment counts; the row labels are split on '-' below into
# three fields (stored as chroms / starts / ends).
frag_counts = pd.read_csv('../data/ATAC_fragcounts_raw.txt', sep=' ')
peak_ids = frag_counts.index


def parse_range(x):
    """Return field `x` of every '-'-separated label in frag_counts.index.

    Reads the global `frag_counts` at call time, so it must be called before
    the index is reset below.
    """
    return pd.Series([label.split('-')[x] for label in frag_counts.index])


chroms = parse_range(0)
starts = parse_range(1).astype('int')
ends = parse_range(2).astype('int')
frag_counts = frag_counts.reset_index(drop=True)

# Relabel sample F5 as F9 (the column moves to the last position).
frag_counts['F9'] = frag_counts.pop('F5')

par_cols = [f'P{i}' for i in range(1, 11)] + ['parent']
new_cols = [f'par_{i}' for i in range(1, 11)] + ['parent']
# Per-peak sum over the ten par_* columns plus 'parent'.
merged_parental = frag_counts[new_cols].sum(axis=1)
print(frag_counts.shape)

(259036, 43)


In [3]:
par_subclones = [f'par_{i}' for i in range(1, 11)]
median_pars = frag_counts[par_subclones].median(axis=1)

# Keep only peaks whose median count across the par_* columns exceeds the cutoff.
par_cutoff = 8
above_cutoff = median_pars > par_cutoff
frag_counts_filtered = frag_counts.loc[above_cutoff].copy()

# Apply the same row mask to the peak coordinates so they stay aligned
# with the filtered count matrix.
chroms_filtered = chroms[above_cutoff]
starts_filtered = starts[above_cutoff]
ends_filtered = ends[above_cutoff]

print(frag_counts_filtered.shape)

(230007, 43)


# Copy-number normalization

In [48]:
def loadCN(cn_path, sample_key):
    """Read a tab-separated table, double it, and rename columns to clone names.

    Parameters
    ----------
    cn_path : str
        Path to a tab-separated table with one column per sample. Each column
        name must contain at least one '_'; the second '_'-separated field is
        used as the sample ID.
    sample_key : DataFrame
        Must provide ``ID`` and ``clone`` columns; maps sample ID -> clone name.

    Returns
    -------
    DataFrame
        The table multiplied by 2, with columns renamed to clone names.

    Raises
    ------
    KeyError
        If a sample ID parsed from a column name is not in ``sample_key.ID``.
    """
    id_to_clone = dict(zip(sample_key['ID'], sample_key['clone']))
    depth = pd.read_csv(cn_path, sep='\t')
    scaled = depth * 2
    sample_ids = [column.split('_')[1] for column in scaled.columns]
    scaled.columns = [id_to_clone[sample_id] for sample_id in sample_ids]
    return scaled


def binCN(cn_df, step, bins_path='../data/10kb_bins.csv', chroms=None):
    """Average per-bin copy number into wider bins of ``step * 10000`` bp.

    The bin table at `bins_path` is paired with `cn_df` by row position, so
    both must describe the same bins in the same order.

    Parameters
    ----------
    cn_df : DataFrame
        One row per row of the bin table, one column per sample.
    step : int
        Output bin width expressed as a multiple of 10,000 bp.
    bins_path : str, optional
        Tab-separated bin table with ``Chr``, ``Start`` and ``End`` columns.
    chroms : iterable of str, optional
        Chromosomes to process, in output order. Defaults to the
        module-level ``chrom_list``.

    Returns
    -------
    cn_binned : DataFrame
        Mean of `cn_df` within each wide bin (NaNs are skipped by the mean),
        rows ordered by chromosome then bin, with a fresh 0..n-1 index.
    new_bins : DataFrame
        ``Start``, ``End`` and ``Chr`` of each row of `cn_binned`. Only wide
        bins that contain at least one input bin are emitted.

    Raises
    ------
    ValueError
        If a requested chromosome has no rows in the bin table.
    """
    if chroms is None:
        chroms = chrom_list
    bins_10kb = pd.read_csv(bins_path, sep='\t')
    bin_width = step * 10000

    # Collect per-chromosome pieces and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0.
    cn_parts = []
    bin_parts = []
    for chrom in chroms:
        on_chrom = (bins_10kb.Chr == chrom).values
        chrom_bins = bins_10kb[on_chrom]
        chrom_cn = cn_df.iloc[np.flatnonzero(on_chrom), :]

        # Left edges of the wide bins: 0, bin_width, 2*bin_width, ...
        edges = np.arange(0, max(chrom_bins.End), bin_width)
        # 1-based index of the wide bin holding each input bin's Start.
        bin_id = np.digitize(chrom_bins.Start, edges)
        cn_parts.append(chrom_cn.groupby(bin_id).mean())

        # groupby sorts its keys, so np.unique gives the matching row order.
        left = edges[np.unique(bin_id) - 1]
        bin_parts.append(
            pd.DataFrame({'Start': left, 'End': left + bin_width, 'Chr': chrom})
        )

    if not cn_parts:
        # pd.concat rejects an empty list; mirror the old empty-frame result.
        return pd.DataFrame(), pd.DataFrame()

    cn_binned = pd.concat(cn_parts, ignore_index=True)
    new_bins = pd.concat(bin_parts, ignore_index=True)

    return cn_binned, new_bins

def normalizeCN(frag_df, frag_chroms, frag_starts, cn_df, bins, samples, chroms=None):
    """Scale per-peak counts by the copy number of the bin containing each peak.

    For each peak, the copy-number row used is that of the bin on the same
    chromosome with the largest ``Start`` not exceeding the peak start. The
    result is ``2 * frag_df / copy_number``.

    Parameters
    ----------
    frag_df : DataFrame
        Peaks x samples count matrix.
    frag_chroms, frag_starts : Series
        Chromosome and start coordinate of each peak, indexed like `frag_df`.
    cn_df : DataFrame
        Copy number, one row per row of `bins` (same order), one column per
        sample.
    bins : DataFrame
        Must provide ``Chr`` and ``Start``; ``Start`` must be ascending within
        each chromosome (required by ``np.digitize``).
    samples : list
        Column labels to use; each must exist in both `frag_df` and `cn_df`.
    chroms : iterable of str, optional
        Chromosomes to look up. Defaults to the module-level ``chrom_list``.

    Returns
    -------
    DataFrame
        Same index as `frag_df`, columns `samples`. Values are NaN for peaks
        on a chromosome not in `chroms`, peaks starting before the first bin
        of their chromosome, and wherever the copy number itself is NaN.
    """
    if chroms is None:
        chroms = chrom_list
    frag_df = frag_df[samples].copy()
    cn_df = cn_df[samples].copy()

    # One piece per chromosome, concatenated once (DataFrame.append was
    # removed in pandas 2.0).
    cn_parts = []
    for chrom in chroms:
        in_chrom = (bins.Chr == chrom).values
        chrom_bins = bins[in_chrom]
        chrom_cn = cn_df[in_chrom].reset_index(drop=True)

        on_chrom = frag_chroms == chrom
        chrom_fragments = frag_df[on_chrom]
        chrom_starts = frag_starts[on_chrom]

        # 0-based row of the containing bin; -1 means "before the first bin".
        bin_pos = np.digitize(chrom_starts, chrom_bins.Start) - 1
        # reindex (unlike iloc) turns -1 into an all-NaN row instead of
        # silently wrapping around to the chromosome's last bin.
        peak_cn = chrom_cn.reindex(bin_pos)
        # Label each CN row with its peak so rows are matched by peak, not by
        # the order in which chromosomes happen to be visited.
        peak_cn.index = chrom_fragments.index
        cn_parts.append(peak_cn)

    if cn_parts:
        frag_cn = pd.concat(cn_parts)
    else:
        frag_cn = pd.DataFrame(columns=samples, dtype=float)

    # Restore the peak order of frag_df; peaks never visited become NaN.
    if not frag_cn.index.equals(frag_df.index):
        frag_cn = frag_cn.reindex(frag_df.index)

    frag_count_norm = 2*frag_df/frag_cn

    return frag_count_norm

In [49]:
cn_path = '../data/NormalizedReadDepth.txt'
sample_key = pd.read_csv('sample_key.txt', sep='\t')
cn_df = loadCN(cn_path, sample_key)

# Mask values below 0.5 as missing so they are skipped when bins are averaged.
cn_df = cn_df.mask(cn_df < 0.5)
# NOTE(review): 'parent' is taken from P10 here but is overwritten from
# par_1 (= P1) after binning below -- confirm which clone is intended.
cn_df['parent'] = cn_df['P10']

bins_10kb = pd.read_csv('../data/10kb_bins.csv', sep='\t')
# step=25 merges the 10,000 bp bins into 250,000 bp bins.
cn_binned, bins_250kb = binCN(cn_df, 25)
cn_250kb = cn_binned.copy()

# Fixing the sample name discrepancies between ATAC and WGS experiments:
# ATAC column prefix -> WGS column prefix, for subclones 1..10.
atac_to_wgs_prefix = (('F11_', '1a'), ('F9_', '2a'), ('par_', 'P'))
for i in range(1, 11):
    for atac_prefix, wgs_prefix in atac_to_wgs_prefix:
        cn_250kb[atac_prefix + str(i)] = cn_250kb[wgs_prefix + str(i)].values
cn_250kb['parent'] = cn_250kb['par_1']

# Using quantile normalization

In [6]:
## Loading the data
# NOTE: this rebinds `frag_counts` (previously the raw counts) to the matrix
# read from peakmatrixNORM220815.txt.
frag_counts = pd.read_csv('../data/peakmatrixNORM220815.txt', sep=' ').reset_index(drop=True)

# Relabel sample F5 as F9 (the column moves to the last position).
frag_counts['F9'] = frag_counts.pop('F5')

# Select the rows kept by the parental-median filter, by positional label.
# Assumes this file lists peaks in the same row order as the raw count
# file -- TODO confirm.
frag_counts_quantile = frag_counts.loc[frag_counts_filtered.index]
frag_counts_quantile.shape

(230007, 43)

In [51]:
lib_norm = frag_counts_quantile.copy()
# Divide each peak by the copy number of its 250 kb bin, for every sample column.
cn_norm = normalizeCN(
    lib_norm,
    chroms_filtered,
    starts_filtered,
    cn_250kb,
    bins_250kb,
    frag_counts_filtered.columns.tolist(),
)
# Replace the positional row labels with the original peak identifiers.
cn_norm.index = peak_ids[cn_norm.index]
# cn_norm.fillna('NA').to_csv('ATAC_frag_counts_CN_masked.txt', sep=' ')

# Fold-change calculation after normalization for library size and CN

In [52]:
# Fold change of every sample relative to the per-peak median of the par_* columns.
par_median = cn_norm[par_subclones].median(axis=1)
fc = cn_norm.div(par_median, axis=0)

# Column order: par_1..10, F9_1..10, F11_1..10, then the remaining samples.
sample_list = [prefix + str(i) for prefix in ('par_', 'F9_', 'F11_') for i in range(1, 11)]
sample_list += ['parent', 'K11', 'F11', 'B4', 'F9', 'N6', 'E5', 'F3', 'O16', 'K2', 'F2', 'E8', 'N11']
fc = fc[sample_list]