# ATAC-seq - Prepare .bed files for enrichment analysis
- goal: .bed files of result region sets
- input: DARs, DAR clusters, time-series clusters
- output: corresponding .bed files

In [1]:
cd ../

/home/sreichl/projects/bmdm-stim


In [2]:
# libraries
import os
import pandas as pd
import pybedtools as bedtools
import pathlib

In [3]:
# configs: all locations are relative to the current working directory
dir_data = os.path.join(
    'results',
    'ATAC',
    'all',
)
metadata_path = os.path.join(
    'metadata',
    'ATAC_sample_metadata.csv',
)
# absolute prefix used later to turn the relative result paths into absolute ones
project_prefix_path = os.getcwd()

# load data

In [4]:
# Load sample annotation; the first row is the header and the first column becomes the index
annot = pd.read_csv(
    metadata_path,
    header=0,
    index_col=0,
)
print(annot.shape)
annot.head()

(78, 10)


Unnamed: 0_level_0,library,timepoint after thawing,cell_line,Treatment,Treatment_time,mouse_pool,sex,bio-replicate,organism,experiment_id
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PT76_R1_C_albicans_2h,ATAC-seq,d010,BMDM,C_albicans,2h,PT62_c,female,R1,mouse,PT76
PT76_R1_C_albicans_4h,ATAC-seq,d010,BMDM,C_albicans,4h,PT62_c,female,R1,mouse,PT76
PT76_R1_C_albicans_6h,ATAC-seq,d010,BMDM,C_albicans,6h,PT62_c,female,R1,mouse,PT76
PT76_R1_C_albicans_8h,ATAC-seq,d010,BMDM,C_albicans,8h,PT62_c,female,R1,mouse,PT76
PT76_R1_untreated_0h,ATAC-seq,d010,BMDM,untreated,0h,PT62_c,female,R1,mouse,PT76


In [5]:
# load consensus region set: header-less, tab-separated file;
# the 4th column (position 3) is used as the row index
consensus_regions = pd.read_csv(
    os.path.join(dir_data, 'all_consensus_regions_filtered.bed'),
    sep='\t',
    header=None,
    index_col=3,
)
print(consensus_regions.shape)
consensus_regions.head()

(136735, 3)


Unnamed: 0_level_0,0,1,2
3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CONS00000000005,chr1,4414052,4415256
CONS00000000006,chr1,4416568,4417221
CONS00000000011,chr1,4491871,4492835
CONS00000000013,chr1,4622075,4622934
CONS00000000017,chr1,4755824,4756551


In [6]:
# load region annotation; the first column becomes the index
region_annot = pd.read_csv(
    os.path.join(dir_data, 'consensus_regions_annotation.csv'),
    header=0,
    index_col=0,
)
print(region_annot.shape)
region_annot.head()

(194339, 25)


Unnamed: 0_level_0,gencode_chr,gencode_start,gencode_end,gencode_length,gencode_feat_anchor,gencode_distance,gencode_location,gencode_feat_type,gencode_gene_id,gencode_gene_name,...,homer_Entrez ID,homer_Nearest Unigene,homer_Nearest Refseq,homer_Nearest Ensembl,homer_Gene Name,homer_Gene Alias,homer_Gene Description,homer_Gene Type,regulatoryBuild_reg_feature,regulatoryBuild_reg_feature_id
peak_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CONS00000000000,chr1,3445775,3446352,577,center,7364,PeakInsideFeature,gene:protein_coding,ENSMUSG00000051951.5,Xkr4,...,497097.0,Mm.454496,NM_001011874,ENSMUSG00000051951,Xkr4,AY534250|Gm210|XRG4|mKIAA1889,X-linked Kx blood group related 4,protein-coding,regulatory_region,ENSMUSR00000476182
CONS00000000001,chr1,4255761,4256262,501,center,51613,PeakInsideFeature,gene:protein_coding,ENSMUSG00000025900.13,Rp1,...,19888.0,Mm.294263,NM_011283,ENSMUSG00000025900,Rp1,Dcdc3|Gm38717|Orp1|Rp1h|mG145,retinitis pigmentosa 1 (human),protein-coding,reg_NONE,
CONS00000000002,chr1,4260896,4261397,501,center,56748,PeakInsideFeature,gene:protein_coding,ENSMUSG00000025900.13,Rp1,...,19888.0,Mm.294263,NM_011283,ENSMUSG00000025900,Rp1,Dcdc3|Gm38717|Orp1|Rp1h|mG145,retinitis pigmentosa 1 (human),protein-coding,reg_NONE,
CONS00000000003,chr1,4401366,4401938,572,start,7589,PeakInsideFeature,gene:protein_coding,ENSMUSG00000025900.13,Rp1,...,19888.0,Mm.294263,NM_011283,ENSMUSG00000025900,Rp1,Dcdc3|Gm38717|Orp1|Rp1h|mG145,retinitis pigmentosa 1 (human),protein-coding,reg_NONE,
CONS00000000004,chr1,4405390,4405891,501,start,3601,PeakInsideFeature,gene:protein_coding,ENSMUSG00000025900.13,Rp1,...,19888.0,Mm.294263,NM_011283,ENSMUSG00000025900,Rp1,Dcdc3|Gm38717|Orp1|Rp1h|mG145,retinitis pigmentosa 1 (human),protein-coding,reg_NONE,


In [7]:
# every distinct 'Treatment' value of the sample annotation, except 'untreated'
treatments = annot['Treatment'].unique().tolist()
treatments.remove('untreated')
treatments

['C_albicans', 'IFN_beta', 'IFN_gamma', 'LCMV_Cl13', 'LO28', 'LPS']

In [8]:
# background region set: a display name plus the absolute path to the consensus region .bed file
bg_regions_name = "BMDM"
bg_regions_path = os.path.join(
    project_prefix_path,
    dir_data,
    "all_consensus_regions_filtered.bed",
)
bg_regions_path

'/nobackup/lab_bock/projects/bmdm-stim/results/ATAC/all/all_consensus_regions_filtered.bed'

# DAR results

In [17]:
# result folder for the DAR .bed files
dir_results = os.path.join(dir_data, 'DEA', 'DAR_bedfiles')
# Create the folder together with any missing parent directories; this is a no-op
# when the folder already exists. Same call as used for the other result folders
# of this notebook (os.mkdir would fail if a parent directory were missing).
pathlib.Path(dir_results).mkdir(parents=True, exist_ok=True)

In [10]:
# collect the significant regions (adj.P.Val < 0.05) of every group, split by the sign of logFC
region_lists = dict()

for treatment in treatments:
    # analysis results of this treatment, indexed by column 'rn'
    dea_path = os.path.join(dir_data, 'DEA', 'DEA_' + treatment + '.tsv')
    tmp_results = pd.read_csv(dea_path, header=0, sep='\t', index_col='rn')
    is_significant = tmp_results['adj.P.Val'] < 0.05

    for group in tmp_results['group'].unique():
        in_group = tmp_results['group'] == group

        for direction in ['up', 'down']:
            # 'up' = positive logFC, 'down' = negative logFC; logFC == 0 belongs to neither
            if direction == 'up':
                has_sign = tmp_results['logFC'] > 0
            else:
                has_sign = tmp_results['logFC'] < 0
            sig_regions = tmp_results.loc[is_significant & in_group & has_sign].index.unique()

            # empty region sets are not recorded
            if len(sig_regions) > 0:
                region_lists[group + "_" + direction] = sig_regions

print(len(region_lists))
region_lists.keys()

51


dict_keys(['C_albicans_2h_up', 'C_albicans_2h_down', 'C_albicans_4h_up', 'C_albicans_4h_down', 'C_albicans_6h_up', 'C_albicans_6h_down', 'C_albicans_8h_up', 'C_albicans_8h_down', 'IFN_beta_2h_up', 'IFN_beta_2h_down', 'IFN_beta_4h_up', 'IFN_beta_4h_down', 'IFN_beta_6h_up', 'IFN_beta_6h_down', 'IFN_beta_8h_up', 'IFN_beta_8h_down', 'IFN_beta_24h_up', 'IFN_beta_24h_down', 'IFN_gamma_2h_up', 'IFN_gamma_2h_down', 'IFN_gamma_4h_up', 'IFN_gamma_4h_down', 'IFN_gamma_6h_up', 'IFN_gamma_6h_down', 'IFN_gamma_8h_up', 'IFN_gamma_8h_down', 'IFN_gamma_24h_up', 'IFN_gamma_24h_down', 'LCMV_Cl13_8h_down', 'LCMV_Cl13_24h_up', 'LCMV_Cl13_24h_down', 'LO28_2h_up', 'LO28_2h_down', 'LO28_4h_up', 'LO28_4h_down', 'LO28_6h_up', 'LO28_6h_down', 'LO28_8h_up', 'LO28_8h_down', 'LO28_24h_up', 'LO28_24h_down', 'LPS_2h_up', 'LPS_2h_down', 'LPS_4h_up', 'LPS_4h_down', 'LPS_6h_up', 'LPS_6h_down', 'LPS_8h_up', 'LPS_8h_down', 'LPS_24h_up', 'LPS_24h_down'])

In [11]:
# write one .bed file per region set: the consensus-region rows of its members plus an 'ID' column
for region_list, region_ids in region_lists.items():
    # the row index (region ID) is appended as the last column so it ends up in the file
    tmp_regions = consensus_regions.loc[region_ids, :].assign(ID=lambda regions: regions.index)
    bed_path = os.path.join(dir_results, "{}.bed".format(region_list))
    bedtools.BedTool().from_dataframe(tmp_regions).saveas(bed_path)

In [12]:
# Enrichment annotation table: one row per region set with the absolute path to its
# .bed file and the shared background region set.
# The rows are collected first and the frame is built once; growing a frame with
# DataFrame.append inside a loop is quadratic and the method no longer exists in pandas >= 2.0.
# Passing `columns=` also yields a well-formed (empty) table when there are no region sets.
enrichment_config = pd.DataFrame(
    [
        {
            'name': region_list,
            'regions_bed': os.path.join(project_prefix_path, dir_results, "{}.bed".format(region_list)),
            'background_name': bg_regions_name,
            'background_bed': bg_regions_path,
        }
        for region_list in region_lists
    ],
    columns=['name', 'regions_bed', 'background_name', 'background_bed'],
)

enrichment_config.to_csv(os.path.join(dir_results, "DAR_enrichment_annotation.csv"), index=False)
print(enrichment_config.shape)
enrichment_config.head()

(51, 4)


Unnamed: 0,name,regions_bed,background_name,background_bed
0,C_albicans_2h_up,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
1,C_albicans_2h_down,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
2,C_albicans_4h_up,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
3,C_albicans_4h_down,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
4,C_albicans_6h_up,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...


# DAR cluster results

In [43]:
# result folder for the DAR-cluster .bed files; created (including parents) if it is missing
dir_results = os.path.join(dir_data, 'DEA', 'DARclusters_bedfiles')
if not os.path.exists(dir_results):
    pathlib.Path(dir_results).mkdir(exist_ok=True, parents=True)

In [18]:
# configs: the cluster counts k = 4..12 whose clustering results are processed below
ks = [*range(4, 13)]
ks

[4, 5, 6, 7, 8, 9, 10, 11, 12]

In [40]:
# collect the regions of every DAR cluster, for each k
region_lists = dict()
for k in ks:
    # clustering result for this k (tab-separated)
    clusters_path = os.path.join(dir_data, 'DEA', 'Clusters_HM_{}'.format(k), 'Regions.tsv')
    DAR_cluster_results = pd.read_csv(clusters_path, header=0, sep='\t')
    # map each distinct 'value' to the list of its 'rn' entries;
    # the 'rn' entries are later used as keys into consensus_regions
    tmp_region_lists = DAR_cluster_results.groupby('value')['rn'].apply(list).to_dict()

    # store every list under a name that encodes k and the cluster value
    region_lists.update({
        "k{}_cluster_{}".format(k, key): regions
        for key, regions in tmp_region_lists.items()
    })

print(len(region_lists))
print(region_lists.keys())

72
dict_keys(['k4_cluster_1', 'k4_cluster_2', 'k4_cluster_3', 'k4_cluster_4', 'k5_cluster_1', 'k5_cluster_2', 'k5_cluster_3', 'k5_cluster_4', 'k5_cluster_5', 'k6_cluster_1', 'k6_cluster_2', 'k6_cluster_3', 'k6_cluster_4', 'k6_cluster_5', 'k6_cluster_6', 'k7_cluster_1', 'k7_cluster_2', 'k7_cluster_3', 'k7_cluster_4', 'k7_cluster_5', 'k7_cluster_6', 'k7_cluster_7', 'k8_cluster_1', 'k8_cluster_2', 'k8_cluster_3', 'k8_cluster_4', 'k8_cluster_5', 'k8_cluster_6', 'k8_cluster_7', 'k8_cluster_8', 'k9_cluster_1', 'k9_cluster_2', 'k9_cluster_3', 'k9_cluster_4', 'k9_cluster_5', 'k9_cluster_6', 'k9_cluster_7', 'k9_cluster_8', 'k9_cluster_9', 'k10_cluster_1', 'k10_cluster_2', 'k10_cluster_3', 'k10_cluster_4', 'k10_cluster_5', 'k10_cluster_6', 'k10_cluster_7', 'k10_cluster_8', 'k10_cluster_9', 'k10_cluster_10', 'k11_cluster_1', 'k11_cluster_2', 'k11_cluster_3', 'k11_cluster_4', 'k11_cluster_5', 'k11_cluster_6', 'k11_cluster_7', 'k11_cluster_8', 'k11_cluster_9', 'k11_cluster_10', 'k11_cluster_11', 'k

In [44]:
# write one .bed file per DAR cluster: the consensus-region rows of its members plus an 'ID' column
for region_list, region_ids in region_lists.items():
    # the row index (region ID) is appended as the last column so it ends up in the file
    tmp_regions = consensus_regions.loc[region_ids, :].assign(ID=lambda regions: regions.index)
    bed_path = os.path.join(dir_results, "{}.bed".format(region_list))
    bedtools.BedTool().from_dataframe(tmp_regions).saveas(bed_path)

In [45]:
# Enrichment annotation table: one row per DAR cluster with the absolute path to its
# .bed file and the shared background region set.
# The rows are collected first and the frame is built once; growing a frame with
# DataFrame.append inside a loop is quadratic and the method no longer exists in pandas >= 2.0.
# Passing `columns=` also yields a well-formed (empty) table when there are no region sets.
enrichment_config = pd.DataFrame(
    [
        {
            'name': region_list,
            'regions_bed': os.path.join(project_prefix_path, dir_results, "{}.bed".format(region_list)),
            'background_name': bg_regions_name,
            'background_bed': bg_regions_path,
        }
        for region_list in region_lists
    ],
    columns=['name', 'regions_bed', 'background_name', 'background_bed'],
)

enrichment_config.to_csv(os.path.join(dir_results, "DARclusters_enrichment_annotation.csv"), index=False)
print(enrichment_config.shape)
enrichment_config.head()

(72, 4)


Unnamed: 0,name,regions_bed,background_name,background_bed
0,k4_cluster_1,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
1,k4_cluster_2,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
2,k4_cluster_3,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
3,k4_cluster_4,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
4,k5_cluster_1,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...


# time-series cluster results

In [9]:
# result folder for the time-series cluster .bed files; created (including parents) if it is missing
dir_results = os.path.join(dir_data, 'time_series', 'timeseries_bedfiles')
if not os.path.exists(dir_results):
    pathlib.Path(dir_results).mkdir(exist_ok=True, parents=True)

In [10]:
# config: the cluster counts k = 2..10 whose clustering results are processed below
# (an extension of this list by [15, 20] was left commented out)
ks = [*range(2, 11)]
ks

[2, 3, 4, 5, 6, 7, 8, 9, 10]

In [11]:
# collect the regions of every time-series cluster, per treatment and per k
region_lists = dict()
for treatment in treatments:
    for k in ks:
        # clustering result of this treatment for this k
        clustering_path = os.path.join(
            dir_data, 'time_series', treatment, "k_{}".format(k), "clustering_" + treatment + ".csv"
        )
        tmp_results = pd.read_csv(clustering_path, header=0, index_col=0)
        # map each distinct value of column '1' to the list of its column '0' entries;
        # the column '0' entries are later used as keys into consensus_regions
        tmp_region_lists = tmp_results.groupby('1')['0'].apply(list).to_dict()

        # store every list under a name that encodes treatment, k and the cluster value
        region_lists.update({
            "{}_k{}_cluster_{}".format(treatment, k, key): regions
            for key, regions in tmp_region_lists.items()
        })

print(len(region_lists))
print(region_lists.keys())

324
dict_keys(['C_albicans_k2_cluster_1', 'C_albicans_k2_cluster_2', 'C_albicans_k3_cluster_1', 'C_albicans_k3_cluster_2', 'C_albicans_k3_cluster_3', 'C_albicans_k4_cluster_1', 'C_albicans_k4_cluster_2', 'C_albicans_k4_cluster_3', 'C_albicans_k4_cluster_4', 'C_albicans_k5_cluster_1', 'C_albicans_k5_cluster_2', 'C_albicans_k5_cluster_3', 'C_albicans_k5_cluster_4', 'C_albicans_k5_cluster_5', 'C_albicans_k6_cluster_1', 'C_albicans_k6_cluster_2', 'C_albicans_k6_cluster_3', 'C_albicans_k6_cluster_4', 'C_albicans_k6_cluster_5', 'C_albicans_k6_cluster_6', 'C_albicans_k7_cluster_1', 'C_albicans_k7_cluster_2', 'C_albicans_k7_cluster_3', 'C_albicans_k7_cluster_4', 'C_albicans_k7_cluster_5', 'C_albicans_k7_cluster_6', 'C_albicans_k7_cluster_7', 'C_albicans_k8_cluster_1', 'C_albicans_k8_cluster_2', 'C_albicans_k8_cluster_3', 'C_albicans_k8_cluster_4', 'C_albicans_k8_cluster_5', 'C_albicans_k8_cluster_6', 'C_albicans_k8_cluster_7', 'C_albicans_k8_cluster_8', 'C_albicans_k9_cluster_1', 'C_albicans_k

In [12]:
# write one .bed file per time-series cluster: the consensus-region rows of its members plus an 'ID' column
for region_list, region_ids in region_lists.items():
    # the row index (region ID) is appended as the last column so it ends up in the file
    tmp_regions = consensus_regions.loc[region_ids, :].assign(ID=lambda regions: regions.index)
    bed_path = os.path.join(dir_results, "{}.bed".format(region_list))
    bedtools.BedTool().from_dataframe(tmp_regions).saveas(bed_path)

In [13]:
# Enrichment annotation table: one row per time-series cluster with the absolute path to
# its .bed file and the shared background region set.
# The rows are collected first and the frame is built once; growing a frame with
# DataFrame.append inside a loop is quadratic and the method no longer exists in pandas >= 2.0.
# Passing `columns=` also yields a well-formed (empty) table when there are no region sets.
enrichment_config = pd.DataFrame(
    [
        {
            'name': region_list,
            'regions_bed': os.path.join(project_prefix_path, dir_results, "{}.bed".format(region_list)),
            'background_name': bg_regions_name,
            'background_bed': bg_regions_path,
        }
        for region_list in region_lists
    ],
    columns=['name', 'regions_bed', 'background_name', 'background_bed'],
)

enrichment_config.to_csv(os.path.join(dir_results, "timeseries_enrichment_annotation.csv"), index=False)
print(enrichment_config.shape)
enrichment_config.head()

(324, 4)


Unnamed: 0,name,regions_bed,background_name,background_bed
0,C_albicans_k2_cluster_1,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
1,C_albicans_k2_cluster_2,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
2,C_albicans_k3_cluster_1,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
3,C_albicans_k3_cluster_2,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
4,C_albicans_k3_cluster_3,/nobackup/lab_bock/projects/bmdm-stim/results/...,BMDM,/nobackup/lab_bock/projects/bmdm-stim/results/...
