In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import re
import sys
local_modules_path = (Path(".") / '..').resolve()
sys.path.append(str(local_modules_path))
from utils import merge_intervals

In [69]:
files = list(Path(".").glob("*.bed.gz"))
files

[PosixPath('GSM4799763_H3K27ac_Spleen_M22_Peaks.bed.gz'),
 PosixPath('GSM4799624_ATAC_Liver_P350_Peaks.bed.gz'),
 PosixPath('GSM4799620_ATAC_Hypothalamus_P350_Peaks.bed.gz'),
 PosixPath('GSM4799752_H3K27ac_Lung_P348_Peaks.bed.gz'),
 PosixPath('GSM4799622_ATAC_Liver_M22_Peaks.bed.gz'),
 PosixPath('GSM4799726_H3K27ac_Cerebellum_M08_Peaks.bed.gz'),
 PosixPath('GSM4799765_H3K27ac_Spleen_P350_Peaks.bed.gz'),
 PosixPath('GSM4799606_ATAC_Adipose_M08_Peaks.bed.gz'),
 PosixPath('GSM4799756_H3K27ac_Muscle_M08_Peaks.bed.gz'),
 PosixPath('GSM4799610_ATAC_Cerebellum_M22_Peaks.bed.gz'),
 PosixPath('GSM4799734_H3K27ac_Cortex_P348_Peaks.bed.gz'),
 PosixPath('GSM4799761_H3K27ac_Spleen_B_Peaks.bed.gz'),
 PosixPath('GSM4799636_ATAC_Spleen_P350_Peaks.bed.gz'),
 PosixPath('GSM4799749_H3K27ac_Lung_B_Peaks.bed.gz'),
 PosixPath('GSM4799739_H3K27ac_Hypothalamus_M22_Peaks.bed.gz'),
 PosixPath('GSM4799744_H3K27ac_Liver_M08_Peaks.bed.gz'),
 PosixPath('GSM4799628_ATAC_Lung_P350_Peaks.bed.gz'),
 PosixPath('GSM47996

In [70]:
def get_meta(m): 
    return [ m.group(i) for i in [1,2,3] ]

matches = [ 
    get_meta(re.search(r"GSM\d+_(\w+)_([A-Za-z]+)_([A-Z])[0-9]*_Peaks\.bed\.gz", str(f)))
    for f in files ]
sample_meta = pd.DataFrame(matches, columns = ["assay", "tissue", "species"])
sample_meta["file"] = files
sample_meta

Unnamed: 0,assay,tissue,species,file
0,H3K27ac,Spleen,M,GSM4799763_H3K27ac_Spleen_M22_Peaks.bed.gz
1,ATAC,Liver,P,GSM4799624_ATAC_Liver_P350_Peaks.bed.gz
2,ATAC,Hypothalamus,P,GSM4799620_ATAC_Hypothalamus_P350_Peaks.bed.gz
3,H3K27ac,Lung,P,GSM4799752_H3K27ac_Lung_P348_Peaks.bed.gz
4,ATAC,Liver,M,GSM4799622_ATAC_Liver_M22_Peaks.bed.gz
...,...,...,...,...
74,ATAC,Hypothalamus,M,GSM4799617_ATAC_Hypothalamus_M08_Peaks.bed.gz
75,H3K27ac,Muscle,M,GSM4799757_H3K27ac_Muscle_M22_Peaks.bed.gz
76,H3K27ac,Adipose,B,GSM4799719_H3K27ac_Adipose_B_Peaks.bed.gz
77,ATAC,Adipose,M,GSM4799607_ATAC_Adipose_M22_Peaks.bed.gz


In [71]:
sample_meta["species"] = sample_meta["species"].map({
    "A" : "gallus_gallus", 
    "B" : "gallus_gallus", 
    "M" : "bos_taurus", 
    "P" : "sus_scrofa"
})
sample_meta

Unnamed: 0,assay,tissue,species,file
0,H3K27ac,Spleen,bos_taurus,GSM4799763_H3K27ac_Spleen_M22_Peaks.bed.gz
1,ATAC,Liver,sus_scrofa,GSM4799624_ATAC_Liver_P350_Peaks.bed.gz
2,ATAC,Hypothalamus,sus_scrofa,GSM4799620_ATAC_Hypothalamus_P350_Peaks.bed.gz
3,H3K27ac,Lung,sus_scrofa,GSM4799752_H3K27ac_Lung_P348_Peaks.bed.gz
4,ATAC,Liver,bos_taurus,GSM4799622_ATAC_Liver_M22_Peaks.bed.gz
...,...,...,...,...
74,ATAC,Hypothalamus,bos_taurus,GSM4799617_ATAC_Hypothalamus_M08_Peaks.bed.gz
75,H3K27ac,Muscle,bos_taurus,GSM4799757_H3K27ac_Muscle_M22_Peaks.bed.gz
76,H3K27ac,Adipose,gallus_gallus,GSM4799719_H3K27ac_Adipose_B_Peaks.bed.gz
77,ATAC,Adipose,bos_taurus,GSM4799607_ATAC_Adipose_M22_Peaks.bed.gz


In [72]:
unique = sample_meta[ ["assay", "tissue", "species"] ].drop_duplicates()
unique.shape

(40, 3)

In [75]:
import shutil
import time

column_names = [
    'chrom', 'start', 'end', 'name', 'score', 'strand',
    'signalValue', 'pValue', 'qValue', 'peak'
]
outdir = Path("merged")
outdir.mkdir(parents=True, exist_ok=True)

og_files = []
out_files = []
for i in range(unique.shape[0]):
    start_time = time.time()
    tissue = unique["tissue"].iloc[i]
    assay = unique["assay"].iloc[i]
    species = unique["species"].iloc[i]
    files = sample_meta.loc[ 
        (sample_meta["tissue"] == tissue) & 
        (sample_meta["assay"] == assay) & 
        (sample_meta["species"] == species), 
        "file"]
    outfile = f"{tissue}_{assay}_{species}.bed.gz"
    if False:
        df_all = pd.concat(
                    [ pd.read_csv(g, sep='\t', header=None, names=column_names) 
                     for g in files ],
                    axis=0)
        merged = merge_intervals(df_all)
        merged.to_csv(outdir / outfile, sep = "\t", index = False, header = False)
    out_files.append(outfile)
    og_files.append(",".join(files.astype(str)))
    print(f"{tissue} {assay} {species} done, took {time.time()-start_time}")

Spleen H3K27ac bos_taurus done, took 0.003409862518310547
Liver ATAC sus_scrofa done, took 0.0020742416381835938
Hypothalamus ATAC sus_scrofa done, took 0.002010822296142578
Lung H3K27ac sus_scrofa done, took 0.0020160675048828125
Liver ATAC bos_taurus done, took 0.0020067691802978516
Cerebellum H3K27ac bos_taurus done, took 0.0020172595977783203
Spleen H3K27ac sus_scrofa done, took 0.0019807815551757812
Adipose ATAC bos_taurus done, took 0.0019326210021972656
Muscle H3K27ac bos_taurus done, took 0.0019202232360839844
Cerebellum ATAC bos_taurus done, took 0.001920938491821289
Cortex H3K27ac sus_scrofa done, took 0.0019130706787109375
Spleen H3K27ac gallus_gallus done, took 0.0019145011901855469
Spleen ATAC sus_scrofa done, took 0.0018494129180908203
Lung H3K27ac gallus_gallus done, took 0.001798391342163086
Hypothalamus H3K27ac bos_taurus done, took 0.0017518997192382812
Liver H3K27ac bos_taurus done, took 0.0017559528350830078
Lung ATAC sus_scrofa done, took 0.0017855167388916016
Live

In [76]:
unique["filename"] = out_files
unique["original"] = og_files
unique

Unnamed: 0,assay,tissue,species,filename,original
0,H3K27ac,Spleen,bos_taurus,Spleen_H3K27ac_bos_taurus.bed.gz,"GSM4799763_H3K27ac_Spleen_M22_Peaks.bed.gz,GSM..."
1,ATAC,Liver,sus_scrofa,Liver_ATAC_sus_scrofa.bed.gz,"GSM4799624_ATAC_Liver_P350_Peaks.bed.gz,GSM479..."
2,ATAC,Hypothalamus,sus_scrofa,Hypothalamus_ATAC_sus_scrofa.bed.gz,GSM4799620_ATAC_Hypothalamus_P350_Peaks.bed.gz...
3,H3K27ac,Lung,sus_scrofa,Lung_H3K27ac_sus_scrofa.bed.gz,"GSM4799752_H3K27ac_Lung_P348_Peaks.bed.gz,GSM4..."
4,ATAC,Liver,bos_taurus,Liver_ATAC_bos_taurus.bed.gz,"GSM4799622_ATAC_Liver_M22_Peaks.bed.gz,GSM4799..."
5,H3K27ac,Cerebellum,bos_taurus,Cerebellum_H3K27ac_bos_taurus.bed.gz,GSM4799726_H3K27ac_Cerebellum_M08_Peaks.bed.gz...
6,H3K27ac,Spleen,sus_scrofa,Spleen_H3K27ac_sus_scrofa.bed.gz,"GSM4799765_H3K27ac_Spleen_P350_Peaks.bed.gz,GS..."
7,ATAC,Adipose,bos_taurus,Adipose_ATAC_bos_taurus.bed.gz,"GSM4799606_ATAC_Adipose_M08_Peaks.bed.gz,GSM47..."
8,H3K27ac,Muscle,bos_taurus,Muscle_H3K27ac_bos_taurus.bed.gz,"GSM4799756_H3K27ac_Muscle_M08_Peaks.bed.gz,GSM..."
9,ATAC,Cerebellum,bos_taurus,Cerebellum_ATAC_bos_taurus.bed.gz,GSM4799610_ATAC_Cerebellum_M22_Peaks.bed.gz


In [77]:
meta = unique[["tissue","assay","species","filename","original"]]
meta.to_csv("meta.tsv", sep="\t", index=False)

In [78]:
meta

Unnamed: 0,tissue,assay,species,filename,original
0,Spleen,H3K27ac,bos_taurus,Spleen_H3K27ac_bos_taurus.bed.gz,"GSM4799763_H3K27ac_Spleen_M22_Peaks.bed.gz,GSM..."
1,Liver,ATAC,sus_scrofa,Liver_ATAC_sus_scrofa.bed.gz,"GSM4799624_ATAC_Liver_P350_Peaks.bed.gz,GSM479..."
2,Hypothalamus,ATAC,sus_scrofa,Hypothalamus_ATAC_sus_scrofa.bed.gz,GSM4799620_ATAC_Hypothalamus_P350_Peaks.bed.gz...
3,Lung,H3K27ac,sus_scrofa,Lung_H3K27ac_sus_scrofa.bed.gz,"GSM4799752_H3K27ac_Lung_P348_Peaks.bed.gz,GSM4..."
4,Liver,ATAC,bos_taurus,Liver_ATAC_bos_taurus.bed.gz,"GSM4799622_ATAC_Liver_M22_Peaks.bed.gz,GSM4799..."
5,Cerebellum,H3K27ac,bos_taurus,Cerebellum_H3K27ac_bos_taurus.bed.gz,GSM4799726_H3K27ac_Cerebellum_M08_Peaks.bed.gz...
6,Spleen,H3K27ac,sus_scrofa,Spleen_H3K27ac_sus_scrofa.bed.gz,"GSM4799765_H3K27ac_Spleen_P350_Peaks.bed.gz,GS..."
7,Adipose,ATAC,bos_taurus,Adipose_ATAC_bos_taurus.bed.gz,"GSM4799606_ATAC_Adipose_M08_Peaks.bed.gz,GSM47..."
8,Muscle,H3K27ac,bos_taurus,Muscle_H3K27ac_bos_taurus.bed.gz,"GSM4799756_H3K27ac_Muscle_M08_Peaks.bed.gz,GSM..."
9,Cerebellum,ATAC,bos_taurus,Cerebellum_ATAC_bos_taurus.bed.gz,GSM4799610_ATAC_Cerebellum_M22_Peaks.bed.gz
