In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import re

In [2]:
files = list(Path(".").glob("*.bed.gz"))
files

[PosixPath('GSM4322285_ATACseq_stage105_R1_peaks.bed.gz'),
 PosixPath('GSM4322287_ATACseq_stage12_R1_peaks.bed.gz'),
 PosixPath('GSM4322284_ATACseq_stage9_R2_peaks.bed.gz'),
 PosixPath('GSM4322294_ATACseq_Blastoporelip_R2_peaks.bed.gz'),
 PosixPath('GSM4322286_ATACseq_stage105_R2_peaks.bed.gz'),
 PosixPath('GSM4322289_ATACseq_stage16_R1_peaks.bed.gz'),
 PosixPath('GSM4322292_ATACseq_AnimalCap_R2_peaks.bed.gz'),
 PosixPath('GSM4322293_ATACseq_Blastoporelip_R1_peaks.bed.gz'),
 PosixPath('GSM4322288_ATACseq_stage12_R2_peaks.bed.gz'),
 PosixPath('GSM4322291_ATACseq_AnimalCap_R1_peaks.bed.gz'),
 PosixPath('GSM4322283_ATACseq_stage9_R1_peaks.bed.gz'),
 PosixPath('GSM4322290_ATACseq_stage16_R2_peaks.bed.gz')]

In [23]:
tissues = pd.DataFrame({ 
    "file" : files,
    "tissue" : [ re.search(r'ATACseq_([-A-Za-z0-9]+)_R[1-2]_peaks.bed.gz', str(g)).group(1) for g in files ]
})
tissues

Unnamed: 0,file,tissue
0,GSM4322285_ATACseq_stage105_R1_peaks.bed.gz,stage105
1,GSM4322287_ATACseq_stage12_R1_peaks.bed.gz,stage12
2,GSM4322284_ATACseq_stage9_R2_peaks.bed.gz,stage9
3,GSM4322294_ATACseq_Blastoporelip_R2_peaks.bed.gz,Blastoporelip
4,GSM4322286_ATACseq_stage105_R2_peaks.bed.gz,stage105
5,GSM4322289_ATACseq_stage16_R1_peaks.bed.gz,stage16
6,GSM4322292_ATACseq_AnimalCap_R2_peaks.bed.gz,AnimalCap
7,GSM4322293_ATACseq_Blastoporelip_R1_peaks.bed.gz,Blastoporelip
8,GSM4322288_ATACseq_stage12_R2_peaks.bed.gz,stage12
9,GSM4322291_ATACseq_AnimalCap_R1_peaks.bed.gz,AnimalCap


In [24]:
unique_tissues = tissues["tissue"].value_counts().index
unique_tissues

Index(['stage105', 'stage12', 'stage9', 'Blastoporelip', 'stage16',
       'AnimalCap'],
      dtype='object', name='tissue')

In [21]:
from pathlib import Path
import sys
local_modules_path = (Path(".") / '..').resolve()
sys.path.append(str(local_modules_path))
from utils import merge_intervals

0    GSM4322285_ATACseq_stage105_R1_peaks.bed.gz
4    GSM4322286_ATACseq_stage105_R2_peaks.bed.gz
Name: file, dtype: object
1    GSM4322287_ATACseq_stage12_R1_peaks.bed.gz
8    GSM4322288_ATACseq_stage12_R2_peaks.bed.gz
Name: file, dtype: object
2     GSM4322284_ATACseq_stage9_R2_peaks.bed.gz
10    GSM4322283_ATACseq_stage9_R1_peaks.bed.gz
Name: file, dtype: object
3    GSM4322294_ATACseq_Blastoporelip_R2_peaks.bed.gz
7    GSM4322293_ATACseq_Blastoporelip_R1_peaks.bed.gz
Name: file, dtype: object
5     GSM4322289_ATACseq_stage16_R1_peaks.bed.gz
11    GSM4322290_ATACseq_stage16_R2_peaks.bed.gz
Name: file, dtype: object
6    GSM4322292_ATACseq_AnimalCap_R2_peaks.bed.gz
9    GSM4322291_ATACseq_AnimalCap_R1_peaks.bed.gz
Name: file, dtype: object


In [34]:
column_names = [
    'chrom', 'start', 'end', 'name', 'score', 'strand',
    'signalValue', 'pValue', 'qValue', 'peak'
]

bedfiles = []
for tissue in unique_tissues: 
    files = tissues[tissues["tissue"] == tissue]["file"]
    concated = pd.concat([ pd.read_csv(f, sep='\t', names = column_names) for f in files ], axis = 0)
    merged = merge_intervals(concated)
    filename = f"{tissue}_ATAC.bed.gz"
    merged.to_csv(filename, sep="\t", header=False, index=False)
    bedfiles.append(filename)

In [38]:
meta = pd.DataFrame({
    "tissue" : unique_tissues, 
    "assay" : "ATAC", 
    "species" : "xenopus_tropicalis", 
    "filename" : bedfiles, 
    "original" : [ ",".join(tissues[tissues["tissue"] == tissue]["file"].astype(str)) for tissue in unique_tissues]
})

meta = meta[["tissue","assay","species","filename","original"]]
meta.to_csv("meta.tsv", sep="\t", index=False)

In [40]:
meta

Unnamed: 0,tissue,assay,species,filename,original
0,stage105,ATAC,xenopus_tropicalis,stage105_ATAC.bed.gz,"GSM4322285_ATACseq_stage105_R1_peaks.bed.gz,GS..."
1,stage12,ATAC,xenopus_tropicalis,stage12_ATAC.bed.gz,"GSM4322287_ATACseq_stage12_R1_peaks.bed.gz,GSM..."
2,stage9,ATAC,xenopus_tropicalis,stage9_ATAC.bed.gz,"GSM4322284_ATACseq_stage9_R2_peaks.bed.gz,GSM4..."
3,Blastoporelip,ATAC,xenopus_tropicalis,Blastoporelip_ATAC.bed.gz,GSM4322294_ATACseq_Blastoporelip_R2_peaks.bed....
4,stage16,ATAC,xenopus_tropicalis,stage16_ATAC.bed.gz,"GSM4322289_ATACseq_stage16_R1_peaks.bed.gz,GSM..."
5,AnimalCap,ATAC,xenopus_tropicalis,AnimalCap_ATAC.bed.gz,"GSM4322292_ATACseq_AnimalCap_R2_peaks.bed.gz,G..."
