In [8]:
import math
import os
import subprocess
# Directory holding the input BAM files; every intermediate and result file
# of the pipeline is written into this same directory.
bam_dir = "/data/haocheng/data/bam/GM/"
# Sorted so the processing order is deterministic from run to run.
bam_files = sorted(f for f in os.listdir(bam_dir) if f.endswith('.bam'))

# The first two BAM files seed the pipeline, so fail early with a clear message
# rather than with a bare IndexError when fewer than two are present.
if len(bam_files) < 2:
    raise ValueError(
        f"Expected at least two .bam files in {bam_dir}, found {len(bam_files)}"
    )

# Full paths of the first and second BAM files.
bam_file1 = os.path.join(bam_dir, bam_files[0])
bam_file2 = os.path.join(bam_dir, bam_files[1])

# os.path.join is used (as for the BAM paths above) so the trailing slash of
# bam_dir does not produce a doubled "//" in the derived paths.
sample1_peak_file = os.path.join(bam_dir, "sample1_peaks.narrowPeak")
sample2_peak_file = os.path.join(bam_dir, "sample2_peaks.narrowPeak")
sample1_signal_file = os.path.join(bam_dir, "sample1_signal.bed")
sample2_signal_file = os.path.join(bam_dir, "sample2_signal.bed")
combined_signal_file = os.path.join(bam_dir, "combined_signals.bed")
output_file = os.path.join(bam_dir, "updated_combined_signals.bed")


In [10]:
# Make sure every BAM file has an index next to it, creating one with
# samtools when it is missing.
for bam_file in bam_files:
    bam_file_path = os.path.join(bam_dir, bam_file)
    index_file_path = bam_file_path + ".bai"  # only the "<bam>.bai" name is checked

    if os.path.exists(index_file_path):
        print(f"Index file for {bam_file} already exists.")
        continue

    print(f"Index file for {bam_file} not found. Creating index...")
    # Pass an argument list and skip the shell so a path containing spaces or
    # shell metacharacters reaches samtools as a single argument.
    # check=True raises CalledProcessError when samtools exits non-zero,
    # which stops the loop at the first BAM that cannot be indexed.
    subprocess.run(["samtools", "index", bam_file_path], check=True)
    print(f"Index file created for {bam_file}.")

Index file for ENCFF185MTG.bam not found. Creating index...


[W::bam_hdr_read] EOF marker is absent. The input is probably truncated
[E::bgzf_uncompress] Inflate operation failed: 1
[E::bgzf_read] Read block operation failed with error 1 after 0 of 4 bytes
samtools index: failed to create index for "/data/haocheng/data/bam/GM/ENCFF185MTG.bam"


CalledProcessError: Command 'samtools index /data/haocheng/data/bam/GM/ENCFF185MTG.bam' returned non-zero exit status 1.

In [None]:

# Call peaks with MACS2 on the first two BAM files. The -n prefix decides the
# output names (sample1_* / sample2_*) written into bam_dir.
# Each command is an argument list executed without a shell, so BAM paths with
# spaces or shell metacharacters are passed through intact.
# NOTE(review): -g 3.1e9 differs from MACS2's built-in human preset
# (hs = 2.7e9) — confirm this genome size is intended.
macs2_command1 = [
    "macs2", "callpeak", "-t", bam_file1, "-f", "BAM", "-g", "3.1e9",
    "-n", "sample1", "--outdir", bam_dir, "--keep-dup", "all",
]
macs2_command2 = [
    "macs2", "callpeak", "-t", bam_file2, "-f", "BAM", "-g", "3.1e9",
    "-n", "sample2", "--outdir", bam_dir, "--keep-dup", "all",
]

# check=True raises CalledProcessError if macs2 exits with a non-zero status.
subprocess.run(macs2_command1, check=True)
subprocess.run(macs2_command2, check=True)

In [None]:
# Reduce each narrowPeak file to four tab-separated columns
# (fields 1, 2, 3 and 10) and store the result as the per-sample "signal" BED.
# NOTE(review): in the narrowPeak format field 10 is the summit offset and
# field 7 is signalValue — confirm that field 10 is really the value wanted.
column_program = r'{print $1"\t"$2"\t"$3"\t"$10}'
awk_command1 = f"awk '{column_program}' {sample1_peak_file} > {sample1_signal_file}"
awk_command2 = f"awk '{column_program}' {sample2_peak_file} > {sample2_signal_file}"

# The shell is required here because the commands rely on ">" redirection.
for command in (awk_command1, awk_command2):
    subprocess.run(command, shell=True, check=True)

In [None]:
# Sort both per-sample signal files with bedtools; the sorted copy is written
# next to the input with a ".sorted" suffix.
sort_template = "bedtools sort -i {0} > {0}.sorted"
sort_sample1_command = sort_template.format(sample1_signal_file)
sort_sample2_command = sort_template.format(sample2_signal_file)

# Run the two sort commands (shell needed for the ">" redirection).
for sort_command in (sort_sample1_command, sort_sample2_command):
    subprocess.run(sort_command, shell=True, check=True)

In [None]:
# Merge the two sorted signal files with bedtools unionbedg; intervals missing
# from one of the inputs get the filler value 0 in that input's column.
sorted_inputs = " ".join(
    f"{signal_path}.sorted"
    for signal_path in (sample1_signal_file, sample2_signal_file)
)
unionbedg_command = (
    f"bedtools unionbedg -i {sorted_inputs} -filler 0 > {combined_signal_file}"
)
subprocess.run(unionbedg_command, shell=True, check=True)

print(f"Combined signals written to {combined_signal_file}")

In [None]:
# Assigned here but not read by this cell: the summed result below goes to
# sample1_signal_file, not to output_file.
output_file = f"{bam_dir}/updated_combined_signals.bed"

# awk program: keep fields 1-3 and replace the two value columns by their sum.
sum_program = r'{print $1"\t"$2"\t"$3"\t"($4 + $5)}'
awk_command = f"awk '{sum_program}' {combined_signal_file} > {sample1_signal_file}"

# Overwrites sample1_signal_file, which from now on holds the running total.
result = subprocess.run(awk_command, shell=True, check=True)

print(f"Processed data written to {sample1_signal_file}")


In [None]:

# Remove the BAM files that have already been used from the list.
# NOTE: this rebinds bam_files from itself, so running the cell again drops
# two more entries.
bam_files = bam_files[2:]  # drop the first two elements
# Delete the intermediate files and keep only the final result file.
# NOTE(review): no code in this cell actually deletes anything.


In [None]:
# Fold every remaining BAM file into the running total kept in
# sample1_signal_file: call peaks, extract the value column, sort, union with
# the running total, then sum the two value columns back into the total.

# These paths are the same for every iteration, so they are set once.
sample2_peak_file = f"{bam_dir}/sample2_peaks.narrowPeak"
sample2_signal_file = f"{bam_dir}/sample2_signal.bed"
sorted_sample2_signal_file = f"{sample2_signal_file}.sorted"
combined_signal_file = f"{bam_dir}/combined_signals.bed"
output_file = f"{bam_dir}/updated_combined_signals.bed"  # not read in this cell

while bam_files:
    next_bam = os.path.join(bam_dir, bam_files[0])  # path of the next BAM file
    print(f"现在正在处理数据: {next_bam}")

    print(f"Running MACS2 on {next_bam}...")
    # Argument list without a shell, so unusual characters in the BAM path
    # cannot break the command line.
    macs2_command2 = [
        "macs2", "callpeak", "-t", next_bam, "-f", "BAM", "-g", "3.1e9",
        "-n", "sample2", "--outdir", bam_dir, "--keep-dup", "all",
    ]
    subprocess.run(macs2_command2, check=True)

    print(f"Extracting necessary columns from {sample2_peak_file} to {sample2_signal_file}...")
    # NOTE(review): narrowPeak field 10 is the summit offset, field 7 is
    # signalValue — confirm that field 10 is the intended value.
    awk_command2 = f"awk '{{print $1\"\\t\"$2\"\\t\"$3\"\\t\"$10}}' {sample2_peak_file} > {sample2_signal_file}"
    subprocess.run(awk_command2, shell=True, check=True)

    # unionbedg expects sorted inputs. The first pair of samples is sorted
    # before being merged, so the per-iteration file gets the same treatment
    # here. sample1_signal_file is derived line by line from earlier
    # unionbedg output and is passed as is.
    print(f"Sorting {sample2_signal_file} into {sorted_sample2_signal_file}...")
    sort_sample2_command = f"bedtools sort -i {sample2_signal_file} > {sorted_sample2_signal_file}"
    subprocess.run(sort_sample2_command, shell=True, check=True)

    print(f"Combining signals from {sample1_signal_file} and {sorted_sample2_signal_file} into {combined_signal_file}...")
    unionbedg_command = f"bedtools unionbedg -i {sample1_signal_file} {sorted_sample2_signal_file} -filler 0 > {combined_signal_file}"
    subprocess.run(unionbedg_command, shell=True, check=True)

    print(f"Combined signals written to {combined_signal_file}")

    print(f"Processing combined signals in {combined_signal_file} to {sample1_signal_file}...")
    # Sum the two value columns and overwrite the running total.
    awk_command = f"awk '{{print $1\"\\t\"$2\"\\t\"$3\"\\t\"($4 + $5)}}' {combined_signal_file} > {sample1_signal_file}"
    result = subprocess.run(awk_command, shell=True, check=True)

    print(f"Processed data written to {sample1_signal_file}")

    # Drop the BAM only after all of its steps succeeded: if a command raises,
    # the file stays at the head of the list and is retried on the next run.
    bam_files = bam_files[1:]


In [None]:
# Destination of the zero-coverage track.
output_file = '/data/haocheng/data/DNA/zero_coverage.bed'

# One whole-chromosome interval per chromosome, each with value 0:
# (name, start, end, value).
# BED starts are 0-based and ends are exclusive, so a chromosome is covered
# completely by [0, length); a start of 1 would leave out its first base.
# NOTE(review): the lengths look like GRCh38/hg38 chromosome sizes — confirm
# they match the assembly the BAM files were aligned to.
chromosomes = [
    ("chr1", 0, 248956422, 0),
    ("chr2", 0, 242193529, 0),
    ("chr3", 0, 198295559, 0),
    ("chr4", 0, 190214555, 0),
    ("chr5", 0, 181538259, 0),
    ("chr6", 0, 170805979, 0),
    ("chr7", 0, 159345973, 0),
    ("chr8", 0, 145138636, 0),
    ("chr9", 0, 138394717, 0),
    ("chr10", 0, 133797422, 0),
    ("chr11", 0, 135086622, 0),
    ("chr12", 0, 133275309, 0),
    ("chr13", 0, 114364328, 0),
    ("chr14", 0, 107043718, 0),
    ("chr15", 0, 101991189, 0),
    ("chr16", 0, 90338345, 0),
    ("chr17", 0, 83257441, 0),
    ("chr18", 0, 80373285, 0),
    ("chr19", 0, 58617616, 0),
    ("chr20", 0, 64444167, 0),
    ("chr21", 0, 46709983, 0),
    ("chr22", 0, 50818468, 0),
    ("chrX", 0, 156040895, 0),
    ("chrY", 0, 57227415, 0),
]

# Write one tab-separated line per chromosome.
with open(output_file, 'w') as f:
    for name, start, end, value in chromosomes:
        f.write(f"{name}\t{start}\t{end}\t{value}\n")

print(f"BED 文件已生成：{output_file}")

In [None]:
# Merge the accumulated signal with the zero-coverage track using
# bedtools unionbedg; intervals missing from one input get the filler value 0.
# NOTE(review): these inputs live under .../bam/GM12878/ and
# /data/haocheng/sorted_zero_coverage.bed, while the earlier cells write to
# bam_dir and /data/haocheng/data/DNA/zero_coverage.bed — confirm the files
# were copied/sorted outside this notebook.
signal_file = "/data/haocheng/data/bam/GM12878/sample1_signal.bed"
zero_coverage_file = "/data/haocheng/sorted_zero_coverage.bed"
gm12878_combined_file = "/data/haocheng/data/bam/GM12878/GM12878_combined.bed"

unionbedg_command = f"bedtools unionbedg -i {signal_file} {zero_coverage_file} -filler 0 > {gm12878_combined_file}"
subprocess.run(unionbedg_command, shell=True, check=True)

# Report the file that was actually written (the old message named an
# unrelated k562 path).
print(f"Combined signals written to {gm12878_combined_file}")

In [None]:
# Collapse the two value columns of the combined file into their sum and
# write the final four-column BED.
combined_file = "/data/haocheng/data/bam/GM12878/GM12878_combined.bed"
output_file = "/data/haocheng/data/bam/GM12878/GM12878.bed"

# awk program: keep fields 1-3 and emit $4 + $5 as the fourth column.
awk_command = f"awk '{{print $1\"\\t\"$2\"\\t\"$3\"\\t\"($4 + $5)}}' {combined_file} > {output_file}"

# The shell is needed for the ">" redirection; check=True raises on failure.
result = subprocess.run(awk_command, shell=True, check=True)

# Report the file that was actually written (previously sample1_signal_file
# was printed although the data goes to output_file).
print(f"Processed data written to {output_file}")

In [None]:
import pyBigWig

def calculate_max_min(bigwig_file, chunk_size=10_000_000):
    """Return the largest and smallest non-NaN value stored in a BigWig file.

    Every chromosome reported by the file is scanned in windows of
    ``chunk_size`` bases, so only one window of values is held in memory at a
    time instead of a whole chromosome.

    Args:
        bigwig_file: Path handed to ``pyBigWig.open``.
        chunk_size: Number of bases requested per ``values`` call; must be
            positive.

    Returns:
        A ``(max_value, min_value)`` tuple. If the file holds no non-NaN
        value at all, the result is ``(float('-inf'), float('inf'))``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bases")

    # Start from the identity elements so any real value replaces them.
    max_value = float('-inf')
    min_value = float('inf')

    bw = pyBigWig.open(bigwig_file)
    try:
        # chroms() maps each chromosome name to its length.
        for chrom, length in bw.chroms().items():
            for start in range(0, length, chunk_size):
                end = min(start + chunk_size, length)
                # NaN entries are skipped, as in the original filter.
                valid = [v for v in bw.values(chrom, start, end) if not math.isnan(v)]
                if not valid:
                    continue
                max_value = max(max_value, max(valid))
                min_value = min(min_value, min(valid))
    finally:
        # Close the handle even when reading fails part-way through.
        bw.close()

    return max_value, min_value

# Scan the k562 BigWig track and report its value range.
bigwig_file = "/data/haocheng/k562.bigWig"
max_value, min_value = calculate_max_min(bigwig_file)
for label, extreme in (("Max", max_value), ("Min", min_value)):
    print(f"{label} value: {extreme}")