In [2]:
import os
import subprocess


In [3]:

# 获取 BAM 文件目录
bam_dir = "/data/haocheng/data/bam/GM/"
bam_files = [f for f in os.listdir(bam_dir) if f.endswith('.bam')]
bam_files = sorted(bam_files)  # 确保文件按字母顺序排序


In [4]:
# 初始化 m 和 n 的值
m = 2  # 用于归一化的参数
n = 1  # 用于归一化的参数
bam_file1 = os.path.join(bam_dir, bam_files[0])  # 第一个 BAM 文件
bam_file2 = os.path.join(bam_dir, bam_files[1])  # 第二个 BAM 文件


In [4]:
# 函数：计算 BAM 文件的覆盖度
def get_coverage(bam_file):
    cmd = ["samtools", "depth", bam_file]
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout


In [4]:
# 函数：计算 BAM 文件的覆盖度
def get_coverage(bam_file):
    cmd = f"samtools depth {bam_file} | awk '($1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y)$/)'"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return result.stdout


In [4]:
# 函数：计算 BAM 文件的覆盖度
def get_coverage(bam_file):
    cmd = ["samtools", "depth", bam_file]
    return subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True)


In [5]:

# 获取两个 BAM 文件的覆盖度
coverage1 = get_coverage(bam_file1)  # 获取第一个 BAM 文件的覆盖度
coverage2 = get_coverage(bam_file2)  # 获取第二个 BAM 文件的覆盖度


In [None]:
with open('coverage1.txt', 'r') as file:
    for i in range(5):  # 打印前5行
        line = file.readline()
        print(line.strip())


In [11]:
# 函数：计算 BAM 文件的覆盖度并保存到文件
def get_coverage(bam_file, output_file):
    cmd = f"samtools depth {bam_file} | awk '($1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y)$/)' > {output_file}"
    subprocess.run(cmd, shell=True)

# 计算两个 BAM 文件的覆盖度并保存到文件
coverage1_file = "coverage1.txt"
coverage2_file = "coverage2.txt"
get_coverage(bam_file1, coverage1_file)  # 计算并保存第一个 BAM 文件的覆盖度
get_coverage(bam_file2, coverage2_file)  # 计算并保存第二个 BAM 文件的覆盖度

In [4]:
# 函数：计算 BAM 文件的覆盖度并分块保存到多个文件
def get_coverage_and_split(bam_file, output_file_base, num_chunks):
    cmd = f"samtools depth {bam_file} | awk '($1 ~ /^chr([1-9]|1[0-9]|2[0-2]|X|Y)$/)'"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    
    lines = result.stdout.splitlines()
    chunk_size = len(lines) // num_chunks
    
    for i in range(num_chunks):
        chunk_lines = lines[i * chunk_size: (i + 1) * chunk_size]
        with open(f"{output_file_base}_part{i + 1}.txt", 'w') as f_chunk:
            f_chunk.write("\n".join(chunk_lines) + "\n")
    
    # 处理最后一块数据（如果总行数不能被整除）
    if len(lines) % num_chunks != 0:
        with open(f"{output_file_base}_part{num_chunks}.txt", 'a') as f_chunk:
            f_chunk.write("\n".join(lines[num_chunks * chunk_size:]) + "\n")

# 示例使用
coverage1_base = "coverage1"
coverage2_base = "coverage2"
num_chunks = 3
get_coverage_and_split(bam_file1, coverage1_base, num_chunks)
get_coverage_and_split(bam_file2, coverage2_base, num_chunks)


In [5]:

# 修改函数：分部分计算覆盖度归一化
def normalize_with_awk_part(coverage1_part, coverage2_part, max_cov1, max_cov2, norm_max, m, n, output_file):
    awk_script = f"""
    BEGIN {{
        OFS = "\\t";  # 设置输出字段分隔符为制表符
        norm_max = {norm_max};
        m = {m};
        n = {n};
    }}
    FNR == NR {{
        split($0, fields, OFS);
        key = fields[1] OFS fields[2];  # 使用染色体和位点作为键
        cov1[key] = fields[3];  # 覆盖度是第三列
        next;
    }}
    {{
        split($0, fields, OFS);
        key = fields[1] OFS fields[2];  # 使用染色体和位点作为键
        cov2[key] = fields[3];  # 覆盖度是第三列
    }}
    END {{
        for (key in cov1) {{
            if (!(key in cov2)) cov2[key] = 0;
        }}
        for (key in cov2) {{
            if (!(key in cov1)) cov1[key] = 0;
        }}
        for (key in cov1) {{
            norm1 = (cov1[key] / {max_cov1}) * norm_max;
            norm2 = (cov2[key] / {max_cov2}) * norm_max;
            norm_cov = (norm1 * (n/m)) + (norm2 * (1/m));
            split(key, parts, OFS);  # 将key分割为染色体和位点
            print parts[0], parts[1], norm_cov;  # 打印染色体、位点和最终值
        }}
    }}
    """

    cmd = f"awk -f - {coverage1_part} {coverage2_part} > {output_file}"
    process = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    process.communicate(input=awk_script)
    if process.returncode != 0:
        print(f"Error running awk: {process.stderr.read()}")

# 函数：找到覆盖度文件中的最大覆盖度值
def find_max_coverage(file):
    cmd = f"awk '{{if($3>max) max=$3}} END {{print max}}' {file}"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return int(result.stdout.strip())

# 找到两个覆盖度文件中的最大覆盖度值
max_cov1 = 0
max_cov2 = 0

for i in range(1, num_chunks + 1):
    coverage1_part = f"{coverage1_base}_part{i}.txt"
    coverage2_part = f"{coverage2_base}_part{i}.txt"
    max_cov1 = max(max_cov1, find_max_coverage(coverage1_part))
    max_cov2 = max(max_cov2, find_max_coverage(coverage2_part))

norm_max = max(max_cov1, max_cov2)

In [15]:
# 函数：使用 awk 进行归一化
def normalize_with_awk_part(coverage1_part, coverage2_part, norm_max, m, n, output_file):
    awk_script = f"""
    BEGIN {{
        OFS = "\\t";  # 设置输出字段分隔符为制表符
        norm_max = {norm_max};
        m = {m};
        n = {n};
        max_cov1 = 0;
        max_cov2 = 0;
    }}
    FNR == NR {{
        split($0, fields, OFS);
        key = fields[1] OFS fields[2];  # 使用染色体和位点作为键
        cov1[key] = fields[3];  # 覆盖度是第三列
        if (fields[3] > max_cov1) max_cov1 = fields[3];
        next;
    }}
    {{
        split($0, fields, OFS);
        key = fields[1] OFS fields[2];  # 使用染色体和位点作为键
        cov2[key] = fields[3];  # 覆盖度是第三列
        if (fields[3] > max_cov2) max_cov2 = fields[3];
    }}
    END {{
        for (key in cov1) {{
            if (!(key in cov2)) cov2[key] = 0;
        }}
        for (key in cov2) {{
            if (!(key in cov1)) cov1[key] = 0;
        }}
        norm_max = (max_cov1 > max_cov2) ? max_cov1 : max_cov2;
        for (key in cov1) {{
            norm1 = (cov1[key] / max_cov1) * norm_max;
            norm2 = (cov2[key] / max_cov2) * norm_max;
            norm_cov = (norm1 * (n/m)) + (norm2 * (1/m));
            split(key, parts, OFS);  # 将key分割为染色体和位点
            print parts[0], parts[1], norm_cov;  # 打印染色体、位点和最终值
        }}
    }}
    """

    # 调用 awk 脚本进行处理
    with open(output_file, 'w') as out:
        process = subprocess.Popen(
            ['awk', '-f', '-', coverage1_part, coverage2_part],
            stdin=subprocess.PIPE,
            stdout=out,
            stderr=subprocess.PIPE,
            text=True
        )
        
        result, error = process.communicate(input=awk_script)
        if process.returncode != 0:
            print(f"Error running awk: {error}")


m = 2  # 用于归一化的参数
n = 1  # 用于归一化的参数

# 逐块计算并保存结果
result_parts = []
for i in range(1, num_chunks + 1):
    coverage1_part = f"{coverage1_base}_part{i}.txt"
    coverage2_part = f"{coverage2_base}_part{i}.txt"
    output_part = f"result_part{i}.txt"
    normalize_with_awk_part(coverage1_part, coverage2_part, norm_max, m, n, output_part)
    with open(output_part, 'r') as f:
        result_parts.append(f.read())
    os.remove(output_part)

# 合并所有分块结果
result = "".join(result_parts)

# 删除分块文件
for i in range(1, num_chunks + 1):
    os.remove(f"{coverage1_base}_part{i}.txt")
    os.remove(f"{coverage2_base}_part{i}.txt")

Error running awk: 
Error running awk: 
Error running awk: 


In [1]:
def normalize_with_awk(coverage1_file, coverage2_file, norm_max, m, n):
    awk_script = f"""
    BEGIN {{
        OFS = "\\t";  # 设置输出字段分隔符为制表符
        norm_max = {norm_max};
        m = {m};
        n = {n};
        max_cov1 = 0;
        max_cov2 = 0;
    }}
    FNR == NR {{
        split($0, fields, OFS);
        key = fields[1] OFS fields[2];  # 使用染色体和位点作为键
        cov1[key] = fields[3];  # 覆盖度是第三列
        if (fields[3] > max_cov1) max_cov1 = fields[3];
        next;
    }}
    {{
        split($0, fields, OFS);
        key = fields[1] OFS fields[2];  # 使用染色体和位点作为键
        cov2[key] = fields[3];  # 覆盖度是第三列
        if (fields[3] > max_cov2) max_cov2 = fields[3];
    }}
    END {{
        for (key in cov1) {{
            if (!(key in cov2)) cov2[key] = 0;
        }}
        for (key in cov2) {{
            if (!(key in cov1)) cov1[key] = 0;
        }}
        norm_max = (max_cov1 > max_cov2) ? max_cov1 : max_cov2;
        for (key in cov1) {{
            norm1 = (cov1[key] / max_cov1) * norm_max;
            norm2 = (cov2[key] / max_cov2) * norm_max;
            norm_cov = (norm1 * (n/m)) + (norm2 * (1/m));
            split(key, parts, OFS);  # 将key分割为染色体和位点
            print parts[0], parts[1], norm_cov;  # 打印染色体、位点和最终值
        }}
    }}
    """

    # 调用 awk 脚本进行处理
    process = subprocess.Popen(
        ['awk', '-f', '-', coverage1_file, coverage2_file],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    
    result, error = process.communicate(input=awk_script)
    if process.returncode != 0:
        print(f"Error running awk: {error}")
    return result

# 调用函数并打印结果
result = normalize_with_awk(coverage1_file, coverage2_file, norm_max=100, m=m, n=n)

NameError: name 'coverage1_file' is not defined

In [6]:

def normalize_with_awk(coverage1, coverage2, norm_max, m, n):
    awk_script = f"""
    BEGIN {{
        OFS = "\\t";  # 设置输出字段分隔符为制表符
        norm_max = {norm_max};
        m = {m};
        n = {n};
        max_cov1 = 0;
        max_cov2 = 0;
    }}
    {{
        split($0, fields, OFS);
        key = fields[1] OFS fields[2];  # 使用染色体和位点作为键
        
        if (NR <= lines_in_coverage1) {{
            cov1[key] = fields[3];  # 覆盖度是第三列
            if (fields[3] > max_cov1) max_cov1 = fields[3];
        }} else {{
            cov2[key] = fields[3];  # 覆盖度是第三列
            if (fields[3] > max_cov2) max_cov2 = fields[3];
        }}
    }}
    END {{
        for (key in cov1) {{
            if (!(key in cov2)) cov2[key] = 0;
        }}
        for (key in cov2) {{
            if (!(key in cov1)) cov1[key] = 0;
        }}
        norm_max = (max_cov1 > max_cov2) ? max_cov1 : max_cov2;
        for (key in cov1) {{
            norm1 = (cov1[key] / max_cov1) * norm_max;
            norm2 = (cov2[key] / max_cov2) * norm_max;
            norm_cov = (norm1 * (n/m)) + (norm2 * (1/m));
            total_cov = cov1[key] + cov2[key];
            split(key, parts, OFS);  # 将key分割为染色体和位点
            print parts[1], parts[2], norm_cov;  # 打印染色体、位点和最终值
        }}
    }}
    """

    # 计算第一部分覆盖度数据的行数
    lines_in_coverage1 = coverage1.strip().count("\n") + 1
    awk_script = f"""lines_in_coverage1={lines_in_coverage1}; {awk_script}"""
    
    # 合并覆盖度数据
    combined_coverage = coverage1 + "\n" + coverage2
    
    # 调用 awk 脚本进行处理
    process = subprocess.Popen(
        ['awk', awk_script],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    result, error = process.communicate(input=combined_coverage)
    if process.returncode != 0:
        print(f"Error running awk: {error}")
    return result

In [None]:
result = normalize_with_awk(coverage1, coverage2, norm_max=100, m=2, n=1)

In [8]:
# 使用 subprocess.run 调用 awk 进行覆盖度的处理和归一化
awk_script = (
    'BEGIN {max1=0; max2=0;} '
    'NR==FNR {cov1[$1,$2]=$3; if ($3 > max1) max1 = $3; next} '
    '{cov2[$1,$2]=$3; if ($3 > max2) max2 = $3;} '
    'END {'
    'for (key in cov1) {'
    'if (key in cov2) {'
    f'normalized_cov = (cov1[key] / max1 * ({n}/{m})) + (cov2[key] / max2 * ({1}/{m})); '
    'print key "\\t" normalized_cov; '
    '}}}'
)

# Create a new Popen process for awk
awk_proc = subprocess.Popen(
    ["awk", awk_script],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True
)

try:
    # 通过管道将两个覆盖度数据传递给 awk
    for line in coverage1.stdout:
        awk_proc.stdin.write(line)
    for line in coverage2.stdout:
        awk_proc.stdin.write(line)

    # Close the stdin to signal end of input
    awk_proc.stdin.close()

    # 获取 awk 处理后的结果
    result = awk_proc.stdout.read()
    awk_proc.stdout.close()
    awk_proc.wait()

    # 检查 AWK 命令的返回码
    print("AWK return code:", awk_proc.returncode)
    print(result.strip())  # 打印归一化覆盖度结果

except BrokenPipeError as e:
    print(f"Broken pipe error occurred: {e}")

finally:
    # 确保所有文件描述符都被关闭
    coverage1.stdout.close()
    coverage2.stdout.close()
    if awk_proc.stdin:
        awk_proc.stdin.close()
    if awk_proc.stdout:
        awk_proc.stdout.close()
    if awk_proc.stderr:
        awk_proc.stderr.close()

ValueError: I/O operation on closed file.

In [None]:
# 使用 subprocess.run 调用 awk 进行覆盖度的处理和归一化
subprocess_result = subprocess.run(
    ["awk", 
     'BEGIN {max1=0; max2=0;} '  # 初始化最大值
     'NR==FNR {cov1[$1,$2]=$3; if ($3 > max1) max1 = $3; next} '  # 存储到 cov1 数组并更新 max1
     '{cov2[$1,$2]=$3; if ($3 > max2) max2 = $3;} '  # 存储到 cov2 数组并更新 max2
     'END {'
     'for (key in cov1) {'  # 遍历 cov1
     'if (key in cov2) {'  # 检查 cov2 是否也存在该位点
     'normalized_cov = (cov1[key] / max1 * (1/' + str(m) + ')) + (cov2[key] / max2 * (1/' + str(m) + ')); '  # 归一化覆盖度
     'print key "\\t" normalized_cov; '  # 输出结果
     '}}}'],
    input=coverage1 + "\n" + coverage2,  # 将两个覆盖度数据传递给 awk
    text=True,
    capture_output=True  # 捕获输出结果
)

# 检查 AWK 命令的返回码和标准错误
print("AWK return code:", subprocess_result.returncode)
print("AWK stderr:", subprocess_result.stderr)

# 将 awk 处理后的结果存储在 result 变量中
result = subprocess_result.stdout.strip()

In [None]:
print("AWK stderr:", subprocess_result.stderr)


In [5]:
import os
import subprocess
import tempfile

# 初始化 m 和 n 的值
m = 2  # 用于归一化的参数
n = 1  # 用于归一化的参数

In [6]:
# 函数：计算 BAM 文件的覆盖度
def get_coverage(bam_file, temp_file):
    cmd = ["samtools", "depth", bam_file]
    with open(temp_file, 'w') as f:
        subprocess.run(cmd, stdout=f, text=True)

# 创建临时文件
with tempfile.NamedTemporaryFile(delete=False) as tmp1, tempfile.NamedTemporaryFile(delete=False) as tmp2:
    temp_file1 = tmp1.name
    temp_file2 = tmp2.name

try:
    # 获取两个 BAM 文件的覆盖度并存储到临时文件中
    get_coverage(bam_file1, temp_file1)
    get_coverage(bam_file2, temp_file2)

    # 使用 subprocess.run 调用 awk 进行覆盖度的处理和归一化
    awk_script = (
        'BEGIN {max1=0; max2=0;} '
        'NR==FNR {cov1[$1,$2]=$3; if ($3 > max1) max1 = $3; next} '
        '{cov2[$1,$2]=$3; if ($3 > max2) max2 = $3;} '
        'END {'
        'for (key in cov1) {'
        'if (key in cov2) {'
        f'normalized_cov = (cov1[key] / max1 * ({n}/{m})) + (cov2[key] / max2 * ({1}/{m})); '
        'print key "\\t" normalized_cov; '
        '}}}'
    )

    # Create a new Popen process for awk
    with open(temp_file1, 'r') as f1, open(temp_file2, 'r') as f2:
        awk_proc = subprocess.Popen(
            ["awk", awk_script],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )

        # 通过管道将两个覆盖度数据传递给 awk
        for line in f1:
            awk_proc.stdin.write(line)
        for line in f2:
            awk_proc.stdin.write(line)

        # Close the stdin to signal end of input
        awk_proc.stdin.close()

        # 获取 awk 处理后的结果
        result = awk_proc.stdout.read()
        awk_proc.stdout.close()
        awk_proc.wait()

        # 检查 AWK 命令的返回码
        print("AWK return code:", awk_proc.returncode)
        print(result.strip())  # 打印归一化覆盖度结果

except BrokenPipeError as e:
    print(f"Broken pipe error occurred: {e}")

finally:
    # 删除临时文件
    os.remove(temp_file1)
    os.remove(temp_file2)

Broken pipe error occurred: [Errno 32] Broken pipe


In [7]:

# 输出结果
print(result)  # 打印归一化覆盖度结果


NameError: name 'result' is not defined