In [1]:
# The purpose of this notebook is to do two things:
# 1. get all uniquely mapping reads from the BAM file
# 2. subset the BAM file to only include each read's top-scoring (best AS) alignments

In [2]:
import pysam
import subprocess
import pandas as pd

In [16]:
def process_bams(prefix_path: str):
    r"""Keep each read's top-scoring alignments and list uniquely mapping reads.

    Reads ``<prefix_path>.bam`` and ``<prefix_path>.sorted.tsv`` and writes:

    * ``<prefix_path>_bestAS.bam`` (plus its index): every alignment whose
      ``AS`` tag equals the best score recorded for its read. Alignments
      without an ``AS`` tag are dropped.
    * ``<prefix_path>_unique_reads_best.txt``: reads that have exactly one
      alignment in the best-AS BAM, one read ID per line.
    * ``<prefix_path>_unique_reads.txt``: reads that occur exactly once in
      the TSV, one read ID per line.

    The TSV has no header and two tab-separated columns (read ID, alignment
    score). It must be sorted by read ID and then by descending score,
    because the first row seen for a read is taken as that read's best
    score. It is expected to exist already; the commands below produce it
    (TODO: add this to a snakemake rule):

        # Step 1: Convert BAM to TSV
        samtools view $bam_file | awk -F'\t' '{score=""; for(i=3; i<=NF; i++) { if ($i ~ /^AS:i:/) { score=substr($i, 6); break } } print $1 "\t" score}' >  "$input_directory/$bam_id/$bam_id.tsv"

        # Step 2: Sort TSV by Read ID and Score
        sort -k1,1 -k2,2nr -o "$input_directory/$bam_id/$bam_id.sorted.tsv" "$input_directory/$bam_id/$bam_id.tsv"

    Args:
        prefix_path: Path prefix shared by all input and output files.

    Raises:
        KeyError: If an alignment carrying an ``AS`` tag belongs to a read
            that is not listed in the TSV (BAM and TSV are out of sync).
    """
    bam_file = prefix_path + ".bam"
    sorted_tsv = prefix_path + ".sorted.tsv"
    best_bam = prefix_path + "_bestAS.bam"

    # Force read IDs to str: with dtype inference, purely numeric read names
    # would be parsed as integers and never match pysam's string query_name.
    df = pd.read_csv(sorted_tsv, sep="\t", header=None, dtype={0: str})

    # Reads that appear on exactly one row of the TSV.
    occurrences = df[0].value_counts()
    unique_reads_full = list(occurrences[occurrences == 1].index)

    # The TSV is sorted by descending score within each read, so the first
    # row of every read carries its top alignment score.
    first_rows = df.drop_duplicates(subset=[0], keep="first")
    read_scores = dict(zip(first_rows[0], first_rows[1]))

    # Number of alignments written to the best-AS BAM, per read.
    best_read_counts = dict.fromkeys(read_scores, 0)

    # Context managers guarantee both BAM handles are closed even if the
    # loop raises part-way through.
    with pysam.AlignmentFile(bam_file, "rb") as input_bam:
        with pysam.AlignmentFile(best_bam, "wb", template=input_bam) as output_bam:
            for read in input_bam:
                # Alignments without an AS tag cannot be ranked; skip them.
                if not read.has_tag("AS"):
                    continue

                name = read.query_name
                if name not in read_scores:
                    raise KeyError(
                        f"read {name!r} from {bam_file} is not listed in {sorted_tsv}"
                    )

                # Keep only alignments that reach the read's top score.
                if read_scores[name] == read.get_tag("AS"):
                    output_bam.write(read)
                    best_read_counts[name] += 1

    # Index only after the output file has been fully written and closed.
    pysam.index(best_bam)

    # Reads whose top score is reached by exactly one alignment.
    unique_reads_best = [
        name for name, n_written in best_read_counts.items() if n_written == 1
    ]

    with open(prefix_path + "_unique_reads_best.txt", "w") as f:
        f.writelines(f"{name}\n" for name in unique_reads_best)
    with open(prefix_path + "_unique_reads.txt", "w") as f:
        f.writelines(f"{name}\n" for name in unique_reads_full)

In [None]:
# Each sample lives in a directory named after the sample, and its files share
# that same name as a prefix, so the path prefix is "<dir>/<sample>/<sample>".
sample_ids = [
    "SGNex_MCF7_directcDNA_replicate3_run3",
    "SGNex_MCF7_directcDNA_replicate1_run2",
    "SGNex_MCF7_directcDNA_replicate4_run2",
]
basepath = [f"../longread_files/{sample}/{sample}" for sample in sample_ids]

# Process the samples one at a time, announcing each before it starts.
for prefix_path in basepath:
    print("processing: ", prefix_path)
    process_bams(prefix_path)