# Usage script to process aligned, sorted, and filtered BAM files for "valid reads"

## Imports

In [1]:
import subprocess
import sys

## Step 1: Filter for "Valid Reads"

### Define path to input and output data

In [2]:
# Paths are relative to the directory the notebook is run from.
# Long literals are split into directory + file name purely for readability;
# adjacent string literals are concatenated by Python at compile time.

# Input BAM handed to the filter script via "-b".
input_bam = (
    "../../Exemplar_Data/bam/"
    "12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000.bam"
)

# Reference FASTA handed to the filter script via "-r".
# NOTE(review): the directory is spelled "synthethics_reference" here but the
# output directory below is "synthetics_outdir" - confirm the on-disk name.
reference = (
    "../../Exemplar_Data/synthethics_reference/"
    "combined_synthetic_IVT_reference.fasta"
)

# Output directory (keeps its trailing slash) and the file-name prefix for outputs.
outdir = "../../Exemplar_Data/synthetics_outdir/"
prefix = (
    "12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000"
    "_filtered_for_valid_reads"
)


### Construct Command and Run

In [3]:
# Build the argument vector for the valid-read filter script.
# The script is launched with the kernel's own interpreter (sys.executable)
# instead of whatever "python" happens to resolve to on PATH, so it runs with
# the same interpreter and installed packages as this notebook.
cmd = [
    sys.executable,
    "./Filter_For_Valid_Reads.py",
    "-b", input_bam,
    "-r", reference,
    "-o", outdir,
    "-p", prefix,
]

# check=False on purpose: the captured stdout/stderr are printed first, and
# only afterwards is a failure turned into an exception (see below).
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    check=False,
)

print(f"Return code: {result.returncode}")
print(f"STDOUT: {result.stdout}")
print(f"STDERR: {result.stderr}")

# Raise CalledProcessError on a non-zero exit so that "Run All" stops here
# instead of continuing into later cells with missing outputs.
result.check_returncode()

Return code: 0
STDOUT: Preview of collected Read IDs:
db9d177e-8114-42bc-893d-8bd438f7d14c
1f3e35a0-723c-4e0e-8d8b-69af1be5a1f3
a5622f83-ba37-4422-905d-7b1c8916ba47
96a1fcd9-b283-494c-9ceb-41f4d00fd22d
cedd3868-7e9c-497f-a7a6-85d7ee3eea52
bde53ae3-9e5b-4892-9ba9-38567231756b
eac88ef6-3d83-44bf-97d2-6561e42d010d
04d8aa98-bc83-44bb-a4b6-8211a73c4caa
92995c19-0e11-4db1-aa5b-d2d276e2010e
2d2562cc-487c-480f-a381-858122bbe509
Total unique read IDs: 12000
List of read IDs saved to: ../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000_filtered_for_valid_reads_read_ids_list.txt
Running samtools view to filter and index reads
Running: samtools view -@ 1 -N ../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000_filtered_for_valid_reads_read_ids_list.txt -b ../../Exemplar_Data/bam/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_e

## Step 2: Create the "data structure" needed to process the data for plotting

### Define path to input and output data

In [4]:
# Input BAM for this step. The path is built from the `outdir` and `prefix`
# defined for Step 1 rather than repeated as a literal, so the two steps stay
# in sync if either value is edited. `outdir` already ends with "/", so plain
# concatenation yields the same string that was previously hard-coded here.
input_bam_1 = f"{outdir}{prefix}_bam_filtered_for_valid_reads.bam"

# File-name prefix for this step's output.
prefix_1 = "12_16_24_Mod_IVT_dorado.1.0.0_8_mods_boxen_plot_datastructure"

# Eight modification tags to look for, one row per canonical base (A, G, C, T).
# NOTE(review): these look like SAM MM-tag style "base+code." strings - confirm
# the exact format the data-structure script expects.
tag_list = [
    "A+a.", "A+17596.", "A+69426.",
    "G+19229.",
    "C+m.", "C+19228.",
    "T+17802.", "T+19227."
]


### Construct Command and Run

In [5]:
# Build the argument vector for the data-structure script.
# The script is launched with the kernel's own interpreter (sys.executable)
# instead of whatever "python" happens to resolve to on PATH, so it runs with
# the same interpreter and installed packages as this notebook.
#
# NOTE(review): the tags are passed as ONE comma-joined argument. The recorded
# output of this cell prints "Looking for these modifications:" followed by a
# single-element list holding the whole comma-joined string, which suggests the
# script may not be splitting it into eight tags. Check how the script parses
# "-t" and pass the tags in the form it actually expects.
cmd = [
    sys.executable,
    "./Create_Data_Structure.py",
    "-b", input_bam_1,
    "-o", outdir,
    "-p", prefix_1,
    "-t", ",".join(tag_list),
]

# check=False on purpose: the captured stdout/stderr are printed first, and
# only afterwards is a failure turned into an exception (see below).
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    check=False,
)

print(f"Return code: {result.returncode}")
print(f"STDOUT: {result.stdout}")
print(f"STDERR: {result.stderr}")

# Raise CalledProcessError on a non-zero exit so a failed run is not mistaken
# for a successful one when the notebook is executed top to bottom.
result.check_returncode()

Return code: 0
STDOUT: 
Processing BAM: ../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000_filtered_for_valid_reads_bam_filtered_for_valid_reads.bam
Looking for these modifications: ['A+a.,A+17596.,A+69426.,G+19229.,C+m.,C+19228.,T+17802.,T+19227.']
Debug mode is: False

Saving final dictionary to ../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_boxen_plot_datastructure.pkl (pickle)...
Done.

STDERR: 
0it [00:00, ?it/s]
479it [00:00, 4780.08it/s]
969it [00:00, 4846.20it/s]
1454it [00:00, 4834.65it/s]
1945it [00:00, 4863.84it/s]
2432it [00:00, 4852.89it/s]
2923it [00:00, 4871.94it/s]
3411it [00:00, 4806.35it/s]
3901it [00:00, 4835.30it/s]
4394it [00:00, 4863.64it/s]
4885it [00:01, 4876.58it/s]
5383it [00:01, 4905.59it/s]
5874it [00:01, 4888.54it/s]
6363it [00:01, 4761.00it/s]
6840it [00:01, 4649.89it/s]
7306it [00:01, 4544.17it/s]
7762it [00:01, 4511.71it/s]
8214it [00:01, 4