# Usage notebook: filter aligned, sorted, and filtered BAM files for "valid reads" and build the plotting data structure

## Imports

In [1]:
import subprocess

## Step 1: Filter for "Valid Reads"

### Define path to input and output data

In [2]:
# Shared sample stem. The input BAM file name and the output prefix are both
# derived from it so the two cannot drift apart when the sample is changed.
sample_stem = "12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000"

# BAM file handed to Filter_For_Valid_Reads.py through its -b option.
input_bam = f"../../Exemplar_Data/bam/{sample_stem}.bam"
# Reference FASTA handed to the script through its -r option.
# NOTE(review): this directory is spelled "synthethics_reference" while outdir
# uses "synthetics_outdir" -- confirm the spelling matches the directory on disk.
reference = "../../Exemplar_Data/synthethics_reference/combined_synthetic_IVT_reference.fasta"
# Output directory handed to the script through its -o option.
outdir = "../../Exemplar_Data/synthetics_outdir/"
# Output file-name prefix handed to the script through its -p option.
prefix = f"{sample_stem}_filtered_for_valid_reads"


### Construct Command and Run

In [3]:
# Argument list for the valid-read filtering script. It is passed to
# subprocess.run as a list (no shell), one element per argument.
cmd = [
    "python",
    "./Filter_For_Valid_Reads.py",
    "-b", input_bam,
    "-r", reference,
    "-o", outdir,
    "-p", prefix,
]

# check=False is deliberate: the script's stdout/stderr are captured and
# printed first so its own error message is visible, and the failure is
# raised explicitly after the prints.
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    check=False,
)

print(f"Return code: {result.returncode}")
print(f"STDOUT: {result.stdout}")
print(f"STDERR: {result.stderr}")

# Raise CalledProcessError on a non-zero exit so that "Run All" stops here
# instead of continuing into later cells that read files from outdir.
result.check_returncode()

Return code: 0
STDOUT: Preview of collected Read IDs:
db9d177e-8114-42bc-893d-8bd438f7d14c
1f3e35a0-723c-4e0e-8d8b-69af1be5a1f3
a5622f83-ba37-4422-905d-7b1c8916ba47
96a1fcd9-b283-494c-9ceb-41f4d00fd22d
cedd3868-7e9c-497f-a7a6-85d7ee3eea52
bde53ae3-9e5b-4892-9ba9-38567231756b
eac88ef6-3d83-44bf-97d2-6561e42d010d
04d8aa98-bc83-44bb-a4b6-8211a73c4caa
92995c19-0e11-4db1-aa5b-d2d276e2010e
2d2562cc-487c-480f-a381-858122bbe509
Total unique read IDs: 12000
List of read IDs saved to: ../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000_filtered_for_valid_reads_read_ids_list.txt
Running samtools view to filter and index reads
Running: samtools view -@ 1 -N ../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000_filtered_for_valid_reads_read_ids_list.txt -b ../../Exemplar_Data/bam/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_e

## Step 2: Create the data structure used to process the data for plotting

### Define path to input and output data

In [7]:
# BAM file handed to Create_Data_Structure.py through its -b option.
# Presumably the file written by Step 1: it sits in that step's outdir and its
# name starts with that step's prefix -- verify if either value is changed.
input_bam_1 = "../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000_filtered_for_valid_reads_bam_filtered_for_valid_reads.bam"

# Output prefix handed to the script through its -p option.
# NOTE(review): this prefix starts with "../../". The recorded output of the
# run shows the pickle being saved to "<outdir>/../../<name>.pkl", i.e. outside
# outdir -- confirm that location is intended.
prefix_1 = "../../12_16_24_Mod_IVT_dorado.1.0.0_8_mods_boxen_plot_datastructure"

# Modification tags to look for, listed one per line and grouped by the
# leading base letter of each tag string.
tag_list = [
    # A
    "A+a.",
    "A+17596.",
    "A+69426.",
    # G
    "G+19229.",
    # C
    "C+m.",
    "C+19228.",
    # T
    "T+17802.",
    "T+19227.",
]


### Construct Command and Run

In [8]:
# Argument list for the data-structure script. It is passed to subprocess.run
# as a list (no shell), one element per argument.
#
# NOTE(review): the tags are joined into ONE comma-separated argument for -t.
# The recorded output of this cell prints the received value as a list with a
# single element ("['A+a.,A+17596.,...']"), which suggests the script may not
# split on commas. Confirm against Create_Data_Structure.py's argument parser;
# if -t accepts several values, pass them as separate arguments instead.
cmd = [
    "python",
    "./Create_Data_Structure.py",
    "-b", input_bam_1,
    "-o", outdir,
    "-p", prefix_1,
    "-t", ",".join(tag_list),
]

# check=False is deliberate: the script's stdout/stderr are captured and
# printed first so its own error message is visible, and the failure is
# raised explicitly after the prints.
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    check=False,
)

print(f"Return code: {result.returncode}")
print(f"STDOUT: {result.stdout}")
print(f"STDERR: {result.stderr}")

# Raise CalledProcessError on a non-zero exit so a failed run is not mistaken
# for a successful one when the notebook is executed top to bottom.
result.check_returncode()

Return code: 0
STDOUT: 
Processing BAM: ../../Exemplar_Data/synthetics_outdir/12_16_24_Mod_IVT_dorado.1.0.0_8_mods_threshold_0_emit_moves_aligned_sorted_filtered_downsampled_1000_filtered_for_valid_reads_bam_filtered_for_valid_reads.bam
Looking for these modifications: ['A+a.,A+17596.,A+69426.,G+19229.,C+m.,C+19228.,T+17802.,T+19227.']
Debug mode is: False

Saving final dictionary to ../../Exemplar_Data/synthetics_outdir/../../12_16_24_Mod_IVT_dorado.1.0.0_8_mods_boxen_plot_datastructure.pkl (pickle)...
Done.

STDERR: 
0it [00:00, ?it/s]
467it [00:00, 4664.28it/s]
967it [00:00, 4860.66it/s]
1463it [00:00, 4902.57it/s]
1969it [00:00, 4961.81it/s]
2466it [00:00, 4955.50it/s]
2970it [00:00, 4983.27it/s]
3469it [00:00, 4947.24it/s]
3980it [00:00, 4997.63it/s]
4480it [00:00, 4996.57it/s]
4989it [00:01, 5022.85it/s]
5495it [00:01, 5032.06it/s]
5999it [00:01, 5023.44it/s]
6502it [00:01, 4820.37it/s]
6986it [00:01, 4695.52it/s]
7458it [00:01, 4631.13it/s]
7923it [00:01, 4611.24it/s]
8385it [00