In [1]:
import pybedtools
import pandas as pd

In [2]:
# Read the selected-probe table from CSV into a DataFrame
probe_input = "kmers_selected_probes.csv"
probe_df = pd.read_csv(filepath_or_buffer=probe_input)

In [3]:
# Select the five columns needed for the BED-style table (in output order)
# and relabel them with BED-style names in a single expression.
probe_bed_df = probe_df[["seqname", "start", "end", "gene_ID", "strand"]].rename(
    columns={
        "seqname": "chrom",
        "start": "chromstart",
        "end": "chromend",
        "gene_ID": "name",
        "strand": "strand",
    }
)

In [4]:
# Write the table as tab-separated text, without a header row or index column
probe_output = "probes_DLG4.bed"
probe_bed_df.to_csv(
    probe_output,
    sep="\t",
    header=False,
    index=False,
)

In [5]:
import subprocess
import pandas as pd

# Input/output locations for the intersect step
probe_bed = "probes_DLG4.bed"  # probe intervals (bedtools file A)
canonical_bed = "canonical_transcripts_cleaned.bed"  # canonical transcript intervals (bedtools file B)
output_bed = "annotated_probes.bed"  # raw bedtools output is written here

# BEDTools intersect command.
# `-wao` writes each A entry with each overlapping B entry plus the number of
# overlapping base pairs; A entries without any overlap are still reported
# (with a null B entry and overlap 0).
cmd = [
    "bedtools", "intersect",
    "-a", probe_bed,
    "-b", canonical_bed,
    "-wao"
]

# Run BEDTools, sending its stdout straight into the output file.
# check=True makes a non-zero bedtools exit raise CalledProcessError here,
# instead of silently leaving an empty/partial file that only fails later
# (and less clearly) when pandas tries to parse it.
with open(output_bed, "w") as output:
    subprocess.run(cmd, stdout=output, check=True)

# Load the results for analysis (bedtools output has no header row)
df = pd.read_csv(output_bed, sep="\t", header=None)

# Assign column names: 5 probe columns, 5 canonical columns, then the overlap
# length. Assigning a list of the wrong length raises, which doubles as a
# check that both inputs really are 5-column files.
# NOTE(review): strand is the 5th column here, whereas standard BED puts score
# in column 5 and strand in column 6 — harmless for this plain overlap, but
# confirm before adding strand-aware bedtools options.
df.columns = [
    "probe_chr", "probe_start", "probe_end", "probe_info", "probe_strand",
    "canonical_chr", "canonical_start", "canonical_end", "canonical_info", "canonical_strand",
    "overlap"
]

# Quantify coverage.
# The grouping key is the probe file's name column, so this is the total
# overlap over all rows sharing that name (one row per probe/canonical pair),
# not a per-interval figure.
# NOTE(review): if the name column holds a gene ID rather than a unique probe
# ID, the "by probe" summary below is effectively per gene, and a probe that
# hits several canonical intervals is counted once per hit — confirm intent.
coverage_summary = df.groupby("probe_info")["overlap"].sum()

# Display annotated probes with strand information
print(df.head())
print("\nCoverage summary by probe:")
print(coverage_summary)

  probe_chr  probe_start  probe_end probe_info probe_strand canonical_chr  \
0     chr17      7203470    7203590       DLG4            -         chr17   
1     chr17      7189800    7189920       DLG4            -         chr17   
2     chr17      7187649    7187769       DLG4            -         chr17   
3     chr17      7204275    7204395       DLG4            -         chr17   
4     chr17      7202755    7202875       DLG4            -         chr17   

   canonical_start  canonical_end canonical_info canonical_strand  overlap  
0          7187187        7217627           DLG4                -      120  
1          7187187        7217627           DLG4                -      120  
2          7187187        7217627           DLG4                -      120  
3          7187187        7217627           DLG4                -      120  
4          7187187        7217627           DLG4                -      120  

Coverage summary by probe:
probe_info
DLG4    1680
Name: overlap, dtype: i