In [1]:
import pybedtools
import pandas as pd

In [2]:
# Read the probe table from CSV into a DataFrame.
# The path is kept in its own variable so it is easy to swap for another probe set.
probe_input = "kmers_selected_probes.csv"
probe_df = pd.read_csv(filepath_or_buffer=probe_input)

In [4]:
# Map the probe-table columns onto BED-style column names. The dict order
# defines the output column order (chrom, start, end, name, strand).
# NOTE(review): standard BED places score in column 5 and strand in column 6;
# here strand lands in column 5, so strand-aware bedtools options (e.g. -s)
# would presumably not pick it up -- confirm before relying on strandedness.
# NOTE(review): BED starts are 0-based; whether `start` in the probe table is
# already 0-based cannot be told from this cell -- verify against the source.
bed_column_names = {
    "seqname": "chrom",
    "start": "chromstart",
    "end": "chromend",
    "gene_ID": "name",
    "strand": "strand",
}
probe_bed_df = probe_df[list(bed_column_names)].rename(columns=bed_column_names)

In [5]:
# Write the probe intervals as a tab-separated file with no header row and
# no index column, which is the layout bedtools expects for BED input.
probe_output = "probes_HTT.bed"
probe_bed_df.to_csv(
    probe_output,
    sep="\t",
    header=False,
    index=False,
)

In [6]:
import subprocess
import pandas as pd

# Intersect the probe intervals with the canonical transcript intervals using
# the `bedtools` command-line tool, then load and summarise the result.

# Input / output locations for the intersection step
probe_bed = "probes_HTT.bed"  # probe intervals (5-column, tab-separated)
canonical_bed = "canonical_transcripts_cleaned.bed"  # canonical transcript intervals
output_bed = "annotated_probes.bed"  # raw `bedtools intersect` output is stored here

# `-wao` writes every A (probe) record followed by the overlapping B record and
# the number of overlapping bases. Probes without any overlap are still
# reported, with a null B record ("." / -1) and an overlap of 0.
cmd = [
    "bedtools", "intersect",
    "-a", probe_bed,
    "-b", canonical_bed,
    "-wao",
]

# Run bedtools, streaming its stdout straight into the output file.
# check=True makes a non-zero exit status raise CalledProcessError here,
# instead of silently leaving an empty/partial file that only fails later
# (and confusingly) when pandas tries to parse it.
with open(output_bed, "w") as bed_out:
    subprocess.run(cmd, stdout=bed_out, check=True)

# Load the results for analysis (the file has no header row)
df = pd.read_csv(output_bed, sep="\t", header=None)

# Name the columns: 5 probe columns, 5 canonical columns, then the overlap
# length appended by `-wao`. Assigning the list raises if the column count
# in the file does not match, which guards against a changed input layout.
df.columns = [
    "probe_chr", "probe_start", "probe_end", "probe_info", "probe_strand",
    "canonical_chr", "canonical_start", "canonical_end", "canonical_info", "canonical_strand",
    "overlap",
]

# Quantify coverage: total overlapping bases per `probe_info` value.
# NOTE(review): `probe_info` is the BED name column, which holds the gene name
# (e.g. "HTT") rather than a unique probe identifier, so this sum aggregates
# over every probe sharing that name. Group on the probe coordinates instead
# if a genuinely per-probe figure is needed.
coverage_summary = df.groupby("probe_info")["overlap"].sum()

# Display annotated probes with strand information
print(df.head())
print("\nCoverage summary by probe:")
print(coverage_summary)

  probe_chr  probe_start  probe_end probe_info probe_strand canonical_chr  \
0      chr4      3223406    3223526        HTT            +          chr4   
1      chr4      3041999    3042119        HTT            +             .   
2      chr4      3127431    3127551        HTT            +          chr4   
3      chr4      3132578    3132698        HTT            +          chr4   
4      chr4      3131655    3131775        HTT            +          chr4   

   canonical_start  canonical_end canonical_info canonical_strand  overlap  
0          3074681        3243957            HTT                +      120  
1               -1             -1              .                .        0  
2          3074681        3243957            HTT                +      120  
3          3074681        3243957            HTT                +      120  
4          3074681        3243957            HTT                +      120  

Coverage summary by probe:
probe_info
HTT    1680
Name: overlap, dtype: in