In [1]:
import pandas as pd

In [6]:
# Load the canonical transcripts file 
input_file = "2024-08-12_grch38_105_gencode-canonical-protein-coding-transcripts.tsv"
canonical_df = pd.read_csv(input_file, sep="\t")

In [7]:
# Rearrange and rename columns to match BED format
bed_df = canonical_df[["chrom", "start", "end", "gene_name", "strand"]]
bed_df.columns = ["chrom", "chromstart", "chromend", "name", "strand"]

In [9]:
# Replace strand values
bed_df.loc[:, "strand"] = bed_df["strand"].replace({1: "+", -1: "-"})

In [10]:
# Save as a BED file
output_file = "canonical_transcripts.bed"
bed_df.to_csv(output_file, sep="\t", index=False, header=False)

In [2]:
import pybedtools

In [5]:
# Load the probe file
probe_input = "kmers_selected_probes.csv"
probe_df = pd.read_csv(probe_input)

In [6]:
# Rearrange and rename columns to match BED format
probe_bed_df = probe_df[["seqname", "start", "end", "gene_ID", "strand"]]
probe_bed_df.columns = ["chrom", "chromstart", "chromend", "name", "strand"]

In [7]:
probe_output = "probes_NUMB.bed"
probe_bed_df.to_csv(probe_output, sep="\t", index=False, header=False)

fix

In [4]:
canonical_bed = "canonical_transcripts.bed"
fix_bed_df = pd.read_csv(canonical_bed, sep="\t", header=None, on_bad_lines='skip')

In [6]:
# Find rows that do not have exactly 5 columns
expected_columns = 5
malformed_rows = fix_bed_df[fix_bed_df.isnull().any(axis=1) | (fix_bed_df.shape[1] != expected_columns)]

# Display malformed rows
print("Malformed rows:")
print(malformed_rows)

Malformed rows:
       0            1            2             3  4
16     1          NaN          NaN           NaN  -
59     1          NaN          NaN  MICOS10-NBL1  +
156    1   38864529.0   38881602.0           NaN  -
163    1          NaN          NaN           NaN  +
177    1          NaN          NaN           NaN  -
...   ..          ...          ...           ... ..
19603  X   49071470.0   49079887.0           NaN  -
19630  X   23783278.0   23783367.0           NaN  +
19638  X  102712495.0  102753530.0           NaN  +
19649  X  119072960.0  119076373.0           NaN  +
19820  X  140091678.0  140092911.0           NaN  +

[641 rows x 5 columns]


In [8]:
# Keep only rows with exactly 5 columns
fix_bed_df = fix_bed_df[fix_bed_df.apply(lambda row: len(row.dropna()), axis=1) == 5]


In [9]:
fix_bed_df.to_csv("canonical_transcripts_fixed.bed", sep="\t", index=False, header=False)

fix again

In [13]:

# Load the canonical transcript BED file
canonical_bed = "canonical_transcripts_fixed.bed"
bed_df = pd.read_csv(canonical_bed, sep="\t", header=None)

# Update the chromosome column (column 0) to prepend "chr"
bed_df[0] = bed_df[0].apply(lambda x: f"chr{x}" if not str(x).startswith("chr") else x)

# Save the updated file
output_file = "canonical_transcripts_chr_fixed.bed"
bed_df.to_csv(output_file, sep="\t", index=False, header=False)


fix more

In [25]:


# Load and inspect probes file
#probes_df = pd.read_csv("probes_NUMB.bed", sep="\t", header=None)
#print(probes_df.info())

# Load and inspect canonical transcripts file
canonical_df = pd.read_csv("canonical_transcripts_chr_fixed.bed", sep="\t", header=None)
print(canonical_df.info())

# Convert coordinates to integers
canonical_df[1] = canonical_df[1].astype(int)
canonical_df[2] = canonical_df[2].astype(int)

canonical_df.to_csv("canonical_transcripts_cleaned.bed", sep="\t", index=False, header=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19311 entries, 0 to 19310
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       19311 non-null  object 
 1   1       19311 non-null  float64
 2   2       19311 non-null  float64
 3   3       19311 non-null  object 
 4   4       19311 non-null  object 
dtypes: float64(2), object(3)
memory usage: 754.5+ KB
None


In [26]:
canonical_df = pd.read_csv("canonical_transcripts_cleaned.bed", sep="\t", header=None)
print(canonical_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19311 entries, 0 to 19310
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       19311 non-null  object
 1   1       19311 non-null  int64 
 2   2       19311 non-null  int64 
 3   3       19311 non-null  object
 4   4       19311 non-null  object
dtypes: int64(2), object(3)
memory usage: 754.5+ KB
None


intersect

In [27]:
# Define paths to the BED files
probe_bed = "probes_NUMB.bed"
canonical_bed = "canonical_transcripts_cleaned.bed"

In [28]:
# Create BedTool objects
probes = pybedtools.BedTool(probe_bed)
canonical = pybedtools.BedTool(canonical_bed)


In [30]:
# Perform the intersection
# The '-wao' option will keep all intervals and report overlaps
result = probes.intersect(canonical, wao=True)

In [31]:
output_file = "annotated_probes_with_overlap.bed"
result.saveas(output_file)

<BedTool(annotated_probes_with_overlap.bed)>

trying another way...

In [32]:
import subprocess
import pandas as pd

# Paths to your BED files
probe_bed = "probes_NUMB.bed"  # Your probes file
canonical_bed = "canonical_transcripts_cleaned.bed"  # File containing canonical transcript intervals
output_bed = "annotated_probes.bed"  # Output file to store annotated results

# BEDTools intersect command
cmd = [
    "bedtools", "intersect",
    "-a", probe_bed,
    "-b", canonical_bed,
    "-wao"
]

# Run BEDTools
with open(output_bed, "w") as output:
    subprocess.run(cmd, stdout=output)

# Load the results for analysis
df = pd.read_csv(output_bed, sep="\t", header=None)

# Assign column names including the strand
df.columns = [
    "probe_chr", "probe_start", "probe_end", "probe_info", "probe_strand",
    "canonical_chr", "canonical_start", "canonical_end", "canonical_info", "canonical_strand",
    "overlap"
]

# Quantify coverage
coverage_summary = df.groupby("probe_info")["overlap"].sum()

# Display annotated probes with strand information
print(df.head())
print("\nCoverage summary by probe:")
print(coverage_summary)


  probe_chr  probe_start  probe_end probe_info probe_strand canonical_chr  \
0     chr14     73462429   73462549       NUMB            -             .   
1     chr14     73284190   73284310       NUMB            -         chr14   
2     chr14     73292734   73292854       NUMB            -         chr14   
3     chr14     73355634   73355754       NUMB            -         chr14   
4     chr14     73409941   73410061       NUMB            -         chr14   

   canonical_start  canonical_end canonical_info canonical_strand  overlap  
0               -1             -1              .                .        0  
1         73275216       73458546           NUMB                -      120  
2         73275216       73458546           NUMB                -      120  
3         73275216       73458546           NUMB                -      120  
4         73275216       73458546           NUMB                -      120  

Coverage summary by probe:
probe_info
NUMB    1974
Name: overlap, dtype: i