In [1]:
import pandas as pd
import numpy as np
import pybedtools 
from Bio import SeqIO
from io import StringIO
# Point pybedtools at a project scratch directory for its temporary files.
# NOTE(review): hard-coded absolute path — presumably the directory must already
# exist on the machine running this notebook; confirm before running elsewhere.
pybedtools.helpers.set_tempdir("/data/projects/temp")

In [2]:
def read_bed(file_path, usecols, column_names, comment=None):
    """Load selected columns of a headerless, tab-separated BED-like file.

    Parameters
    ----------
    file_path : str or file-like
        Path (or open text buffer) of the tab-separated file to read.
    usecols : list of int
        Zero-based positions of the columns to keep.
    column_names : list of str
        Names assigned to the kept columns, in the same order as ``usecols``.
    comment : str or None, optional
        Single character marking the start of a comment. Lines that begin
        with it are skipped entirely, which is useful for files that
        interleave ``#``-prefixed annotation lines with the records.
        The default ``None`` disables comment handling, so every line is
        parsed as data.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, with columns named by ``column_names``.
    """
    return pd.read_csv(
        file_path,
        sep='\t',
        header=None,  # BED files carry no header row
        usecols=usecols,
        names=column_names,
        comment=comment,
    )

In [3]:
# Load the CAGE-seq peak tables: the original hg19 peaks and the two files produced
# by the hg19 -> hg38 conversion (mapped regions and regions that failed to map).
# Only the first four columns are kept from each file.
bed_usecols = [0, 1, 2, 3]
bed_colnames = ['chr', 'start', 'end', 'details']

# hg19 coordinates
input_bed = read_bed('/data/private/pdutta/DNABERT_2/CAGE_seq/CAGE_Seq_hg19.bed', bed_usecols, bed_colnames)
# hg38 coordinates for mapped regions
output1 = read_bed('/data/private/pdutta/DNABERT_2/CAGE_seq_hg38/CAGE_Seq_hg19_mapped.bed', bed_usecols, bed_colnames)
# hg19 coordinates for unmapped regions
output2 = read_bed('/data/private/pdutta/DNABERT_2/CAGE_seq_hg38/CAGE_Seq_hg19_unmapped.bed', bed_usecols, bed_colnames)

In [4]:
# Predicted promoter windows (tab-separated, with a header row) — one interval per row.
pred_promoter_path = "/data/private/pdutta/DNABERT_2/DNABERT2_Prom_Pred/Predicted_Prom_bed.tsv"
pred_promoter_df = pd.read_csv(pred_promoter_path, sep="\t")
pred_promoter_df

Unnamed: 0,Chromosome,Start,End
0,chr22,10510051,10510140
1,chr22,10510251,10510340
2,chr22,10510301,10510390
3,chr22,10510351,10510440
4,chr22,10510401,10510490
...,...,...,...
565012,chr22,50808151,50808240
565013,chr22,50808201,50808290
565014,chr22,50808251,50808340
565015,chr22,50808301,50808390


In [5]:
# Preview the hg19 CAGE-seq peaks as loaded (chr, start, end, details).
input_bed

Unnamed: 0,chr,start,end,details
0,chr1,564571,564600,"chr1:564571..564600,+"
1,chr1,564639,564649,"chr1:564639..564649,+"
2,chr1,565266,565278,"chr1:565266..565278,+"
3,chr1,565478,565483,"chr1:565478..565483,+"
4,chr1,565509,565541,"chr1:565509..565541,+"
...,...,...,...,...
201797,chrY,28817276,28817283,"chrY:28817276..28817283,-"
201798,chrY,58856051,58856076,"chrY:58856051..58856076,+"
201799,chrY,58884631,58884640,"chrY:58884631..58884640,+"
201800,chrY,58903885,58903887,"chrY:58903885..58903887,+"


In [6]:
# Build "chr:start-end" region labels: hg19 labels on the original peaks and
# hg38 labels on the lifted-over peaks. Both frames gain a new column in place.
hg19_span = input_bed['start'].astype(str) + '-' + input_bed['end'].astype(str)
input_bed['hg19_coordinates'] = input_bed['chr'].astype(str) + ':' + hg19_span

hg38_span = output1['start'].astype(str) + '-' + output1['end'].astype(str)
output1['hg38_coordinates'] = output1['chr'].astype(str) + ':' + hg38_span

In [7]:
# Preview the mapped peaks: chr/start/end are hg38, 'details' still carries the hg19 label.
output1

Unnamed: 0,chr,start,end,details,hg38_coordinates
0,chr1,629191,629220,"chr1:564571..564600,+",chr1:629191-629220
1,chr1,629259,629269,"chr1:564639..564649,+",chr1:629259-629269
2,chr1,629886,629898,"chr1:565266..565278,+",chr1:629886-629898
3,chr1,630098,630103,"chr1:565478..565483,+",chr1:630098-630103
4,chr1,630129,630161,"chr1:565509..565541,+",chr1:630129-630161
...,...,...,...,...,...
201824,chrY,26671129,26671136,"chrY:28817276..28817283,-",chrY:26671129-26671136
201825,chrY,56734794,56734819,"chrY:58856051..58856076,+",chrY:56734794-56734819
201826,chrY,56706230,56706239,"chrY:58884631..58884640,+",chrY:56706230-56706239
201827,chrY,56686983,56686985,"chrY:58903885..58903887,+",chrY:56686983-56686985


In [8]:
# Lookup table from the original peak label ('details') to its hg38 "chr:start-end" string.
# A dict keeps only the last value seen for a repeated key, so if a 'details' label
# appears on several rows of output1, only its final hg38 region is retained.
# NOTE(review): the previews show output1 with more rows than input_bed, so repeated
# labels look likely — confirm that keeping the last mapping is intended.
mapped_dict = {
    peak_label: hg38_region
    for peak_label, hg38_region in zip(output1['details'], output1['hg38_coordinates'])
}

In [9]:
# Spot-check the lookup with the label of the first peak in input_bed.
mapped_dict['chr1:564571..564600,+']

'chr1:629191-629220'

In [10]:
# Attach an hg38 region to every hg19 peak by looking up its 'details' label;
# peaks without an entry in the lookup receive the sentinel string 'Unmapped'.
hg38_lookup = input_bed['details'].map(mapped_dict)
input_bed['hg38_coordinates'] = hg38_lookup.fillna('Unmapped')

In [11]:
# Preview the peaks with both the hg19 and hg38 coordinate labels attached.
input_bed

Unnamed: 0,chr,start,end,details,hg19_coordinates,hg38_coordinates
0,chr1,564571,564600,"chr1:564571..564600,+",chr1:564571-564600,chr1:629191-629220
1,chr1,564639,564649,"chr1:564639..564649,+",chr1:564639-564649,chr1:629259-629269
2,chr1,565266,565278,"chr1:565266..565278,+",chr1:565266-565278,chr1:629886-629898
3,chr1,565478,565483,"chr1:565478..565483,+",chr1:565478-565483,chr1:630098-630103
4,chr1,565509,565541,"chr1:565509..565541,+",chr1:565509-565541,chr1:630129-630161
...,...,...,...,...,...,...
201797,chrY,28817276,28817283,"chrY:28817276..28817283,-",chrY:28817276-28817283,chrY:26671129-26671136
201798,chrY,58856051,58856076,"chrY:58856051..58856076,+",chrY:58856051-58856076,chrY:56734794-56734819
201799,chrY,58884631,58884640,"chrY:58884631..58884640,+",chrY:58884631-58884640,chrY:56706230-56706239
201800,chrY,58903885,58903887,"chrY:58903885..58903887,+",chrY:58903885-58903887,chrY:56686983-56686985


In [12]:
# Keep just the two coordinate-label columns (hg19 and its hg38 counterpart).
coordinate_columns = ['hg19_coordinates', 'hg38_coordinates']
final_df = input_bed[coordinate_columns]

In [13]:
# Preview the hg19 -> hg38 coordinate table.
final_df

Unnamed: 0,hg19_coordinates,hg38_coordinates
0,chr1:564571-564600,chr1:629191-629220
1,chr1:564639-564649,chr1:629259-629269
2,chr1:565266-565278,chr1:629886-629898
3,chr1:565478-565483,chr1:630098-630103
4,chr1:565509-565541,chr1:630129-630161
...,...,...
201797,chrY:28817276-28817283,chrY:26671129-26671136
201798,chrY:58856051-58856076,chrY:56734794-56734819
201799,chrY:58884631-58884640,chrY:56706230-56706239
201800,chrY:58903885-58903887,chrY:56686983-56686985


In [14]:
# List the peaks that have no hg38 counterpart (marked with the 'Unmapped' sentinel).
unmapped_mask = final_df['hg38_coordinates'] == "Unmapped"
final_df[unmapped_mask]

Unnamed: 0,hg19_coordinates,hg38_coordinates
60646,chr14:106539557-106539581,Unmapped
60647,chr14:106552655-106552665,Unmapped
60648,chr14:106552755-106552765,Unmapped
60649,chr14:106552901-106552907,Unmapped
60650,chr14:106556293-106556339,Unmapped
94187,chr19:40407511-40407513,Unmapped
114703,chr2:242744542-242744583,Unmapped
174783,chr7:100550674-100550679,Unmapped
174784,chr7:100550681-100550704,Unmapped
174785,chr7:100550748-100550757,Unmapped


In [15]:
# Retain only the peaks that received an hg38 region.
is_mapped = final_df['hg38_coordinates'] != "Unmapped"
cage_df_38 = final_df[is_mapped]
cage_df_38

Unnamed: 0,hg19_coordinates,hg38_coordinates
0,chr1:564571-564600,chr1:629191-629220
1,chr1:564639-564649,chr1:629259-629269
2,chr1:565266-565278,chr1:629886-629898
3,chr1:565478-565483,chr1:630098-630103
4,chr1:565509-565541,chr1:630129-630161
...,...,...
201797,chrY:28817276-28817283,chrY:26671129-26671136
201798,chrY:58856051-58856076,chrY:56734794-56734819
201799,chrY:58884631-58884640,chrY:56706230-56706239
201800,chrY:58903885-58903887,chrY:56686983-56686985


In [16]:
# Turn each hg38 "chr:start-end" label back into BED-style chr/start/end columns.
# The label is split from the right on its last ':' so that everything before it is
# treated as the chromosome name, then the remainder is split once on '-'.
chrom_and_span = cage_df_38['hg38_coordinates'].str.rsplit(':', n=1, expand=True)
span_bounds = chrom_and_span[1].str.split('-', n=1, expand=True)

# Build the frame in one step (no in-place drop/rename) and store the coordinates as
# integers: str.split yields strings, and string coordinates would compare and sort
# lexicographically rather than numerically. The original row index is preserved.
cage_df_38_bed = pd.DataFrame(
    {
        'chr': chrom_and_span[0],
        'start': span_bounds[0].astype(int),
        'end': span_bounds[1].astype(int),
    }
)

In [17]:
# Preview the hg38 CAGE peaks in chr/start/end form.
cage_df_38_bed

Unnamed: 0,chr,start,end
0,chr1,629191,629220
1,chr1,629259,629269
2,chr1,629886,629898
3,chr1,630098,630103
4,chr1,630129,630161
...,...,...,...
201797,chrY,26671129,26671136
201798,chrY,56734794,56734819
201799,chrY,56706230,56706239
201800,chrY,56686983,56686985


In [20]:
# Inspect the hg38 CAGE peaks located on chr22.
on_chr22 = cage_df_38_bed['chr'] == "chr22"
cage_df_38_bed[on_chr22]

Unnamed: 0,chr,start,end
122817,chr22,16595545,16595548
122818,chr22,16601345,16601385
122819,chr22,16601462,16601492
122820,chr22,16601842,16601885
122821,chr22,16601920,16601935
...,...,...,...
126687,chr22,50738149,50738156
126688,chr22,50738198,50738235
126689,chr22,50783614,50783641
126690,chr22,50783642,50783664


In [18]:
# Persist the hg38 CAGE peaks as a tab-separated file without the row index.
cage_hg38_bed_path = "/data/private/pdutta/DNABERT_2/CAGE_seq_hg38/CAGE_seq_hg38_bed.tsv"
cage_df_38_bed.to_csv(cage_hg38_bed_path, sep="\t", index=False)

In [53]:
# Convert both interval tables to BedTool objects so they can be intersected.
# Every row is passed through: BedTool.from_dataframe writes the frame without a
# header line by default, so there is no header row to skip — slicing off the first
# row would silently discard the first interval of each table.
promoter_bed = pybedtools.BedTool.from_dataframe(pred_promoter_df)
cage_38_bed = pybedtools.BedTool.from_dataframe(cage_df_38_bed)

In [56]:
# Overlap the predicted promoter windows with the hg38 CAGE peaks.
# With wa=True and wb=True each result row carries the promoter interval followed by
# the CAGE interval; a promoter window overlapping several peaks yields one row per peak.
intersect_bed = promoter_bed.intersect(cage_38_bed, wa=True, wb=True)

In [58]:
# Name the six intersect columns: the promoter window comes first, the overlapping
# CAGE peak second.
promoter_fields = ['chr_name', 'CoreProm_start', 'CoreProm_end']
cage_fields = ['chrom', 'CAGE_start', 'CAGE_end']
new_columns = promoter_fields + cage_fields

intersect_df = intersect_bed.to_dataframe()
intersect_df.columns = new_columns
intersect_df

Unnamed: 0,chr_name,CoreProm_start,CoreProm_end,chrom,CAGE_start,CAGE_end
0,chr22,16601301,16601390,chr22,16601345,16601385
1,chr22,16601351,16601440,chr22,16601345,16601385
2,chr22,16601401,16601490,chr22,16601462,16601492
3,chr22,16601451,16601540,chr22,16601462,16601492
4,chr22,16601801,16601890,chr22,16601842,16601885
...,...,...,...,...,...,...
5865,chr22,50783601,50783690,chr22,50783614,50783641
5866,chr22,50783601,50783690,chr22,50783642,50783664
5867,chr22,50783701,50783790,chr22,50783779,50783835
5868,chr22,50783751,50783840,chr22,50783779,50783835


In [59]:
# Save the promoter/CAGE overlap table as a tab-separated file without the row index.
intersect_out_path = "/data/private/pdutta/DNABERT_2/intersected_chr22_CAGE_CoreProm.tsv"
intersect_df.to_csv(intersect_out_path, sep="\t", index=False)

In [60]:
# Distinct promoter windows that overlap at least one CAGE peak
# (a window matching several peaks is listed once).
core_prom_columns = ['chr_name', 'CoreProm_start', 'CoreProm_end']
intersect_df[core_prom_columns].drop_duplicates()

Unnamed: 0,chr_name,CoreProm_start,CoreProm_end
0,chr22,16601301,16601390
1,chr22,16601351,16601440
2,chr22,16601401,16601490
3,chr22,16601451,16601540
4,chr22,16601801,16601890
...,...,...,...
5864,chr22,50783551,50783640
5865,chr22,50783601,50783690
5867,chr22,50783701,50783790
5868,chr22,50783751,50783840
