In [1]:
import os
import pandas as pd
import numpy as np
import pybedtools 
from Bio import SeqIO
from io import StringIO
pybedtools.helpers.set_tempdir("/data/projects/temp")
from tqdm import tqdm

In [2]:
ICGC_bed_file = "/data/projects/DNABERT_snv/Manuscript_11_2023/bed/ICGC/ICGC_hg38_combined_final.bed"

In [3]:
# Define a function to read large file in chunks and show progress
def read_large_file(file_path, chunk_size=100000):
    # Determine the total size of the file
    total_size = os.path.getsize(file_path)
    print("Total Size :", total_size)
    # Iterator to read file in chunks
    iterator = pd.read_csv(file_path, chunksize=chunk_size, sep="\t", header=None, names=["Chromosome", "Start", "End", "Mutation_ID","T1","Reference_Allele", "Alternative_Allele","T2", "Info"])

    # Use tqdm to show progress, using file size as a proxy
    chunks = []
    # Initialize tqdm with the total number of chunks (estimated)
    with tqdm(total=os.path.getsize(file_path) // (chunk_size * 1024), desc="Reading file") as pbar:
        for chunk in iterator:
            # Process each chunk
            chunks.append(chunk)

            # Update the progress bar by one chunk
            pbar.update(1)

    print("Reading done....")
    # Concatenate chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)

    return df

In [4]:
df = read_large_file(ICGC_bed_file)

Total Size : 30209257697


Reading file: 818it [03:04,  4.43it/s]                                                                                  


Reading done....


In [5]:
df['Chromosome'] = 'chr' + df['Chromosome'].astype(str)
df

Unnamed: 0,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr1,10001,10002,MU43280717,.,A,T,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
1,chr1,10025,10026,MU75019506,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
2,chr1,10071,10072,MU40549878,.,C,CA,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
3,chr1,10073,10074,MU121369972,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
4,chr1,10079,10080,MU121498435,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
...,...,...,...,...,...,...,...,...,...
81714693,chrY,57206259,57206260,MU81205562,.,G,A,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714694,chrY,57208638,57208639,MU81205574,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714695,chrY,57209763,57209764,MU81205598,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714696,chrY,57211399,57211400,MU81205605,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...


In [8]:
df_acceptor = pd.read_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/bed/DNABERT_run_data/acceptor.bed", sep="\t", header=None)
df_acceptor[2] = df_acceptor[2].round().astype(int)
df_acceptor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
1,chr7,127588371,127588450,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
2,chr7,127589445,127589524,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
3,chr7,127590923,127591002,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
4,chr7,127591173,127591252,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175762,chr17,63706700,63706779,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175763,chr17,63707379,63707458,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175764,chr17,63710575,63710654,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175765,chr17,63710797,63710876,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,


In [9]:
icgc_bed = pybedtools.BedTool.from_dataframe(df.iloc[1:])

In [10]:
acceptor_bed = pybedtools.BedTool.from_dataframe(df_acceptor)

In [11]:
intersect_bed = acceptor_bed.intersect(icgc_bed, wa=True, wb=True)

In [12]:
intersect_df = intersect_bed.to_dataframe()
new_columns = ['chr_name', 'Acceptor_start', 'Acceptor_end', "Transcript_ID","Gene ID", "strand","TSS","ID","tag","value","GENE","Transcript_type","Tissue", "Chromosome", "Start",	"End", "Mutation_ID", "T1", "Reference_Allele", "Alternative_Allele", "T2","Info"]
intersect_df.columns = new_columns
intersect_df

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 22 fields; you can supply custom names with the `names` kwarg
  warn(


Unnamed: 0,chr_name,Acceptor_start,Acceptor_end,Transcript_ID,Gene ID,strand,TSS,ID,tag,value,...,Tissue,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127589078,127589079,MU82778890,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
1,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127589090,127589091,MU85932120,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
2,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127589096,127589097,MU84868715,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
3,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127589104,127589105,MU82319183,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
4,chr7,127588371,127588450,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588386,127588387,MU133310861,.,C,A,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6001599,chr17,63713488,63713567,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63713494,63713495,MU84976091,.,G,T,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...
6001600,chr17,63713488,63713567,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63713502,63713503,MU129479225,.,G,A,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...
6001601,chr17,63713488,63713567,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63713515,63713516,MU29299852,.,C,G,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...
6001602,chr17,63713488,63713567,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63713550,63713551,MU121174896,.,A,T,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...


In [13]:
intersect_df.to_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/Intersected_data/acceptor_ICGC_intersected.tsv", sep="\t", index=False)