In [1]:
import os
import pandas as pd
import numpy as np
import pybedtools 
from Bio import SeqIO
from io import StringIO
pybedtools.helpers.set_tempdir("/data/projects/temp")
from tqdm import tqdm

In [2]:
ICGC_bed_file = "/data/projects/DNABERT_snv/Manuscript_11_2023/bed/ICGC/ICGC_hg38_combined_final.bed"

In [3]:
# Define a function to read large file in chunks and show progress
def read_large_file(file_path, chunk_size=100000):
    # Determine the total size of the file
    total_size = os.path.getsize(file_path)
    print("Total Size :", total_size)
    # Iterator to read file in chunks
    iterator = pd.read_csv(file_path, chunksize=chunk_size, sep="\t", header=None, names=["Chromosome", "Start", "End", "Mutation_ID","T1","Reference_Allele", "Alternative_Allele","T2", "Info"])

    # Use tqdm to show progress, using file size as a proxy
    chunks = []
    # Initialize tqdm with the total number of chunks (estimated)
    with tqdm(total=os.path.getsize(file_path) // (chunk_size * 1024), desc="Reading file") as pbar:
        for chunk in iterator:
            # Process each chunk
            chunks.append(chunk)

            # Update the progress bar by one chunk
            pbar.update(1)

    print("Reading done....")
    # Concatenate chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)

    return df

In [4]:
df = read_large_file(ICGC_bed_file)

Total Size : 30209257697


Reading file: 818it [03:01,  4.50it/s]                                                                                  


Reading done....


In [5]:
df['Chromosome'] = 'chr' + df['Chromosome'].astype(str)
df

Unnamed: 0,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr1,10001,10002,MU43280717,.,A,T,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
1,chr1,10025,10026,MU75019506,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
2,chr1,10071,10072,MU40549878,.,C,CA,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
3,chr1,10073,10074,MU121369972,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
4,chr1,10079,10080,MU121498435,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
...,...,...,...,...,...,...,...,...,...
81714693,chrY,57206259,57206260,MU81205562,.,G,A,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714694,chrY,57208638,57208639,MU81205574,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714695,chrY,57209763,57209764,MU81205598,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714696,chrY,57211399,57211400,MU81205605,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...


In [6]:
df_core_prom = pd.read_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/bed/DNABERT_run_data/core_prom.bed", sep="\t", header=None)
df_core_prom[2] = df_core_prom[2].round().astype(int)
df_core_prom

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,chr7,127588366,127588455,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
1,chr12,8949601,8949690,ENST00000000412,ENSG00000003056,-,8949645,2450,protein_coding,42.37,M6PR,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
2,chr11,64305479,64305568,ENST00000000442,ENSG00000173153,+,64305524,2274,protein_coding,57.13,ESRRA,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
3,chr12,2794925,2795014,ENST00000001008,ENSG00000004478,+,2794970,3715,protein_coding,51.53,FKBP4,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
4,chr2,72147818,72147907,ENST00000001146,ENSG00000003137,-,72147862,4556,protein_coding,59.85,CYP26B1,NullTrans,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
194768,chr20,45416096,45416185,ENST00000640986,ENSG00000124155,+,45416141,2187,nonsense_mediated_decay,45.91,PIGT,NullTrans,
194769,chr6,160510972,160511061,ENST00000640989,ENSG00000290613,-,160511016,807,lncRNA,40.95,LPAL2,NullTrans,
194770,chr2,85888732,85888821,ENST00000640992,ENSG00000115525,-,85888776,2292,protein_coding,45.07,ST3GAL5,NullTrans,
194771,chr20,45416088,45416177,ENST00000640996,ENSG00000124155,+,45416133,2219,nonsense_mediated_decay,45.91,PIGT,NullTrans,


In [7]:
icgc_bed = pybedtools.BedTool.from_dataframe(df.iloc[1:])

In [8]:
core_prom_bed = pybedtools.BedTool.from_dataframe(df_core_prom)

In [9]:
intersect_bed = core_prom_bed.intersect(icgc_bed, wa=True, wb=True)

In [10]:
intersect_df = intersect_bed.to_dataframe()
new_columns = ['chr_name', 'Core_prom_start', 'Core_prom_end', "ENSEMBL_Transcript_ID","ENSEMBL_Gene ID", "strand","TSS","Transcript_length","Transcript_type","Gene_CG_content","GENE_symbol","Category","Tissue", "Chromosome", "Start",	"End", "Mutation_ID", "T1", "Reference_Allele", "Alternative_Allele", "T2","Info"]
intersect_df.columns = new_columns
intersect_df

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 22 fields; you can supply custom names with the `names` kwarg
  warn(


Unnamed: 0,chr_name,Core_prom_start,Core_prom_end,ENSEMBL_Transcript_ID,ENSEMBL_Gene ID,strand,TSS,Transcript_length,Transcript_type,Gene_CG_content,...,Tissue,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr7,127588366,127588455,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588387,127588388,MU133392649,.,G,A,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
1,chr7,127588366,127588455,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588429,127588430,MU39478069,.,T,C,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
2,chr12,8949601,8949690,ENST00000000412,ENSG00000003056,-,8949645,2450,protein_coding,42.37,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr12,8949659,8949660,MU49180650,.,G,A,.,CONSEQUENCE=M6PR|ENSG00000003056|1|M6PR-001|EN...
3,chr12,8949601,8949690,ENST00000000412,ENSG00000003056,-,8949645,2450,protein_coding,42.37,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr12,8949660,8949661,MU37117518,.,G,A,.,CONSEQUENCE=M6PR|ENSG00000003056|1|M6PR-001|EN...
4,chr11,64305479,64305568,ENST00000000442,ENSG00000173153,+,64305524,2274,protein_coding,57.13,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr11,64305485,64305486,MU46907284,.,C,T,.,"CONSEQUENCE=||||||intergenic_region||,ESRRA|EN..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746359,chr20,45416088,45416177,ENST00000640996,ENSG00000124155,+,45416133,2219,nonsense_mediated_decay,45.91,...,.,chr20,45416128,45416129,MU66504628,.,C,G,.,CONSEQUENCE=PIGT|ENSG00000124155|+|PIGT-004|EN...
746360,chr20,45416088,45416177,ENST00000640996,ENSG00000124155,+,45416133,2219,nonsense_mediated_decay,45.91,...,.,chr20,45416129,45416130,MU128987750,.,G,A,.,CONSEQUENCE=PIGT|ENSG00000124155|+|PIGT-004|EN...
746361,chr20,45416088,45416177,ENST00000640996,ENSG00000124155,+,45416133,2219,nonsense_mediated_decay,45.91,...,.,chr20,45416141,45416142,MU120160683,.,G,A,.,CONSEQUENCE=PIGT|ENSG00000124155|+|PIGT-004|EN...
746362,chr20,45416088,45416177,ENST00000640996,ENSG00000124155,+,45416133,2219,nonsense_mediated_decay,45.91,...,.,chr20,45416144,45416145,MU93024897,.,G,T,.,CONSEQUENCE=PIGT|ENSG00000124155|+|PIGT-004|EN...


In [11]:
intersect_df.to_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/Intersected_data/core_prom_ICGC_intersected.tsv", sep="\t", index=False)