In [1]:
import os
import pandas as pd
import numpy as np
import pybedtools 
from Bio import SeqIO
from io import StringIO
pybedtools.helpers.set_tempdir("/data/projects/temp")
from tqdm import tqdm

In [2]:
# Path of the ICGC BED file that is loaded with read_large_file in a later cell.
ICGC_bed_file = "/data/projects/DNABERT_snv/Manuscript_11_2023/bed/ICGC/ICGC.bed"

In [3]:
def read_large_file(file_path, chunk_size=100000):
    """Read a headerless, tab-separated, 9-column file chunk by chunk.

    The file is parsed in chunks of ``chunk_size`` rows while a tqdm bar
    reports how many rows have been read; the chunks are then concatenated
    into a single DataFrame.

    Args:
        file_path: Path of the file to read.
        chunk_size: Number of rows parsed per chunk (pandas ``chunksize``).

    Returns:
        pandas.DataFrame with the columns Chromosome, Start, End,
        Mutation_ID, T1, Reference_Allele, Alternative_Allele, T2 and Info,
        indexed 0..n-1.
    """
    column_names = [
        "Chromosome", "Start", "End", "Mutation_ID", "T1",
        "Reference_Allele", "Alternative_Allele", "T2", "Info",
    ]

    # The byte size is reported for information only. It cannot be turned
    # into a chunk count, because chunk_size is a number of rows, not bytes.
    total_size = os.path.getsize(file_path)
    print("Total Size :", total_size)

    # Iterator yielding DataFrames of at most chunk_size rows each.
    iterator = pd.read_csv(
        file_path,
        chunksize=chunk_size,
        sep="\t",
        header=None,
        names=column_names,
    )

    chunks = []
    # The number of rows is unknown up front, so the bar is left open-ended
    # (no total) and advanced by the rows actually read. This avoids the bar
    # overrunning a made-up total.
    with tqdm(desc="Reading file", unit="rows", unit_scale=True) as pbar:
        for chunk in iterator:
            chunks.append(chunk)
            pbar.update(len(chunk))

    print("Reading done....")

    # pd.concat raises ValueError on an empty list, so return an empty frame
    # with the expected columns if no chunk was yielded.
    if not chunks:
        return pd.DataFrame(columns=column_names)

    # Concatenate chunks into a single DataFrame with a fresh index.
    return pd.concat(chunks, ignore_index=True)

In [4]:
# Load the whole ICGC BED file into memory as a single DataFrame.
df = read_large_file(ICGC_bed_file)

Total Size : 30241674872


Reading file: 818it [03:13,  4.23it/s]                                                                                  


Reading done....


In [5]:
# Prefix chromosome names with 'chr' (the donor table uses names such as 'chr7').
# Any existing 'chr' prefix is stripped first so that re-running this cell
# does not produce names like 'chrchr1'.
chrom_no_prefix = df['Chromosome'].astype(str).str.replace(r'^chr', '', regex=True)
df['Chromosome'] = 'chr' + chrom_no_prefix
df

Unnamed: 0,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr1,10001,10002,MU43280717,.,A,T,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
1,chr1,10025,10026,MU75019506,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
2,chr1,10071,10072,MU40549878,.,C,CA,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
3,chr1,10073,10074,MU121369972,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
4,chr1,10079,10080,MU121498435,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
...,...,...,...,...,...,...,...,...,...
81782583,chrY,59352410,59352411,MU81205562,.,G,A,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81782584,chrY,59354789,59354790,MU81205574,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81782585,chrY,59355914,59355915,MU81205598,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81782586,chrY,59357550,59357551,MU81205605,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...


In [6]:
# Load the headerless, tab-separated donor BED table; columns are numbered 0..N.
donor_bed_path = "/data/projects/DNABERT_snv/Manuscript_11_2023/bed/DNABERT_run_data/donor.bed"
df_donor = pd.read_csv(
    donor_bed_path,
    sep="\t",
    header=None,
)

# Column 2 is rounded and cast to int so it is written out as a whole number.
rounded_col2 = df_donor[2].round()
df_donor[2] = rounded_col2.astype(int)

df_donor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,chr7,127589124,127589203,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
1,chr7,127588526,127588605,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
2,chr7,127589555,127589634,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
3,chr7,127591049,127591128,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
4,chr7,127591661,127591740,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175762,chr17,63706595,63706674,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175763,chr17,63707207,63707286,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175764,chr17,63710451,63710530,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175765,chr17,63710688,63710767,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,


In [None]:
# Convert every ICGC row to a BedTool. The file is read with header=None, so
# row 0 is a real record (not a header line) and must not be sliced away.
icgc_bed = pybedtools.BedTool.from_dataframe(df)

In [None]:
# Wrap the donor DataFrame as a BedTool so it can be intersected below.
donor_bed = pybedtools.BedTool.from_dataframe(df_donor)

In [None]:
# Intersect donor intervals (A) with ICGC intervals (B). wa=True and wb=True
# ask bedtools to write both the A record and the B record for each overlap.
intersect_bed = donor_bed.intersect(icgc_bed, wa=True, wb=True)

In [None]:
# Column names for the intersected records: the 13 donor columns followed by
# the 9 ICGC columns.
new_columns = [
    # donor (A) fields
    'chr_name', 'Donor_start', 'Donor_end', "ENSEMBL_Transcript_ID",
    "ENSEMBL_Gene ID", "strand", "TSS", "Transcript_length",
    "Transcript_type", "Gene_CG_content", "GENE_symbol", "Category", "Tissue",
    # ICGC (B) fields
    "Chromosome", "Start", "End", "Mutation_ID", "T1",
    "Reference_Allele", "Alternative_Allele", "T2", "Info",
]

# Pass the names directly to to_dataframe (they are forwarded to
# pandas.read_csv). Without explicit names, a result with more fields than the
# default BED column names falls back to pandas header inference, which would
# turn the first intersected record into the header and lose it.
intersect_df = intersect_bed.to_dataframe(names=new_columns)
intersect_df

In [None]:
# Write the intersected table as a tab-separated file, without the index column.
intersect_df.to_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/Intersected_data/donor_ICGC_intersected.tsv", sep="\t", index=False)