In [1]:
import os
import pandas as pd
import numpy as np
import pybedtools 
from Bio import SeqIO
from io import StringIO
pybedtools.helpers.set_tempdir("/data/projects/temp")
from tqdm import tqdm

In [2]:
ICGC_bed_file = "/data/projects/DNABERT_snv/Manuscript_11_2023/bed/ICGC/ICGC_hg38_combined_final.bed"

In [3]:
# Define a function to read large file in chunks and show progress
def read_large_file(file_path, chunk_size=100000):
    # Determine the total size of the file
    total_size = os.path.getsize(file_path)
    print("Total Size :", total_size)
    # Iterator to read file in chunks
    iterator = pd.read_csv(file_path, chunksize=chunk_size, sep="\t", header=None, names=["Chromosome", "Start", "End", "Mutation_ID","T1","Reference_Allele", "Alternative_Allele","T2", "Info"])

    # Use tqdm to show progress, using file size as a proxy
    chunks = []
    # Initialize tqdm with the total number of chunks (estimated)
    with tqdm(total=os.path.getsize(file_path) // (chunk_size * 1024), desc="Reading file") as pbar:
        for chunk in iterator:
            # Process each chunk
            chunks.append(chunk)

            # Update the progress bar by one chunk
            pbar.update(1)

    print("Reading done....")
    # Concatenate chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)

    return df

In [4]:
df = read_large_file(ICGC_bed_file)

Total Size : 30209257697


Reading file: 818it [03:01,  4.52it/s]                                                                                  


Reading done....


In [5]:
df['Chromosome'] = 'chr' + df['Chromosome'].astype(str)
df

Unnamed: 0,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr1,10001,10002,MU43280717,.,A,T,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
1,chr1,10025,10026,MU75019506,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
2,chr1,10071,10072,MU40549878,.,C,CA,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
3,chr1,10073,10074,MU121369972,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
4,chr1,10079,10080,MU121498435,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
...,...,...,...,...,...,...,...,...,...
81714693,chrY,57206259,57206260,MU81205562,.,G,A,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714694,chrY,57208638,57208639,MU81205574,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714695,chrY,57209763,57209764,MU81205598,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81714696,chrY,57211399,57211400,MU81205605,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...


In [7]:
df_donor = pd.read_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/bed/DNABERT_run_data/donor.bed", sep="\t", header=None)
df_donor[2] = df_donor[2].round().astype(int)
df_donor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,chr7,127589124,127589203,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
1,chr7,127588526,127588605,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
2,chr7,127589555,127589634,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
3,chr7,127591049,127591128,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
4,chr7,127591661,127591740,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175762,chr17,63706595,63706674,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175763,chr17,63707207,63707286,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175764,chr17,63710451,63710530,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175765,chr17,63710688,63710767,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,


In [8]:
icgc_bed = pybedtools.BedTool.from_dataframe(df.iloc[1:])

In [9]:
donor_bed = pybedtools.BedTool.from_dataframe(df_donor)

In [10]:
intersect_bed = donor_bed.intersect(icgc_bed, wa=True, wb=True)

In [11]:
intersect_df = intersect_bed.to_dataframe()
new_columns = ['chr_name', 'Donor_start', 'Donor_end', "ENSEMBL_Transcript_ID","ENSEMBL_Gene ID", "strand","TSS","Transcript_length","Transcript_type","Gene_CG_content","GENE_symbol","Category","Tissue", "Chromosome", "Start",	"End", "Mutation_ID", "T1", "Reference_Allele", "Alternative_Allele", "T2","Info"]
intersect_df.columns = new_columns
intersect_df

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 22 fields; you can supply custom names with the `names` kwarg
  warn(


Unnamed: 0,chr_name,Donor_start,Donor_end,ENSEMBL_Transcript_ID,ENSEMBL_Gene ID,strand,TSS,Transcript_length,Transcript_type,Gene_CG_content,...,Tissue,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr7,127588526,127588605,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588529,127588530,MU80692579,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
1,chr7,127588526,127588605,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588553,127588554,MU90753368,.,G,A,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
2,chr7,127588526,127588605,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588569,127588570,MU115107606,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
3,chr7,127588526,127588605,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588581,127588582,MU84868698,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
4,chr7,127588526,127588605,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588591,127588592,MU83704948,.,G,T,.,CONSEQUENCE=ARF5|ENSG00000004059|+|ARF5-001|EN...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5792687,chr17,63710688,63710767,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63710752,63710753,MU28664773,.,C,A,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...
5792688,chr17,63710688,63710767,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63710760,63710761,MU29299841,.,C,T,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...
5792689,chr17,63710688,63710767,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63710765,63710766,MU129502912,.,T,C,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...
5792690,chr17,63713366,63713445,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63713396,63713397,MU84546198,.,G,T,.,CONSEQUENCE=STRADA|ENSG00000266173|1|STRADA-00...


In [19]:
intersect_df[intersect_df['Reference_Allele'].str.len()==3]

Unnamed: 0,chr_name,Donor_start,Donor_end,ENSEMBL_Transcript_ID,ENSEMBL_Gene ID,strand,TSS,Transcript_length,Transcript_type,Gene_CG_content,...,Tissue,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
169,chr2,37242654,37242733,ENST00000002125,ENSG00000003509,+,37231658,2184,protein_coding,38.97,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr2,37242704,37242707,MU113949942,.,GGA,G,.,CONSEQUENCE=NDUFAF7|ENSG00000003509|+|NDUFAF7-...
282,chr3,50159695,50159774,ENST00000002829,ENSG00000001617,+,50155324,3607,protein_coding,57.74,...,"Adipose_Tissue, Bladder, Blood_Vessel, Breast,...",chr3,50159773,50159776,MU40919316,.,CTT,C,.,CONSEQUENCE=SEMA3F|ENSG00000001617|+|SEMA3F-00...
476,chr7,117480108,117480187,ENST00000003084,ENSG00000001626,+,117480025,6070,protein_coding,36.90,...,.,chr7,117480177,117480180,MU128922383,.,GAC,G,.,CONSEQUENCE=CFTR|ENSG00000001626|+|CFTR-001|EN...
891,chr8,17554660,17554739,ENST00000004531,ENSG00000003989,+,17538777,7560,protein_coding,40.13,...,.,chr8,17554700,17554703,MU4308423,.,TGA,T,.,CONSEQUENCE=SLC7A2|ENSG00000003989|+|SLC7A2-20...
899,chr8,17548804,17548883,ENST00000004531,ENSG00000003989,+,17538777,7560,protein_coding,40.13,...,.,chr8,17548857,17548860,MU112574363,.,GTT,G,.,CONSEQUENCE=SLC7A2|ENSG00000003989|+|SLC7A2-20...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5790762,chrX,56269450,56269529,ENST00000640927,ENSG00000102349,+,56233212,1150,protein_coding,38.82,...,.,chrX,56269524,56269527,MU67798354,.,ATG,A,.,CONSEQUENCE=KLF8|ENSG00000102349|+|KLF8-002|EN...
5790816,chr18,70135144,70135223,ENST00000640931,ENSG00000176225,-,70139607,2065,nonsense_mediated_decay,36.76,...,.,chr18,70135193,70135196,MU2853,.,TTC,T,.,CONSEQUENCE=RTTN|ENSG00000176225|1|RTTN-001|EN...
5790927,chr10,77110133,77110212,ENST00000640934,ENSG00000156113,-,77637447,3528,protein_coding,43.94,...,.,chr10,77110160,77110163,MU128767388,.,ATT,A,.,CONSEQUENCE=KCNMA1|ENSG00000156113|1|KCNMA1-00...
5791749,chr10,77110133,77110212,ENST00000640969,ENSG00000156113,-,77637806,5789,protein_coding,43.94,...,.,chr10,77110160,77110163,MU128767388,.,ATT,A,.,CONSEQUENCE=KCNMA1|ENSG00000156113|1|KCNMA1-00...


In [20]:
intersect_df.to_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/Intersected_data/donor_ICGC_intersected.tsv", sep="\t", index=False)