In [1]:
import os
import pandas as pd
import numpy as np
import pybedtools 
from Bio import SeqIO
from io import StringIO
pybedtools.helpers.set_tempdir("/data/projects/temp")
from tqdm import tqdm

In [2]:
ICGC_bed_file = "/data/projects/DNABERT_snv/Manuscript_11_2023/bed/ICGC/ICGC.bed"

In [5]:
# Define a function to read large file in chunks and show progress
def read_large_file(file_path, chunk_size=100000):
    # Determine the total size of the file
    total_size = os.path.getsize(file_path)
    print("Total Size :", total_size)
    # Iterator to read file in chunks
    iterator = pd.read_csv(file_path, chunksize=chunk_size, sep="\t", header=None, names=["Chromosome", "Start", "End", "Mutation_ID","T1","Reference_Allele", "Alternative_Allele","T2", "Info"])

    # Use tqdm to show progress, using file size as a proxy
    chunks = []
    # Initialize tqdm with the total number of chunks (estimated)
    with tqdm(total=os.path.getsize(file_path) // (chunk_size * 1024), desc="Reading file") as pbar:
        for chunk in iterator:
            # Process each chunk
            chunks.append(chunk)

            # Update the progress bar by one chunk
            pbar.update(1)

    print("Reading done....")
    # Concatenate chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)

    return df

In [6]:
df = read_large_file(ICGC_bed_file)

Total Size : 30241674872


Reading file: 818it [02:59,  4.56it/s]                                                                                  


Reading done....


In [21]:
df['Chromosome'] = 'chr' + df['Chromosome'].astype(str)
df

Unnamed: 0,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr1,10001,10002,MU43280717,.,A,T,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
1,chr1,10025,10026,MU75019506,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
2,chr1,10071,10072,MU40549878,.,C,CA,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
3,chr1,10073,10074,MU121369972,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
4,chr1,10079,10080,MU121498435,.,A,G,.,"CONSEQUENCE=||||||intergenic_region||,WASH7P|E..."
...,...,...,...,...,...,...,...,...,...
81782583,chrY,59352410,59352411,MU81205562,.,G,A,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81782584,chrY,59354789,59354790,MU81205574,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81782585,chrY,59355914,59355915,MU81205598,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...
81782586,chrY,59357550,59357551,MU81205605,.,C,T,.,CONSEQUENCE=||||||intergenic_region||;OCCURREN...


In [8]:
def expand_info(info):
    # Split the info field into its components
    fields = info.split(';')
    consequence = fields[0].split('=')[1] if len(fields) > 0 else ''
    occurrence = fields[1].split('=')[1] if len(fields) > 1 else ''
    affected_donors = fields[2].split('=')[1] if len(fields) > 2 else ''
    mutation = fields[3].split('=')[1] if len(fields) > 3 else ''
    project_count = fields[4].split('=')[1] if len(fields) > 4 else ''
    studies = fields[5].split('=')[1] if len(fields) > 5 else ''
    tested_donors = fields[6].split('=')[1] if len(fields) > 6 else ''
    return pd.Series([consequence, occurrence, affected_donors, mutation, project_count, studies, tested_donors])

In [9]:
# Apply the expansion function to each row
expanded_df = df['Info'].apply(expand_info)
expanded_df.columns = ["Consequedf_accepter","Occurrence", "Affected_Donors", "Mutation", "Project_Count", "Studies", "Tested_Donors"]

KeyboardInterrupt: 

In [None]:
result_df = pd.concat([df[["Chromosome", "Start", "End", "Mutation_ID","T1","Reference_Allele", "Alternative_Allele","T2"]], expanded_df], axis=1)

In [31]:
df_acceptor = pd.read_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/bed/DNABERT_run_data/acceptor.bed", sep="\t", header=None)
df_acceptor[2] = df_acceptor[2].round().astype(int)
df_acceptor

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
1,chr7,127588371,127588450,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
2,chr7,127589445,127589524,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
3,chr7,127590923,127591002,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
4,chr7,127591173,127591252,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,ARF5,HouseTrans,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1175762,chr17,63706700,63706779,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175763,chr17,63707379,63707458,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175764,chr17,63710575,63710654,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,
1175765,chr17,63710797,63710876,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,STRADA,NullTrans,


In [23]:
icgc_bed = pybedtools.BedTool.from_dataframe(df.iloc[1:])

In [33]:
acceptor_bed = pybedtools.BedTool.from_dataframe(df_acceptor)

In [34]:
intersect_bed = acceptor_bed.intersect(icgc_bed, wa=True, wb=True)

In [38]:
intersect_df = intersect_bed.to_dataframe()
new_columns = ['chr_name', 'Acceptor_start', 'Acceptor_end', "Transcript_ID","Gene ID", "strand","TSS","ID","tag","value","GENE","Transcript_type","Tissue", "Chromosome", "Start",	"End", "Mutation_ID", "T1", "Reference_Allele", "Alternative_Allele", "T2","Info"]
intersect_df.columns = new_columns
intersect_df

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 22 fields; you can supply custom names with the `names` kwarg
  warn(


Unnamed: 0,chr_name,Acceptor_start,Acceptor_end,Transcript_ID,Gene ID,strand,TSS,ID,tag,value,...,Tissue,Chromosome,Start,End,Mutation_ID,T1,Reference_Allele,Alternative_Allele,T2,Info
0,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127589068,127589069,MU35907306,.,T,G,.,CONSEQUENCE=SND1|ENSG00000197157|+|SND1-001|EN...
1,chr7,127589043,127589122,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127589109,127589110,MU133330136,.,C,G,.,CONSEQUENCE=SND1|ENSG00000197157|+|SND1-001|EN...
2,chr7,127588371,127588450,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588377,127588378,MU2774305,.,G,A,.,CONSEQUENCE=SND1|ENSG00000197157|+|SND1-001|EN...
3,chr7,127588371,127588450,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127588412,127588413,MU117944068,.,G,A,.,CONSEQUENCE=SND1|ENSG00000197157|+|SND1-001|EN...
4,chr7,127589445,127589524,ENST00000000233,ENSG00000004059,+,127588411,1032,protein_coding,56.14,...,"Adipose_Tissue, Adrenal_Gland, Bladder, Blood,...",chr7,127589453,127589454,MU86144404,.,G,A,.,CONSEQUENCE=SND1|ENSG00000197157|+|SND1-001|EN...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2375836,chr17,63704008,63704087,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63704071,63704072,MU65877200,.,T,C,.,CONSEQUENCE=CEP112|ENSG00000154240|1|CEP112-00...
2375837,chr17,63704008,63704087,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63704075,63704076,MU131889209,.,G,A,.,CONSEQUENCE=CEP112|ENSG00000154240|1|CEP112-00...
2375838,chr17,63704543,63704622,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63704608,63704609,MU68047245,.,G,A,.,CONSEQUENCE=CEP112|ENSG00000154240|1|CEP112-00...
2375839,chr17,63707379,63707458,ENST00000640999,ENSG00000266173,-,63741796,2336,protein_coding,48.88,...,.,chr17,63707400,63707401,MU9270347,.,G,A,.,CONSEQUENCE=CEP112|ENSG00000154240|1|CEP112-00...


In [39]:
intersect_df.to_csv("/data/projects/DNABERT_snv/Manuscript_11_2023/Intersected_data/acceptor_ICGC_intersected.tsv", sep="\t", index=False)