In [1]:
import os, io
import pandas as pd
import pysam
import pickle
import pybedtools
print(pybedtools.__file__)
pybedtools.helpers.set_tempdir('/home/pdutta/temp')
from IPython.display import display, HTML

/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/pybedtools/__init__.py


In [3]:
cancer_type="Brain" 

In [4]:
vcf_folder_path="/home/pdutta/Data/Cancer_wiseGDC/New_data/{}/Downloaded_files/VCF".format(cancer_type)
reference_genome_path="/home/pdutta/Data/Human_Genome_Data/GRCh38_latest_genomic.fna"

In [5]:
non_coding_region="acceptor"

In [6]:
df= pd.read_csv("/home/pdutta/Data/Noncoding_region/Unique_{}_RAW_Combined_BED.tsv".format(non_coding_region), sep="\t")
df

Unnamed: 0,chr_name,start,end,strand,transcript_id,exon_id,transcript_type
0,chr1,12572,12651,+,"ENST00000450305.2, ENST00000456328.2","ENSE00001758273.2, ENSE00003582793.1","transcribed_unprocessed_pseudogene, processed_..."
1,chr1,12934,13013,+,ENST00000450305.2,ENSE00001799933.2,transcribed_unprocessed_pseudogene
2,chr1,13180,13259,+,"ENST00000450305.2, ENST00000456328.2","ENSE00001746346.2, ENSE00002312635.1","transcribed_unprocessed_pseudogene, processed_..."
3,chr1,13412,13491,+,ENST00000450305.2,ENSE00001863096.1,transcribed_unprocessed_pseudogene
4,chr1,14999,15078,-,ENST00000488147.1,ENSE00001935574.1,unprocessed_pseudogene
...,...,...,...,...,...,...,...
295391,chrY,57211720,57211799,+,"ENST00000340131.12_PAR_Y, ENST00000359512.8_PA...","ENSE00001956598.1, ENSE00001416295.3, ENSE0000...","retained_intron, protein_coding, processed_tra..."
295392,chrY,57213086,57213165,-,ENST00000507418.6_PAR_Y,ENSE00002023900.1,unprocessed_pseudogene
295393,chrY,57213318,57213397,-,ENST00000507418.6_PAR_Y,ENSE00002036959.1,unprocessed_pseudogene
295394,chrY,57213563,57213642,-,ENST00000507418.6_PAR_Y,ENSE00002021169.1,unprocessed_pseudogene


In [7]:
acceptor_bed = pybedtools.BedTool.from_dataframe(df)

In [8]:
reference_fasta = pysam.FastaFile(reference_genome_path)

In [9]:
def get_vcf_gz_files_except_logs(root_folder):
    all_files = []

    # Walk through the directory tree
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # If "logs" is in dirnames, remove it to avoid traversing it
        if 'logs' in dirnames:
            dirnames.remove('logs')

        # Add only the filenames with the extension .vcf.gz in the current directory to the all_files list
        for filename in filenames:
            if filename.endswith('.vcf.gz'):
                all_files.append(os.path.join(dirpath, filename))
    return all_files

In [10]:
def vcf_to_dataframe(vcf_path):
    """
    Convert a .vcf.gz file into a pandas DataFrame.

    Parameters:
    - vcf_path (str): path to the .vcf.gz file

    Returns:
    - pd.DataFrame: VCF data as a DataFrame
    """

    try:
        # Open the VCF file
        vcf_file = pysam.VariantFile(vcf_path)
    except ValueError as e:
        print(f"Failed to open file {vcf_path}: {e}")
        with open("/data/projects/GDC_Cancer_Wise/Brain/Data/"+vcf_path.split("/")[-2]+"_error_log.txt", "a") as error_log:
            error_log.write(f"{vcf_path}\n")
        return None

    # Extracting the data and the columns
    data = []
    
    columns =  ["CHROM", "START_POS", "ID", "REF", "ALT", "QUAL", "FILTER"] + list(vcf_file.header.info.keys()) 
    for record in vcf_file:
        basic_data = [record.chrom, record.pos, record.id, record.ref, 
                      ','.join(str(alt) for alt in record.alts), record.qual, record.filter.keys()[0] if record.filter.keys() else 'PASS']
        row_data = [record.info.get(key) for key in vcf_file.header.info.keys()]
        data.append(basic_data + row_data)

    df = pd.DataFrame(data, columns=columns)
    df["START_POS"] = df["START_POS"]-1
    end = df["START_POS"] + df['REF'].str.len()
    df.insert(2, 'END_POS', end)

    # Close the VCF file
    vcf_file.close()

    return df

In [11]:
files = get_vcf_gz_files_except_logs(vcf_folder_path)

In [12]:
len(files)

432

In [13]:
intersected_vcf_data = {}
df_statistics  = pd.DataFrame(columns= ["filename", "Patient_ID", 'work_flow',  'VCF_instance','VCF_feature','Intersected_instances', 'VCF_column_names' ])

In [15]:
for file_path in files:
    file_name =  file_path.split('/')[-1]
    parts = file_name.split('.')
    print(parts)
    df_vcf = vcf_to_dataframe(file_path)
    #print(df_vcf)
    if df_vcf is None:
        continue
    vcf_bed = pybedtools.BedTool.from_dataframe(df_vcf)
    intersect_vcf_acceptor = acceptor_bed.intersect(vcf_bed, wa=True, wb=True)
    column_list  = df.columns.to_list() + df_vcf.columns.to_list()
    df_intersection = intersect_vcf_acceptor.to_dataframe(names=column_list)
    #print(df_intersection.columns)
    df_intersection = df_intersection[(df_intersection['REF'].str.len() < 10) & (df_intersection['ALT'].str.len() < 10)]
    # print(df_intersection[['REF','ALT']])
    # input()
    #print(df_intersection)
    #print(df_statistics)
    #input()
    #gdc_tag = parts[-4]
    patient_ID = parts[0]
    work_flow = parts[2]
    instance =  df_vcf.shape[0]
    feature = df_vcf.shape[1]
    intersected_instance = df_intersection.shape[0]
    feature_list = list(df_vcf.columns)
    print(patient_ID, work_flow, instance, feature,intersected_instance,  feature_list)
    df_statistics.loc[len(df_statistics)] = [file_name, patient_ID, work_flow, instance, feature,intersected_instance,  feature_list]
    intersected_vcf_data[patient_ID+"_"+work_flow] = df_intersection

['9e35e7c1-b8e9-441b-9520-3195359f8e43', 'wgs', 'sanger_raw_pindel', 'raw_somatic_mutation', 'vcf', 'gz']
9e35e7c1-b8e9-441b-9520-3195359f8e43 sanger_raw_pindel 857759 17 4581 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'FF017', 'LEN', 'OLD_VARIANT', 'PC', 'RE', 'REP', 'RS', 'S1', 'S2']
['2c5b021f-f9e1-4a42-8755-8704f006016b', 'wgs', 'sanger_raw_pindel', 'raw_somatic_mutation', 'vcf', 'gz']
2c5b021f-f9e1-4a42-8755-8704f006016b sanger_raw_pindel 1059560 17 5199 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'FF017', 'LEN', 'OLD_VARIANT', 'PC', 'RE', 'REP', 'RS', 'S1', 'S2']
['1d3daf9d-7740-484e-850b-827a6b7d451d', 'wgs', 'CaVEMan', 'raw_somatic_mutation', 'vcf', 'gz']
1d3daf9d-7740-484e-850b-827a6b7d451d CaVEMan 39764 20 100 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'ASMD', 'ASRD', 'CLPM', 'DP', 'DS', 'GP', 'MP', 'SG', 'SNP', 'SP', 'TG', 'TP']
['54d621a5-8e45-4bbc-92c4-1c7f36212361', 'wgs', 'sanger_raw_pi

In [16]:
output_path = "/home/pdutta/Data/Cancer_wiseGDC/New_data/{}/Generated_files/Intersected_Data".format(cancer_type)
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [17]:
df_statistics.to_csv(output_path+"/VCF_statistics.tsv", sep="\t", index=False)

In [18]:
with open(output_path+"/intersected_vcf_{}_data.pkl".format(non_coding_region), "wb") as file:
    pickle.dump(intersected_vcf_data, file)

In [19]:
df_statistics

Unnamed: 0,filename,Patient_ID,work_flow,VCF_instance,VCF_feature,Intersected_instances,VCF_column_names
0,9e35e7c1-b8e9-441b-9520-3195359f8e43.wgs.sange...,9e35e7c1-b8e9-441b-9520-3195359f8e43,sanger_raw_pindel,857759,17,4581,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
1,2c5b021f-f9e1-4a42-8755-8704f006016b.wgs.sange...,2c5b021f-f9e1-4a42-8755-8704f006016b,sanger_raw_pindel,1059560,17,5199,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
2,1d3daf9d-7740-484e-850b-827a6b7d451d.wgs.CaVEM...,1d3daf9d-7740-484e-850b-827a6b7d451d,CaVEMan,39764,20,100,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
3,54d621a5-8e45-4bbc-92c4-1c7f36212361.wgs.sange...,54d621a5-8e45-4bbc-92c4-1c7f36212361,sanger_raw_pindel,853961,17,4556,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
4,07f93ac0-50e0-4c1d-b4da-898062a5b6b8.wgs.CaVEM...,07f93ac0-50e0-4c1d-b4da-898062a5b6b8,CaVEMan,57346,20,173,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
...,...,...,...,...,...,...,...
427,02eb77ff-2e07-4d56-9c32-d9fa415c7ce2.wgs.CaVEM...,02eb77ff-2e07-4d56-9c32-d9fa415c7ce2,CaVEMan,51672,20,165,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
428,fdfbb80c-8e66-499a-ac59-f311d6f4b6fa.wgs.sange...,fdfbb80c-8e66-499a-ac59-f311d6f4b6fa,sanger_raw_pindel,883645,17,4721,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
429,c9a63b40-b4c3-44ca-8a8c-eb42a538438d.wgs.CaVEM...,c9a63b40-b4c3-44ca-8a8c-eb42a538438d,CaVEMan,214348,20,924,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
430,7d9bd465-2e99-4b07-902f-752fa348efb8.wgs.sange...,7d9bd465-2e99-4b07-902f-752fa348efb8,sanger_raw_pindel,1096330,17,5404,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."


In [17]:
df_statistics[df_statistics['Patient_ID']=="cc884f4d-e62d-4214-bfaa-81052a02a246"]

Unnamed: 0,filename,Patient_ID,work_flow,VCF_instance,VCF_feature,Intersected_instances,VCF_column_names
12,cc884f4d-e62d-4214-bfaa-81052a02a246.wgs.CaVEM...,cc884f4d-e62d-4214-bfaa-81052a02a246,CaVEMan,48561,20,160,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
29,cc884f4d-e62d-4214-bfaa-81052a02a246.wgs.sange...,cc884f4d-e62d-4214-bfaa-81052a02a246,sanger_raw_pindel,877790,17,4597,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."


In [20]:
df_statistics.groupby('work_flow').size()

work_flow
CaVEMan              216
sanger_raw_pindel    216
dtype: int64

In [1]:
intersected_vcf_data['ef6c3ecc-0bb6-4035-9c45-57740f5bcaa1_sanger_raw_pindel']

NameError: name 'intersected_vcf_data' is not defined

In [21]:
len(intersected_vcf_data)

432

In [26]:
intersected_vcf_data['9e35e7c1-b8e9-441b-9520-3195359f8e43_sanger_raw_pindel']

Unnamed: 0,chr_name,start,end,strand,transcript_id,exon_id,transcript_type,CHROM,START_POS,END_POS,...,FILTER,FF017,LEN,OLD_VARIANT,PC,RE,REP,RS,S1,S2
0,chr1,612825,612904,-,ENST00000641296.1,ENSE00003811489.1,processed_transcript,chr1,612865,612866,...,FF015,True,1,.,I,612877,9,612866,6,249.03500366210938
1,chr1,744786,744865,-,ENST00000506640.2,ENSE00003790979.1,processed_transcript,chr1,744864,744865,...,FF010,True,6,"('chr1:744866:G/GTGTGTG',)",I,744868,0,744865,12,404.22100830078125
2,chr1,1043198,1043277,+,"ENST00000379370.7, ENST00000620552.4, ENST0000...",ENSE00001605257.1,protein_coding,chr1,1043222,1043225,...,FF010,True,2,.,D,1043226,1,1043223,120,1649.3900146484375
4,chr1,1355713,1355792,-,"ENST00000309212.11, ENST00000445648.5, ENST000...",ENSE00001208985.1,protein_coding,chr1,1355778,1355780,...,FF010,True,1,.,D,1355782,2,1355779,8,339.27099609375
5,chr1,1657263,1657342,-,"ENST00000340677.9, ENST00000341028.8","ENSE00003752042.1, ENSE00003739845.1","protein_coding, processed_transcript",chr1,1657295,1657299,...,FF010,True,3,.,D,1657303,2,1657296,12,424.0889892578125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5253,chrY,11340755,11340834,-,ENST00000331172.7,ENSE00002258116.2,unprocessed_pseudogene,chrY,11340786,11340789,...,FF010,True,2,.,D,11340793,2,11340787,70,1278.9599609375
5254,chrY,11350944,11351023,-,ENST00000331172.7,ENSE00001730908.1,unprocessed_pseudogene,chrY,11350999,11351000,...,FF010,True,2,.,I,11351008,6,11351000,4,246.76199340820312
5255,chrY,11350944,11351023,-,ENST00000331172.7,ENSE00001730908.1,unprocessed_pseudogene,chrY,11350999,11351000,...,FF010,True,4,.,I,11351001,0,11351000,15,494.7279968261719
5256,chrY,12786481,12786560,+,"ENST00000338981.7, ENST00000426564.6, ENST0000...","ENSE00003561084.1, ENSE00003494757.1","protein_coding, processed_transcript",chrY,12786500,12786502,...,FF006,True,1,.,D,12786518,16,12786501,30,744.5
