In [1]:
import os, io
import pandas as pd
import pysam
import pickle
import pybedtools
print(pybedtools.__file__)
pybedtools.helpers.set_tempdir('/home/pdutta/temp')
from IPython.display import display, HTML

/home/pdutta/anaconda3/lib/python3.9/site-packages/pybedtools/__init__.py


In [2]:
# Input locations used by the cells below: the GRCh38 reference FASTA and the
# root folder that is searched for WGS .vcf.gz files.
reference_genome_path = "/home/pdutta/Data/Human_Genome_Data/GRCh38_latest_genomic.fna"
vcf_folder_path = "/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/WGS_VCF_Files"

In [3]:
# Name of the non-coding region type; it is substituted into the BED table
# filename that is loaded and into the name of the pickle that is written.
non_coding_region = "acceptor"

In [4]:
# Load the tab-separated table of unique regions for the selected region type.
df = pd.read_csv(
    "/home/pdutta/Data/Noncoding_region/Unique_{}_RAW_Combined_BED.tsv".format(non_coding_region),
    sep="\t",
)
df

Unnamed: 0,chr_name,start,end,strand,transcript_id,exon_id,transcript_type
0,chr1,12572,12651,+,"ENST00000450305.2, ENST00000456328.2","ENSE00001758273.2, ENSE00003582793.1","transcribed_unprocessed_pseudogene, processed_..."
1,chr1,12934,13013,+,ENST00000450305.2,ENSE00001799933.2,transcribed_unprocessed_pseudogene
2,chr1,13180,13259,+,"ENST00000450305.2, ENST00000456328.2","ENSE00001746346.2, ENSE00002312635.1","transcribed_unprocessed_pseudogene, processed_..."
3,chr1,13412,13491,+,ENST00000450305.2,ENSE00001863096.1,transcribed_unprocessed_pseudogene
4,chr1,14999,15078,-,ENST00000488147.1,ENSE00001935574.1,unprocessed_pseudogene
...,...,...,...,...,...,...,...
295391,chrY,57211720,57211799,+,"ENST00000340131.12_PAR_Y, ENST00000359512.8_PA...","ENSE00001956598.1, ENSE00001416295.3, ENSE0000...","retained_intron, protein_coding, processed_tra..."
295392,chrY,57213086,57213165,-,ENST00000507418.6_PAR_Y,ENSE00002023900.1,unprocessed_pseudogene
295393,chrY,57213318,57213397,-,ENST00000507418.6_PAR_Y,ENSE00002036959.1,unprocessed_pseudogene
295394,chrY,57213563,57213642,-,ENST00000507418.6_PAR_Y,ENSE00002021169.1,unprocessed_pseudogene


In [5]:
# Wrap the region table in a BedTool so it can be intersected with each VCF.
acceptor_bed = pybedtools.BedTool.from_dataframe(df)

In [6]:
# Open the reference genome FASTA with pysam.
# NOTE(review): reference_fasta is not referenced by any other visible cell and
# is never closed — confirm it is still needed here.
reference_fasta = pysam.FastaFile(reference_genome_path)

In [7]:
def get_vcf_gz_files_except_logs(root_folder):
    """Return the paths of all ``.vcf.gz`` files found under ``root_folder``.

    The directory tree is walked recursively; any sub-directory named
    ``logs`` is pruned, so nothing beneath it is visited.

    Parameters:
    - root_folder (str): directory at which the search starts

    Returns:
    - list[str]: paths (``dirpath`` joined with the filename) in the order
      produced by ``os.walk``
    """
    collected = []

    for current_dir, subdirs, names in os.walk(root_folder):
        # Editing the sub-directory list in place tells os.walk not to descend
        # into the removed entry.
        if 'logs' in subdirs:
            subdirs.remove('logs')

        collected.extend(
            os.path.join(current_dir, name)
            for name in names
            if name.endswith('.vcf.gz')
        )

    return collected

In [8]:
def vcf_to_dataframe(vcf_path):
    """
    Convert a .vcf.gz file into a pandas DataFrame with BED-style coordinates.

    Parameters:
    - vcf_path (str): path to the .vcf.gz file

    Returns:
    - pd.DataFrame: one row per VCF record with the columns CHROM, START_POS,
      END_POS, ID, REF, ALT, QUAL, FILTER followed by one column per INFO key
      declared in the VCF header. START_POS is the record position minus one
      and END_POS is START_POS + len(REF). Multiple ALT alleles are joined
      with ','; a record without any ALT allele gets '.'.
    - None: if pysam raises ValueError while opening the file. The path is
      then appended to an error log named after the file's parent folder.
    """

    try:
        # Open the VCF file
        vcf_file = pysam.VariantFile(vcf_path)
    except ValueError as e:
        print(f"Failed to open file {vcf_path}: {e}")
        # NOTE(review): the log directory is hard-coded and differs from the
        # data root used elsewhere in this notebook — confirm it exists.
        with open("/data/projects/GDC_Cancer_Wise/Brain/Data/"+vcf_path.split("/")[-2]+"_error_log.txt", "a") as error_log:
            error_log.write(f"{vcf_path}\n")
        return None

    try:
        # The INFO keys are fixed by the header, so compute them once instead
        # of once per record.
        info_keys = list(vcf_file.header.info.keys())
        columns = ["CHROM", "START_POS", "ID", "REF", "ALT", "QUAL", "FILTER"] + info_keys

        data = []
        for record in vcf_file:
            # pysam reports alts as None when the record has no ALT allele;
            # joining None would raise TypeError, so emit the VCF missing
            # marker instead.
            alt = ','.join(str(allele) for allele in record.alts) if record.alts else '.'

            # Only the first filter name is kept; records with no filter are
            # labelled 'PASS' (same convention as before).
            filter_names = list(record.filter.keys())
            filter_value = filter_names[0] if filter_names else 'PASS'

            basic_data = [record.chrom, record.pos, record.id, record.ref,
                          alt, record.qual, filter_value]
            row_data = [record.info.get(key) for key in info_keys]
            data.append(basic_data + row_data)
    finally:
        # Release the file handle even if reading a record fails part-way.
        vcf_file.close()

    df = pd.DataFrame(data, columns=columns)
    # Shift to a 0-based start and derive the end from the REF length so the
    # table can be used as BED intervals.
    df["START_POS"] = df["START_POS"]-1
    end = df["START_POS"] + df['REF'].str.len()
    df.insert(2, 'END_POS', end)

    return df

In [9]:
# Collect every .vcf.gz path under the VCF folder (directories named "logs" are skipped).
files = get_vcf_gz_files_except_logs(vcf_folder_path)

In [10]:
# Sanity check: number of VCF files that will be processed.
len(files)

110

In [11]:
# Summary table: one row is appended per successfully processed VCF file.
df_statistics = pd.DataFrame(
    columns=[
        "filename",
        "Patient_ID",
        'work_flow',
        'VCF_instance',
        'VCF_feature',
        'Intersected_instances',
        'VCF_column_names',
    ]
)
# Intersection tables keyed by "<Patient_ID>_<work_flow>".
intersected_vcf_data = dict()

In [12]:
# For every VCF file: convert it to a BED-like table, intersect it with the
# region BedTool, keep only variants whose REF and ALT strings are short, and
# record per-file summary statistics.
MAX_ALLELE_LEN = 10  # REF/ALT strings of this length or longer are dropped

for file_path in files:
    file_name = os.path.basename(file_path)
    # Filenames look like "<id>.wgs.<workflow>.raw_somatic_mutation.vcf.gz".
    parts = file_name.split('.')
    print(parts)

    df_vcf = vcf_to_dataframe(file_path)
    if df_vcf is None:
        # The file could not be opened; vcf_to_dataframe has already reported it.
        continue

    vcf_bed = pybedtools.BedTool.from_dataframe(df_vcf)
    intersect_vcf_acceptor = acceptor_bed.intersect(vcf_bed, wa=True, wb=True)
    # With wa/wb every output row is the region columns followed by the VCF columns.
    column_list = df.columns.to_list() + df_vcf.columns.to_list()
    df_intersection = intersect_vcf_acceptor.to_dataframe(names=column_list)

    # Each BedTool above is backed by a temp file that pybedtools otherwise
    # keeps until the kernel exits; with many large VCFs that fills the temp
    # directory. Only this iteration's files are removed — acceptor_bed is
    # left untouched.
    pybedtools.helpers.close_or_delete(vcf_bed, intersect_vcf_acceptor)

    if df_intersection.empty:
        # An empty intersection comes back as a frame without columns, which
        # would make the REF/ALT lookups below raise KeyError.
        df_intersection = pd.DataFrame(columns=column_list)

    short_ref = df_intersection['REF'].str.len() < MAX_ALLELE_LEN
    short_alt = df_intersection['ALT'].str.len() < MAX_ALLELE_LEN
    df_intersection = df_intersection[short_ref & short_alt]

    patient_ID = parts[0]
    work_flow = parts[2]
    instance = df_vcf.shape[0]
    feature = df_vcf.shape[1]
    intersected_instance = df_intersection.shape[0]
    feature_list = list(df_vcf.columns)
    print(patient_ID, work_flow, instance, feature, intersected_instance, feature_list)
    df_statistics.loc[len(df_statistics)] = [file_name, patient_ID, work_flow, instance, feature, intersected_instance, feature_list]
    intersected_vcf_data[patient_ID+"_"+work_flow] = df_intersection

['1d3daf9d-7740-484e-850b-827a6b7d451d', 'wgs', 'CaVEMan', 'raw_somatic_mutation', 'vcf', 'gz']
1d3daf9d-7740-484e-850b-827a6b7d451d CaVEMan 39764 20 100 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'ASMD', 'ASRD', 'CLPM', 'DP', 'DS', 'GP', 'MP', 'SG', 'SNP', 'SP', 'TG', 'TP']
['54d621a5-8e45-4bbc-92c4-1c7f36212361', 'wgs', 'sanger_raw_pindel', 'raw_somatic_mutation', 'vcf', 'gz']
54d621a5-8e45-4bbc-92c4-1c7f36212361 sanger_raw_pindel 853961 17 4556 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'FF017', 'LEN', 'OLD_VARIANT', 'PC', 'RE', 'REP', 'RS', 'S1', 'S2']
['07f93ac0-50e0-4c1d-b4da-898062a5b6b8', 'wgs', 'CaVEMan', 'raw_somatic_mutation', 'vcf', 'gz']
07f93ac0-50e0-4c1d-b4da-898062a5b6b8 CaVEMan 57346 20 173 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'ASMD', 'ASRD', 'CLPM', 'DP', 'DS', 'GP', 'MP', 'SG', 'SNP', 'SP', 'TG', 'TP']
['ef6c3ecc-0bb6-4035-9c45-57740f5bcaa1', 'wgs', 'sanger_raw_pindel', 'raw_

In [13]:
# Directory that receives the statistics table and the pickled intersections.
output_path = "/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/Generated_files/Intersected_Data"
# exist_ok=True creates the directory (and parents) only when missing, without
# the race of a separate existence check; re-running the cell is a no-op.
os.makedirs(output_path, exist_ok=True)

In [14]:
# Persist the per-file summary as a tab-separated file (row index omitted).
df_statistics.to_csv(output_path+"/VCF_statistics.tsv", sep="\t", index=False)

In [15]:
# Serialise the dict of intersection tables; the region name is embedded in the filename.
pickle_path = output_path+"/intersected_vcf_{}_data.pkl".format(non_coding_region)
with open(pickle_path, "wb") as handle:
    pickle.dump(intersected_vcf_data, handle)

In [16]:
# Display the summary table built by the intersection loop.
df_statistics

Unnamed: 0,filename,Patient_ID,work_flow,VCF_instance,VCF_feature,Intersected_instances,VCF_column_names
0,1d3daf9d-7740-484e-850b-827a6b7d451d.wgs.CaVEM...,1d3daf9d-7740-484e-850b-827a6b7d451d,CaVEMan,39764,20,100,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
1,54d621a5-8e45-4bbc-92c4-1c7f36212361.wgs.sange...,54d621a5-8e45-4bbc-92c4-1c7f36212361,sanger_raw_pindel,853961,17,4556,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
2,07f93ac0-50e0-4c1d-b4da-898062a5b6b8.wgs.CaVEM...,07f93ac0-50e0-4c1d-b4da-898062a5b6b8,CaVEMan,57346,20,173,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
3,ef6c3ecc-0bb6-4035-9c45-57740f5bcaa1.wgs.sange...,ef6c3ecc-0bb6-4035-9c45-57740f5bcaa1,sanger_raw_pindel,1249059,17,6312,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
4,2c9bbb8f-4b4d-4c0e-a158-1fb0fb95aca7.wgs.CaVEM...,2c9bbb8f-4b4d-4c0e-a158-1fb0fb95aca7,CaVEMan,42442,20,92,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
...,...,...,...,...,...,...,...
105,3adefeae-8c71-47a0-a9d5-3d0005aa0ef4.wgs.sange...,3adefeae-8c71-47a0-a9d5-3d0005aa0ef4,sanger_raw_pindel,884112,17,4680,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
106,b01b9f70-fc7d-43d1-bc8a-6c435ffd3705.wgs.sange...,b01b9f70-fc7d-43d1-bc8a-6c435ffd3705,sanger_raw_pindel,908507,17,4792,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
107,9a3edc2a-0eb3-4d88-965d-2801cd19df51.wgs.sange...,9a3edc2a-0eb3-4d88-965d-2801cd19df51,sanger_raw_pindel,728647,17,4011,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
108,d6cb373e-d9bc-43a1-8edc-2199b38b95f7.wgs.sange...,d6cb373e-d9bc-43a1-8edc-2199b38b95f7,sanger_raw_pindel,880376,17,4695,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."


In [17]:
# Show all summary rows recorded for a single ID (one row per workflow file).
df_statistics[df_statistics['Patient_ID']=="cc884f4d-e62d-4214-bfaa-81052a02a246"]

Unnamed: 0,filename,Patient_ID,work_flow,VCF_instance,VCF_feature,Intersected_instances,VCF_column_names
12,cc884f4d-e62d-4214-bfaa-81052a02a246.wgs.CaVEM...,cc884f4d-e62d-4214-bfaa-81052a02a246,CaVEMan,48561,20,160,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."
29,cc884f4d-e62d-4214-bfaa-81052a02a246.wgs.sange...,cc884f4d-e62d-4214-bfaa-81052a02a246,sanger_raw_pindel,877790,17,4597,"[CHROM, START_POS, END_POS, ID, REF, ALT, QUAL..."


In [18]:
# Count how many processed files belong to each workflow.
df_statistics.groupby('work_flow').size()

work_flow
CaVEMan              55
sanger_raw_pindel    55
dtype: int64

In [1]:
# Inspect the intersection table of one file.
# NOTE(review): this cell carries execution count 1 and raised NameError, i.e.
# it was run in a fresh kernel where intersected_vcf_data had not been built.
# It only works after the cells that create and fill intersected_vcf_data have
# run (or after the dict is reloaded from the pickle written earlier).
intersected_vcf_data['ef6c3ecc-0bb6-4035-9c45-57740f5bcaa1_sanger_raw_pindel']

NameError: name 'intersected_vcf_data' is not defined