In [1]:
import os, io
import pandas as pd
import pysam
import pickle
import pybedtools
print(pybedtools.__file__)
pybedtools.helpers.set_tempdir('/home/pdutta/temp')
from IPython.display import display, HTML

/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/pybedtools/__init__.py


In [2]:
vcf_folder_path="/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/WGS_VCF_Files"
reference_genome_path="/home/pdutta/Data/Human_Genome_Data/GRCh38_latest_genomic.fna"

In [3]:
non_coding_region="donor"

In [5]:
df= pd.read_csv("/home/pdutta/Data/Noncoding_region/Unique_{}_RAW_Combined_BED.tsv".format(non_coding_region), sep="\t")
df

Unnamed: 0,chr_name,start,end,strand,transcript_id,exon_id,transcript_type
0,chr1,12189,12268,+,"ENST00000450305.2, ENST00000456328.2","ENSE00001671638.2, ENSE00002234944.1","transcribed_unprocessed_pseudogene, processed_..."
1,chr1,12659,12738,+,ENST00000450305.2,ENSE00001758273.2,transcribed_unprocessed_pseudogene
2,chr1,12683,12762,+,ENST00000456328.2,ENSE00003582793.1,processed_transcript
3,chr1,13336,13415,+,ENST00000450305.2,ENSE00001746346.2,transcribed_unprocessed_pseudogene
4,chr1,15756,15835,-,ENST00000488147.1,ENSE00002030414.1,unprocessed_pseudogene
...,...,...,...,...,...,...,...
297951,chrY,57211531,57211610,+,ENST00000464205.6_PAR_Y,ENSE00003785071.1,processed_transcript
297952,chrY,57211582,57211661,+,"ENST00000340131.12_PAR_Y, ENST00000359512.8_PA...","ENSE00003627301.1, ENSE00003905275.1, ENSE0000...","retained_intron, protein_coding"
297953,chrY,57213164,57213243,-,ENST00000507418.6_PAR_Y,ENSE00002036959.1,unprocessed_pseudogene
297954,chrY,57213840,57213919,-,ENST00000507418.6_PAR_Y,ENSE00002046926.1,unprocessed_pseudogene


In [6]:
donor_bed = pybedtools.BedTool.from_dataframe(df)

In [7]:
reference_fasta = pysam.FastaFile(reference_genome_path)

In [8]:
def get_vcf_gz_files_except_logs(root_folder):
    all_files = []

    # Walk through the directory tree
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # If "logs" is in dirnames, remove it to avoid traversing it
        if 'logs' in dirnames:
            dirnames.remove('logs')

        # Add only the filenames with the extension .vcf.gz in the current directory to the all_files list
        for filename in filenames:
            if filename.endswith('.vcf.gz'):
                all_files.append(os.path.join(dirpath, filename))
    return all_files

In [9]:
def vcf_to_dataframe(vcf_path):
    """
    Convert a .vcf.gz file into a pandas DataFrame.

    Parameters:
    - vcf_path (str): path to the .vcf.gz file

    Returns:
    - pd.DataFrame: VCF data as a DataFrame
    """

    try:
        # Open the VCF file
        vcf_file = pysam.VariantFile(vcf_path)
    except ValueError as e:
        print(f"Failed to open file {vcf_path}: {e}")
        with open("/data/projects/GDC_Cancer_Wise/Brain/Data/"+vcf_path.split("/")[-2]+"_error_log.txt", "a") as error_log:
            error_log.write(f"{vcf_path}\n")
        return None

    # Extracting the data and the columns
    data = []
    
    columns =  ["CHROM", "START_POS", "ID", "REF", "ALT", "QUAL", "FILTER"] + list(vcf_file.header.info.keys()) 
    for record in vcf_file:
        basic_data = [record.chrom, record.pos, record.id, record.ref, 
                      ','.join(str(alt) for alt in record.alts), record.qual, record.filter.keys()[0] if record.filter.keys() else 'PASS']
        row_data = [record.info.get(key) for key in vcf_file.header.info.keys()]
        data.append(basic_data + row_data)

    df = pd.DataFrame(data, columns=columns)
    df["START_POS"] = df["START_POS"]-1
    end = df["START_POS"] + df['REF'].str.len()
    df.insert(2, 'END_POS', end)

    # Close the VCF file
    vcf_file.close()

    return df

In [10]:
files = get_vcf_gz_files_except_logs(vcf_folder_path)

In [11]:
len(files)

110

In [12]:
intersected_vcf_data = {}
df_statistics  = pd.DataFrame(columns= ['filename', 'VCF_instance','VCF_feature','Intersected_instances', 'VCF_column_names' ])

In [13]:
for file_path in files:
    file_name =  file_path.split('/')[-1]
    parts = file_name.split('_')
    print(parts)
    df_vcf = vcf_to_dataframe(file_path)
    #print(df_vcf)
    if df_vcf is None:
        continue
    vcf_bed = pybedtools.BedTool.from_dataframe(df_vcf)
    intersect_vcf_donor = donor_bed.intersect(vcf_bed, wa=True, wb=True)
    column_list  = df.columns.to_list() + df_vcf.columns.to_list()
    df_intersection = intersect_vcf_donor.to_dataframe(names=column_list)
    #print(df_intersection)
    #print(df_statistics)
    #input()
    #gdc_tag = parts[-4]
    instance =  df_vcf.shape[0]
    feature = df_vcf.shape[1]
    intersected_instance = df_intersection.shape[0]
    feature_list = list(df_vcf.columns)
    print(file_name, instance, feature,intersected_instance,  feature_list)
    df_statistics.loc[len(df_statistics)] = [file_name, instance, feature,intersected_instance,  feature_list]
    intersected_vcf_data[file_name] = df_intersection

['1d3daf9d-7740-484e-850b-827a6b7d451d.wgs.CaVEMan.raw', 'somatic', 'mutation.vcf.gz']
1d3daf9d-7740-484e-850b-827a6b7d451d.wgs.CaVEMan.raw_somatic_mutation.vcf.gz 39764 20 123 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'ASMD', 'ASRD', 'CLPM', 'DP', 'DS', 'GP', 'MP', 'SG', 'SNP', 'SP', 'TG', 'TP']
['54d621a5-8e45-4bbc-92c4-1c7f36212361.wgs.sanger', 'raw', 'pindel.raw', 'somatic', 'mutation.vcf.gz']
54d621a5-8e45-4bbc-92c4-1c7f36212361.wgs.sanger_raw_pindel.raw_somatic_mutation.vcf.gz 853961 17 3328 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'FF017', 'LEN', 'OLD_VARIANT', 'PC', 'RE', 'REP', 'RS', 'S1', 'S2']
['07f93ac0-50e0-4c1d-b4da-898062a5b6b8.wgs.CaVEMan.raw', 'somatic', 'mutation.vcf.gz']
07f93ac0-50e0-4c1d-b4da-898062a5b6b8.wgs.CaVEMan.raw_somatic_mutation.vcf.gz 57346 20 173 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'ASMD', 'ASRD', 'CLPM', 'DP', 'DS', 'GP', 'MP', 'SG', 'SNP', 'SP', 'TG', 'TP']

In [14]:
output_path = "/home/pdutta/Data/Cancer_wiseGDC/Data/Brain/Generated_files/Intersected_Data"
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [15]:
df_statistics.to_csv(output_path+"/intersected_VCF_{}_statistics.tsv".format(non_coding_region), sep="\t", index=False)

In [16]:
with open(output_path+"/intersected_vcf_{}_data.pkl".format(non_coding_region), "wb") as file:
    pickle.dump(intersected_vcf_data, file)