In [17]:
import os, io
import pandas as pd
import pysam
import pickle
import pybedtools
print(pybedtools.__file__)
pybedtools.helpers.set_tempdir('/home/pdutta/temp')
from IPython.display import display, HTML

/home/pdutta/anaconda3/envs/GDC_VCF/lib/python3.10/site-packages/pybedtools/__init__.py


In [18]:
cancer_type="Breast" 

In [19]:
vcf_folder_path="/home/pdutta/Data/Cancer_wiseGDC/New_data/{}/Downloaded_files/VCF".format(cancer_type)
reference_genome_path="/home/pdutta/Data/Human_Genome_Data/GRCh38_latest_genomic.fna"

In [20]:
non_coding_region="core_prom"

In [21]:
df= pd.read_csv("/home/pdutta/Data/Noncoding_region/{}_bed.tsv".format(non_coding_region), sep="\t")
columns = ['Chr', 'start', 'end', 'gene_id', 'transcript_id', 'exon_id', 'strand', 'TSS']
df = df[columns]
# Renaming columns
df.rename(columns={'Chr': 'chr_name'}, inplace=True)
df

Unnamed: 0,chr_name,start,end,gene_id,transcript_id,exon_id,strand,TSS
0,chr7,127588366,127588455,ENSG00000004059.11,ENST00000000233.10,ENSE00001872691.2,+,127588411
1,chr7,117479980,117480069,ENSG00000001626.16,ENST00000003084.11,ENSE00001343851.2,+,117480025
2,chr7,92134433,92134522,ENSG00000001630.17,ENST00000003100.13,ENSE00001216550.10,-,92134477
3,chr7,150800724,150800813,ENSG00000002933.9,ENST00000004103.8,ENSE00001827596.2,+,150800769
4,chr7,95596472,95596561,ENSG00000004799.8,ENST00000005178.6,ENSE00001133344.6,-,95596516
...,...,...,...,...,...,...,...,...
236970,chr15,80152965,80153054,ENSG00000103876.14,ENST00000684569.1,ENSE00003921718.1,+,80153010
236971,chr15,72375945,72376034,ENSG00000213614.11,ENST00000684602.1,ENSE00002620879.1,-,72375989
236972,chr15,72375928,72376017,ENSG00000213614.11,ENST00000684667.1,ENSE00002584031.1,-,72375972
236973,chr15,44663644,44663733,ENSG00000104133.16,ENST00000684676.1,ENSE00003922470.1,-,44663688


In [22]:
core_prom_bed = pybedtools.BedTool.from_dataframe(df)

In [23]:
reference_fasta = pysam.FastaFile(reference_genome_path)

In [24]:
def get_vcf_gz_files_except_logs(root_folder):
    all_files = []

    # Walk through the directory tree
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # If "logs" is in dirnames, remove it to avoid traversing it
        if 'logs' in dirnames:
            dirnames.remove('logs')

        # Add only the filenames with the extension .vcf.gz in the current directory to the all_files list
        for filename in filenames:
            if filename.endswith('.vcf.gz'):
                all_files.append(os.path.join(dirpath, filename))
    return all_files

In [25]:
def vcf_to_dataframe(vcf_path):
    """
    Convert a .vcf.gz file into a pandas DataFrame.

    Parameters:
    - vcf_path (str): path to the .vcf.gz file

    Returns:
    - pd.DataFrame: VCF data as a DataFrame
    """

    try:
        # Open the VCF file
        vcf_file = pysam.VariantFile(vcf_path)
    except ValueError as e:
        print(f"Failed to open file {vcf_path}: {e}")
        with open("/data/projects/GDC_Cancer_Wise/Brain/Data/"+vcf_path.split("/")[-2]+"_error_log.txt", "a") as error_log:
            error_log.write(f"{vcf_path}\n")
        return None

    # Extracting the data and the columns
    data = []
    
    columns =  ["CHROM", "START_POS", "ID", "REF", "ALT", "QUAL", "FILTER"] + list(vcf_file.header.info.keys()) 
    for record in vcf_file:
        basic_data = [record.chrom, record.pos, record.id, record.ref, 
                      ','.join(str(alt) for alt in record.alts), record.qual, record.filter.keys()[0] if record.filter.keys() else 'PASS']
        row_data = [record.info.get(key) for key in vcf_file.header.info.keys()]
        data.append(basic_data + row_data)

    df = pd.DataFrame(data, columns=columns)
    df["START_POS"] = df["START_POS"]-1
    end = df["START_POS"] + df['REF'].str.len()
    df.insert(2, 'END_POS', end)

    # Close the VCF file
    vcf_file.close()

    return df

In [26]:
files = get_vcf_gz_files_except_logs(vcf_folder_path)

In [27]:
len(files)

228

In [28]:
intersected_vcf_data = {}
df_statistics  = pd.DataFrame(columns= ["filename", "Patient_ID", 'work_flow',  'VCF_instance','VCF_feature','Intersected_instances', 'VCF_column_names' ])

In [29]:
for file_path in files:
    file_name =  file_path.split('/')[-1]
    parts = file_name.split('.')
    print(parts)
    df_vcf = vcf_to_dataframe(file_path)
    #print(df_vcf)
    if df_vcf is None:
        continue
    vcf_bed = pybedtools.BedTool.from_dataframe(df_vcf)
    intersect_vcf_core_prom = core_prom_bed.intersect(vcf_bed, wa=True, wb=True)
    column_list  = df.columns.to_list() + df_vcf.columns.to_list()
    df_intersection = intersect_vcf_core_prom.to_dataframe(names=column_list)
    df_intersection = df_intersection[(df_intersection['REF'].str.len() < 10) & (df_intersection['ALT'].str.len() < 10)]
    #print(df_intersection)
    #print(df_statistics)
    #input()
    #gdc_tag = parts[-4]
    patient_ID = parts[0]
    work_flow = parts[2]
    instance =  df_vcf.shape[0]
    feature = df_vcf.shape[1]
    intersected_instance = df_intersection.shape[0]
    feature_list = list(df_vcf.columns)
    print(patient_ID, work_flow, instance, feature,intersected_instance,  feature_list)
    df_statistics.loc[len(df_statistics)] = [file_name, patient_ID, work_flow, instance, feature,intersected_instance,  feature_list]
    intersected_vcf_data[patient_ID+"_"+work_flow] = df_intersection

['460f332f-38b4-4c94-997f-a79c230aa8e1', 'wgs', 'CaVEMan', 'raw_somatic_mutation', 'vcf', 'gz']
460f332f-38b4-4c94-997f-a79c230aa8e1 CaVEMan 276752 20 2858 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'ASMD', 'ASRD', 'CLPM', 'DP', 'DS', 'GP', 'MP', 'SG', 'SNP', 'SP', 'TG', 'TP']
['a7c26dde-f8c4-4534-9c04-a90203311791', 'wgs', 'sanger_raw_pindel', 'raw_somatic_mutation', 'vcf', 'gz']
a7c26dde-f8c4-4534-9c04-a90203311791 sanger_raw_pindel 1100363 17 5132 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'FF017', 'LEN', 'OLD_VARIANT', 'PC', 'RE', 'REP', 'RS', 'S1', 'S2']
['3e85698b-e8b1-4170-9340-c5a1e4587229', 'wgs', 'sanger_raw_pindel', 'raw_somatic_mutation', 'vcf', 'gz']
3e85698b-e8b1-4170-9340-c5a1e4587229 sanger_raw_pindel 1060575 17 4992 ['CHROM', 'START_POS', 'END_POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'FF017', 'LEN', 'OLD_VARIANT', 'PC', 'RE', 'REP', 'RS', 'S1', 'S2']
['6a556615-d80f-4b48-94af-a6f25089b40c', 'wgs', 'sanger_raw

In [30]:
output_path = "/home/pdutta/Data/Cancer_wiseGDC/New_data/{}/Generated_files/Intersected_Data".format(cancer_type)
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [31]:
df_statistics.to_csv(output_path+"/intersected_VCF_{}_statistics.tsv".format(non_coding_region), sep="\t", index=False)

In [32]:
with open(output_path+"/intersected_vcf_{}_data.pkl".format(non_coding_region), "wb") as file:
    pickle.dump(intersected_vcf_data, file)