In [1]:
import os, io
import pysam
import pandas as pd
from IPython.display import display, HTML

In [2]:
parent_folder_path  = "/data/projects/VCF_files/BRCA/Download_files"

In [3]:
def vcf_to_dataframe(vcf_path):
    """
    Convert a .vcf.gz file into a pandas DataFrame.

    Parameters:
    - vcf_path (str): path to the .vcf.gz file

    Returns:
    - pd.DataFrame: VCF data as a DataFrame
    """

    # Open the VCF file
    vcf_file = pysam.VariantFile(vcf_path)

    # Extracting the data and the columns
    data = []
    print(vcf_file.header.info.keys())
    columns = list(vcf_file.header.info.keys()) + ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER"]
    for record in vcf_file:
        row_data = [record.info.get(key) for key in vcf_file.header.info.keys()]
        basic_data = [record.chrom, record.pos, record.id, record.ref, 
                      ','.join(str(alt) for alt in record.alts), record.qual, record.filter.keys()[0] if record.filter.keys() else 'PASS']
        data.append(row_data + basic_data)

    df = pd.DataFrame(data, columns=columns)

    # Close the VCF file
    vcf_file.close()

    return df

In [4]:
## Wrote this code to get .tbi file also

def get_files_except_logs(root_folder):
    all_files = []

    # Walk through the directory tree
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # If "logs" is in dirnames, remove it to avoid traversing it
        if 'logs' in dirnames:
            dirnames.remove('logs')

        # Add all the filenames in the current directory to the all_files list
        for filename in filenames:
            all_files.append(os.path.join(dirpath, filename))

    return all_files

In [5]:
def get_vcf_gz_files_except_logs(root_folder):
    all_files = []

    # Walk through the directory tree
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # If "logs" is in dirnames, remove it to avoid traversing it
        if 'logs' in dirnames:
            dirnames.remove('logs')

        # Add only the filenames with the extension .vcf.gz in the current directory to the all_files list
        for filename in filenames:
            if filename.endswith('.vcf.gz'):
                all_files.append(os.path.join(dirpath, filename))

    return all_files

In [6]:
files = get_vcf_gz_files_except_logs(parent_folder_path)
for file in files:
    print(file)
    if file.endswith('.tbi'):
        index = pysam.TabixFile(filename=file[:-4])
        # Print the header of the TBI file
        #print("\n".join(index.header))
        index.close()
    elif file.endswith('.vcf.gz'):
        df_vcf = vcf_to_dataframe(file)
        display(df_vcf)
    input()

/data/projects/VCF_files/BRCA/Download_files/a2843045-c672-42a8-8860-3defb53d9ded/c5bbc8ae-3efe-43a7-9613-ce7f967b34a7.wxs.mutect2.raw_somatic_mutation.vcf.gz
['DB', 'ECNT', 'HCNT', 'MAX_ED', 'MIN_ED', 'NLOD', 'PON', 'RPA', 'RU', 'STR', 'TLOD']


Unnamed: 0,DB,ECNT,HCNT,MAX_ED,MIN_ED,NLOD,PON,RPA,RU,STR,TLOD,CHROM,POS,ID,REF,ALT,QUAL,FILTER
0,False,1,1,,,23.95,,,,False,13.03,chr1,17626,,G,A,,alt_allele_in_normal
1,True,2,1,20.0,20.0,13.20,,,,False,12.02,chr1,139213,rs370723703,A,G,,clustered_events
2,True,2,1,20.0,20.0,14.40,,,,False,11.99,chr1,139233,rs373847457,C,A,,clustered_events
3,False,1,1,,,0.857,,,,False,5.22,chr1,184267,,C,T,,germline_risk
4,False,1,2,,,1.50,,,,False,15.28,chr1,184461,,G,A,,germline_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36543,False,1,4,,,0.903,,,,False,4.43,chrY,56883091,,C,T,,germline_risk
36544,False,1,14,,,8.73,,,,False,254.11,chrM,139,,T,C,,PASS
36545,True,1,14,,,102.06,,,,False,30.19,chrM,2706,rs2854128,A,G,,alt_allele_in_normal
36546,True,1,22,,,4.51,,,,False,56.97,chrM,4216,rs1599988,T,C,,germline_risk


 


/data/projects/VCF_files/BRCA/Download_files/703c0baf-8710-4619-ba28-110a7e327c89/3b280ec8-ad7d-4101-83cf-019e7b832b05.wxs.muse.raw_somatic_mutation.vcf.gz
['SOMATIC']


Unnamed: 0,SOMATIC,CHROM,POS,ID,REF,ALT,QUAL,FILTER
0,True,chr1,17538,rs200046632,C,A,,Tier1
1,True,chr1,16036547,rs2008876,C,T,,Tier1
2,True,chr1,16645843,rs60980767,T,C,,Tier1
3,True,chr1,16720336,rs9661747,A,G,,Tier3
4,True,chr1,17632375,,G,T,,PASS
...,...,...,...,...,...,...,...,...
468,True,chrX,92618914,,C,A,,Tier5
469,True,chrX,101407938,,G,A,,PASS
470,True,chrX,155900676,,A,G,,PASS
471,True,chrY,11327670,,T,C,,Tier1


 


/data/projects/VCF_files/BRCA/Download_files/745d1c6d-aaf8-4190-9401-08c9dbe03ab1/ff2f1e9e-cb3c-4043-8076-7eb19be479c3.wxs.pindel.raw_somatic_mutation.vcf.gz
['END', 'HOMLEN', 'HOMSEQ', 'NTLEN', 'OLD_VARIANT', 'PF', 'SVLEN', 'TYPEOFSV']


[E::bcf_hdr_parse_line] Could not parse the header line: "##GATKCommandLine.VariantFiltration=<ID=VariantFiltration,Version=nightly-2016-02-25-gf39d340,Date=\"Wed Jun 02 05:40:36 UTC 2021\",Epoch=1622612436608,CommandLineOptions=\"analysis_type=VariantFiltration input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] "...


Unnamed: 0,END,HOMLEN,HOMSEQ,NTLEN,OLD_VARIANT,PF,SVLEN,TYPEOFSV,CHROM,POS,ID,REF,ALT,QUAL,FILTER
0,,"(4,)","(CCTC,)",,,,"(-3,)",DEL,chr1,30581,,GCCT,G,,PASS
1,,"(0,)",,"(41,)",(chr1:11675698:TCTCTCGTGTGCACCAGGACTGTGAAGGTAC...,,"(-40,)",RPL,chr1,11675699,,CTCTCGTGTGCACCAGGACTGTGAAGGTACAGCCTGGAGA,TTCCTCTTTGTCCCCCGGCTGTGGAAGGTCAACCCGGGGAG,,TALTDP
2,,"(0,)",,"(5,)","(chr1:24058954:CAGACC/CCAACA,)",,"(-5,)",RPL,chr1,24058955,,AGACC,CAACA,,TALTDP
3,,"(0,)",,"(4,)","(chr1:25771553:CGTAC/CCGCA,)",,"(-4,)",RPL,chr1,25771554,,GTAC,CGCA,,TALTDP
4,,"(0,)",,"(10,)","(chr1:26555624:CGGTAAGCAGC/CCGGTAAGAAT,)",,"(-10,)",RPL,chr1,26555625,,GGTAAGCAGC,CGGTAAGAAT,,TALTDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,,"(0,)",,"(52,)",(chrX:75326860:TTATAATGCAATAGTATTTTAGACCAGTTTC...,,"(-53,)",RPL,chrX,75326861,,TATAATGCAATAGTATTTTAGACCAGTTTCCCTCTCCATGTTTCCT...,ATAATGCAATAGTATTTTAGACCAGTTTCCCTCTCCATGTTTCCTA...,,TALTDP
190,,"(2,)","(CA,)",,,,"(-2,)",DEL,chrX,91983244,,CCA,C,,PASS
191,,"(0,)",,"(64,)",(chrX:120276826:GCCTTCCCCCTATGTCCAACAGGGGCCACT...,,"(-64,)",RPL,chrX,120276827,,CCTTCCCCCTATGTCCAACAGGGGCCACTGTGAAATGCAAAGTCAA...,TATTCCCCCTATGTCCAACAGGGGCCACTGTGAAATACAAAGCCAA...,,TALTDP
192,,"(0,)",,"(5,)","(chrX:152918965:GCCTCT/GACCAA,)",,"(-5,)",RPL,chrX,152918966,,CCTCT,ACCAA,,TALTDP


KeyboardInterrupt: Interrupted by user