In [1]:
import os, io
import pysam
import pandas as pd
from IPython.display import display, HTML

In [3]:
# Compressed VCF for sample pair TCGA-14-0786; parsed with vcf_to_dataframe in a later cell.
somatic_vcf_file_path = "/data/projects/GDC_Cancer_Wise/Brain/Data/VCF_Rekha/TCGA-14-0786/01B-01D-0703-09_10A-01D-0703-09.vcf.gz"

# NOTE: despite the .vcf extension, a later cell loads this file with pd.read_csv
# (comma-separated table), not with a VCF parser.
vcf_path = "/data/projects/GDC_Cancer_Wise/Brain/Data/VCF_Patientwise/TCGA-14-0786/TCGA-14-0786-01B-01D-0703-09_TCGA-14-0786-10A-01D-0703-09_Somatic.vcf"

In [4]:
def vcf_to_dataframe(vcf_path):
    """
    Load every record of a VCF file into a pandas DataFrame.

    Each variant record becomes one row. The columns are every INFO field
    declared in the file header (in header order), followed by the fixed
    fields CHROM, POS, ID, REF, ALT, QUAL and FILTER.

    Parameters:
    - vcf_path (str): path to a file that pysam.VariantFile can open
      (for example a .vcf.gz file)

    Returns:
    - pd.DataFrame: one row per record; empty (but with all columns) when the
      file contains no records

    Notes:
    - INFO fields come from record.info.get(key), so a field that is not set
      on a record falls back to that call's default.
    - ALT is the comma-joined list of alternate alleles, or "." when the
      record has no alternate allele.
    - FILTER keeps only the first filter name of a record and is 'PASS' when
      the record carries no filter entry.
    - The declared INFO field names are printed once as a side effect.
    - All rows are accumulated in memory before the DataFrame is built.
    """
    fixed_columns = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER"]

    # The context manager guarantees the handle is released even if parsing
    # a record raises part-way through the file.
    with pysam.VariantFile(vcf_path) as vcf_file:
        # The header is constant for the whole file: read the INFO keys once
        # instead of once per record.
        info_keys = list(vcf_file.header.info.keys())
        print(info_keys)

        data = []
        for record in vcf_file:
            row_data = [record.info.get(key) for key in info_keys]

            # record.alts is None when the ALT column is missing; joining
            # over None would raise TypeError, so fall back to ".".
            alts = record.alts
            alt_text = ','.join(str(alt) for alt in alts) if alts else '.'

            filter_names = record.filter.keys()
            filter_text = filter_names[0] if filter_names else 'PASS'

            basic_data = [record.chrom, record.pos, record.id, record.ref,
                          alt_text, record.qual, filter_text]
            data.append(row_data + basic_data)

    return pd.DataFrame(data, columns=info_keys + fixed_columns)

In [5]:
# Parse the compressed VCF into a DataFrame (INFO fields + fixed VCF columns),
# then show it as the cell output.
df_vcf = vcf_to_dataframe(vcf_path=somatic_vcf_file_path)
df_vcf

['INDEL', 'IDV', 'IMF', 'DP', 'VDB', 'RPBZ', 'MQBZ', 'BQBZ', 'MQSBZ', 'SCBZ', 'SGB', 'MQ0F', 'AC', 'AN', 'DP4', 'MQ']


[E::idx_find_and_load] Could not retrieve index file for '/data/projects/GDC_Cancer_Wise/Brain/Data/VCF_Rekha/TCGA-14-0786/01B-01D-0703-09_10A-01D-0703-09.vcf.gz'


Unnamed: 0,INDEL,IDV,IMF,DP,VDB,RPBZ,MQBZ,BQBZ,MQSBZ,SCBZ,...,AN,DP4,MQ,CHROM,POS,ID,REF,ALT,QUAL,FILTER
0,False,,,82,0.094314,-1.714980,-1.292790,1.204260,-0.370474,-1.219310,...,4,"(28, 24, 14, 5)",11,chr1,10250,,A,C,5.672510,PASS
1,False,,,58,0.477805,-0.845673,4.213320,-0.742717,-1.312320,-0.992469,...,4,"(6, 24, 15, 10)",16,chr1,14464,,A,T,169.957001,PASS
2,False,,,57,0.920926,0.920103,-0.218867,-0.888587,-2.431040,1.552030,...,4,"(9, 8, 20, 18)",2,chr1,14907,,A,G,5.988420,PASS
3,False,,,33,0.076817,,,,2.234720,,...,4,"(0, 0, 17, 15)",1,chr1,15274,,A,T,35.063900,PASS
4,False,,,79,0.053216,2.293360,-6.309440,0.723653,1.508530,-1.307000,...,4,"(14, 20, 23, 20)",23,chr1,16298,,C,T,111.367996,PASS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4717988,False,,,495,0.897021,-0.319529,-0.241483,1.470160,0.241483,0.250215,...,4,"(2, 0, 491, 2)",59,chrM,12061,,C,T,486.040009,PASS
4717989,False,,,489,0.614580,-0.460949,7.287700,1.801090,0.287433,0.489889,...,4,"(3, 0, 471, 13)",59,chrM,13680,,C,T,486.035004,PASS
4717990,False,,,495,0.864391,0.924084,-0.121905,1.054770,0.300184,0.214293,...,4,"(1, 0, 471, 6)",59,chrM,14770,,C,T,486.052002,PASS
4717991,False,,,489,0.960666,-1.174190,-0.423103,3.328810,-3.225260,-0.560337,...,4,"(6, 1, 469, 11)",59,chrM,14872,,C,T,486.022003,PASS


In [None]:
# Inspect the QUAL column of the parsed VCF table.
df_vcf['QUAL']

In [6]:
# List the column labels of df_vcf (INFO fields first, then the fixed VCF fields).
df_vcf.columns

Index(['INDEL', 'IDV', 'IMF', 'DP', 'VDB', 'RPBZ', 'MQBZ', 'BQBZ', 'MQSBZ',
       'SCBZ', 'SGB', 'MQ0F', 'AC', 'AN', 'DP4', 'MQ', 'CHROM', 'POS', 'ID',
       'REF', 'ALT', 'QUAL', 'FILTER'],
      dtype='object')

In [7]:
# Load the patient-wise somatic table. The file is comma-separated text, so it
# is read with read_csv (whose default separator is ",") rather than a VCF parser.
df = pd.read_csv(vcf_path)
df

Unnamed: 0,INDEL_x,IDV_x,IMF_x,DP_x,VDB_x,RPBZ_x,MQBZ_x,BQBZ_x,MQSBZ_x,SCBZ_x,...,SCBZ_y,SGB_y,MQ0F_y,AC_y,AN_y,DP4_y,MQ_y,ID_y,QUAL_y,FILTER_y
0,False,,,40,3.722820e-02,-2.588590,-0.764502,1.034460,0.522024,-0.750471,...,,,,,,,,,,
1,False,,,19,6.322260e-02,,,,0.948683,,...,,,,,,,,,,
2,False,,,46,6.129770e-01,-0.530611,0.403622,1.775460,3.228980,0.874257,...,,,,,,,,,,
3,False,,,78,8.349650e-01,2.056940,1.476550,-1.950210,-0.351285,0.064896,...,,,,,,,,,,
4,False,,,88,2.110520e-02,0.657787,0.072712,-2.575560,0.681516,-1.105410,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303167,False,,,120,5.948950e-01,0.105077,0.582899,-2.109290,-6.633370,-0.909413,...,,,,,,,,,,
303168,False,,,123,4.592270e-02,1.288420,0.333511,-3.314710,-4.746600,0.000000,...,,,,,,,,,,
303169,False,,,160,9.916170e-01,0.077941,0.312257,-4.489230,-2.165100,-0.796856,...,,,,,,,,,,
303170,False,,,51,1.733240e-07,-0.533192,-2.948790,0.344414,-0.300534,-0.696311,...,,,,,,,,,,


In [8]:
# Variants present in both tables, matched on chromosome, position and alleles.
df_vcf.merge(
    df,
    how='inner',
    on=["CHROM", "POS", "REF", "ALT"],
)

Unnamed: 0,INDEL,IDV,IMF,DP,VDB,RPBZ,MQBZ,BQBZ,MQSBZ,SCBZ,...,SCBZ_y,SGB_y,MQ0F_y,AC_y,AN_y,DP4_y,MQ_y,ID_y,QUAL_y,FILTER_y
0,False,,,82,0.094314,-1.714980,-1.292790,1.204260,-0.370474,-1.219310,...,,,,,,,,,,
1,False,,,33,0.076817,,,,2.234720,,...,,,,,,,,,,
2,False,,,69,0.254260,-0.332758,0.727330,0.943119,3.742090,1.549550,...,,,,,,,,,,
3,False,,,147,0.866468,3.118080,1.264280,-0.762612,-1.764390,0.137496,...,,,,,,,,,,
4,False,,,158,0.000109,0.506953,-0.741214,-3.495510,0.068589,-1.795880,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252344,False,,,254,0.742815,-0.081939,1.895420,-2.356120,-9.857120,-1.017420,...,,,,,,,,,,
252345,False,,,286,0.030945,1.604800,-0.391697,-3.874740,-9.481900,-0.426401,...,,,,,,,,,,
252346,False,,,366,0.977810,-1.209800,1.538590,-4.073970,-4.742520,-0.940125,...,,,,,,,,,,
252347,False,,,89,0.000012,-1.392650,-4.462530,-0.389306,-1.128780,-0.536543,...,,,,,,,,,,


In [9]:
# Left-excluding merge: keep the rows of df_vcf that have no match in df.
# indicator=True adds a '_merge' column naming the side each row came from;
# it is used for the filter and then dropped from the displayed result.
df_left_excluding = df_vcf.merge(
    df,
    how='left',
    on=["CHROM", "POS", "REF", "ALT"],
    indicator=True,
)
df_left_excluding.loc[df_left_excluding['_merge'] == 'left_only'].drop(columns=['_merge'])

Unnamed: 0,INDEL,IDV,IMF,DP,VDB,RPBZ,MQBZ,BQBZ,MQSBZ,SCBZ,...,SCBZ_y,SGB_y,MQ0F_y,AC_y,AN_y,DP4_y,MQ_y,ID_y,QUAL_y,FILTER_y
1,False,,,58,0.477805,-0.845673,4.213320,-0.742717,-1.312320,-0.992469,...,,,,,,,,,,
2,False,,,57,0.920926,0.920103,-0.218867,-0.888587,-2.431040,1.552030,...,,,,,,,,,,
4,False,,,79,0.053216,2.293360,-6.309440,0.723653,1.508530,-1.307000,...,,,,,,,,,,
5,False,,,100,0.089098,0.146780,1.578510,-5.634650,2.335830,1.683250,...,,,,,,,,,,
6,False,,,53,0.003876,2.347120,-0.762124,2.460920,2.935640,0.133136,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4717988,False,,,495,0.897021,-0.319529,-0.241483,1.470160,0.241483,0.250215,...,,,,,,,,,,
4717989,False,,,489,0.614580,-0.460949,7.287700,1.801090,0.287433,0.489889,...,,,,,,,,,,
4717990,False,,,495,0.864391,0.924084,-0.121905,1.054770,0.300184,0.214293,...,,,,,,,,,,
4717991,False,,,489,0.960666,-1.174190,-0.423103,3.328810,-3.225260,-0.560337,...,,,,,,,,,,


In [10]:
# Right-excluding merge: keep the rows of df that have no match in df_vcf.
# indicator=True adds a '_merge' column naming the side each row came from;
# it is used for the filter and then dropped from the displayed result.
df_right_excluding = df_vcf.merge(
    df,
    how='right',
    on=["CHROM", "POS", "REF", "ALT"],
    indicator=True,
)
df_right_excluding.loc[df_right_excluding['_merge'] == 'right_only'].drop(columns=['_merge'])

Unnamed: 0,INDEL,IDV,IMF,DP,VDB,RPBZ,MQBZ,BQBZ,MQSBZ,SCBZ,...,SCBZ_y,SGB_y,MQ0F_y,AC_y,AN_y,DP4_y,MQ_y,ID_y,QUAL_y,FILTER_y
78,,,,,,,,,,,...,,,,,,,,,,
121,,,,,,,,,,,...,,,,,,,,,,
141,,,,,,,,,,,...,,,,,,,,,,
153,,,,,,,,,,,...,,,,,,,,,,
168,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303125,,,,,,,,,,,...,,,,,,,,,,
303129,,,,,,,,,,,...,,,,,,,,,,
303134,,,,,,,,,,,...,,,,,,,,,,
303149,,,,,,,,,,,...,,,,,,,,,,
