In [1]:
import io
import os
import pandas as pd

In [2]:
def read_vcf(path):
    """Load a VCF file into a pandas DataFrame.

    Meta-information lines (those starting with ``##``) are discarded. What
    remains -- the ``#CHROM ...`` column header line followed by the variant
    records -- is parsed as tab-separated text.

    Parameters
    ----------
    path : str or path-like
        Location of the VCF file. A path whose name ends in ``.gz`` or
        ``.bgz`` (case-insensitive) is opened as gzip-compressed text; any
        other path is opened as plain text.

    Returns
    -------
    pandas.DataFrame
        One row per variant record. The ``#CHROM`` column is renamed to
        ``CHROM``. ``POS`` is parsed as ``int``; ``CHROM``, ``ID``, ``REF``,
        ``ALT``, ``QUAL``, ``FILTER`` and ``INFO`` are kept as strings. Any
        further columns (``FORMAT`` and the per-sample columns) get the dtype
        pandas infers.
    """
    import gzip  # local import: only needed when the input is compressed

    # str() keeps this check safe for any argument open() accepts; anything
    # that does not look like a gzip file name falls back to plain open().
    is_compressed = str(path).lower().endswith(('.gz', '.bgz'))
    opener = gzip.open if is_compressed else open

    # 'rt' is required for gzip.open (its default mode is binary) and is
    # equivalent to 'r' for the built-in open().
    with opener(path, 'rt') as f:
        # Keep the single-'#' header line; drop only the '##' meta lines.
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

In [3]:
# Location of the APOE-region VCF. The default is the original hard-coded
# absolute path; set the APOE_VCF_PATH environment variable to point the
# notebook at a copy stored elsewhere without editing this cell.
file_path = os.environ.get(
    "APOE_VCF_PATH",
    "/data/private/pdutta/Collab_data/Oliver_data/APOE_TREM2/APOE_regions.vcf",
)

In [4]:
# Parse the VCF at `file_path` into a DataFrame via read_vcf (drops the '##'
# meta lines and renames '#CHROM' to 'CHROM').
df_test = read_vcf(file_path)

In [5]:
# Preview only the first ten records instead of rendering the whole frame;
# enough for a sanity check of the parsed columns.
df_test.head(10)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,A-ACT-AC000007-BL-UPN-6888,...,C-RS-52102-BL-ERA-4891001,C-RS-52104-BL-ERA-6529001,C-RS-52105-BL-ERA-6937002,C-RS-52106-BL-ERA-8969001,C-RS-52107-BL-ERA-5794002,C-RS-52108-BL-ERA-6739002,C-RS-52109-BL-ERA-1267001,C-RS-52110-BL-ERA-2326001,C-RS-52111-BL-ERA-3023001,C-RS-52112-BL-ERA-6441002
0,19,44905879,rs373985746,G,A,1767.01,PASS,AC=1;AF=2.418e-05;AN=18892;DP=687498;VQSLOD=13...,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,19,44905881,.,G,T,620.07,PASS,AC=1;AF=7.257e-05;AN=18886;DP=685290;VQSLOD=10...,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,19,44905886,.,T,C,946.05,PASS,AC=2;AF=4.838e-05;AN=18890;DP=679344;VQSLOD=9....,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,19,44905910,rs440446,C,G,21246500.0,PASS,AC=12205;AF=0.701;AN=18874;DP=921148;VQSLOD=9....,GT,1/1,...,0/1,1/1,1/1,1/1,1/1,1/1,1/1,1/1,0/1,1/1
4,19,44905923,.,G,A,16453.2,PASS,AC=27;AF=0.0008942;AN=18838;DP=615339;VQSLOD=5...,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
5,19,44906639,.,G,A,770.55,PASS,AC=2;AF=4.742e-05;AN=18744;DP=760698;VQSLOD=10...,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
6,19,44906646,.,T,C,115.77,PASS,AC=1;AF=2.371e-05;AN=18718;DP=758793;VQSLOD=9....,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
7,19,44907785,rs111833428,G,A,23451.3,PASS,AC=12;AF=0.000498;AN=18076;DP=643034;VQSLOD=11...,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
8,19,44907788,.,G,A,277.79,PASS,AC=1;AF=4.743e-05;AN=18190;DP=645667;VQSLOD=10...,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
9,19,44907934,.,A,G,804.77,PASS,AC=1;AF=2.371e-05;AN=18898;DP=719825;VQSLOD=8....,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0


## Description of the INFO column
* "AC" => Allele Count: number of times each ALT allele is observed in the called genotypes
* "AF" => Allele Frequency for each ALT allele
* "AN" => Allele Number: total number of alleles in the called genotypes
* "DP" => Depth: combined read depth across samples at this site
* "VQSLOD" => Variant Quality Score Log-Odds. It is a score produced by a variant recalibration process (such as the one implemented in the Genome Analysis Toolkit, or GATK) 
    and estimates how likely the variant is to be real rather than a sequencing artifact; higher values indicate greater confidence.

In [12]:
# Take the INFO string of the record at positional row 2 and break it into
# its ';'-separated entries.
df_test['INFO'].iloc[2].split(';')

['AC=2',
 'AF=4.838e-05',
 'AN=18890',
 'DP=679344',
 'VQSLOD=9.08',
 'EA=.',
 'Ensembl_proteinid=.',
 'Consequence=intron_variant',
 'SYMBOL=APOE',
 'BIOTYPE=protein_coding',
 'EXON=.',
 'HGVSp=.',
 'ENSP=ENSP00000252486',
 'LoF=.',
 'CSQ=intron_variant|APOE|protein_coding|||||ENSP00000252486|||-50|38|37|-45|0.00|0.00|0.15|0.04|APOE,downstream_gene_variant|TOMM40|protein_coding|||||ENSP00000252487|||-50|38|37|-45|0.00|0.00|0.15|0.04|APOE,downstream_gene_variant|TOMM40|protein_coding|||||ENSP00000385184|||-50|38|37|-45|0.00|0.00|0.15|0.04|APOE,upstream_gene_variant|APOE|protein_coding|||||ENSP00000410423|||-50|38|37|-45|0.00|0.00|0.15|0.04|APOE,downstream_gene_variant|TOMM40|protein_coding|||||ENSP00000410339|||-50|38|37|-45|0.00|0.00|0.15|0.04|APOE,synonymous_variant|APOE|protein_coding|1/4|ENSP00000413653.2:p.Ser6%3D|6/269|S|ENSP00000413653|tcT/tcC||-50|38|37|-45|0.00|0.00|0.15|0.04|APOE,intron_variant|APOE|protein_coding|||||ENSP00000413135|||-50|38|37|-45|0.00|0.00|0.15|0.04|APOE,i

In [7]:
# List every column name: the fixed VCF fields followed by the sample IDs.
list(df_test.columns)

['CHROM',
 'POS',
 'ID',
 'REF',
 'ALT',
 'QUAL',
 'FILTER',
 'INFO',
 'FORMAT',
 'A-ACT-AC000007-BL-UPN-6888',
 'A-ACT-AC000008-BL-UPN-8307',
 'A-ACT-AC000010-BL-UWA-15286',
 'A-ACT-AC000014-BL-UPN-23967',
 'A-ACT-AC000016-BL-UPN-7273',
 'A-ACT-AC000020-BL-UWA-13783',
 'A-ACT-AC000022-BL-UWA-15061',
 'A-ACT-AC000023-BL-UPN-5880',
 'A-ACT-AC000025-BL-UPN-10505',
 'A-ACT-AC000027-BL-UPN-23154',
 'A-ACT-AC000032-BL-UPN-23447',
 'A-ACT-AC000034-BL-UPN-15865',
 'A-ACT-AC000037-BL-UPN-7487',
 'A-ACT-AC000038-BL-UWA-32410',
 'A-ACT-AC000039-BL-UWA-14822',
 'A-ACT-AC000040-BL-UPN-7990',
 'A-ACT-AC000041-BL-UWA-14269',
 'A-ACT-AC000043-BL-UPN-11291',
 'A-ACT-AC000044-BL-UPN-11405',
 'A-ACT-AC000045-BL-UPN-26144',
 'A-ACT-AC000046-BL-UPN-14126',
 'A-ACT-AC000047-BL-UPN-10604',
 'A-ACT-AC000055-BL-UPN-14910',
 'A-ACT-AC000056-BL-UPN-15169',
 'A-ACT-AC000057-BL-UPN-14863',
 'A-ACT-AC000058-BL-UPN-14011',
 'A-ACT-AC000064-BL-UWA-14740',
 'A-ACT-AC000067-BL-UPN-11551',
 'A-ACT-AC000071-BL-UWA-14864