In [2]:
# VCF reader: helpers for loading an optionally gzipped VCF file into a pandas DataFrame

In [14]:
from collections import OrderedDict
import gzip
import pandas as pd

In [15]:
def _count_comments(filename):
    """Count comment lines (those that start with "#") in an optionally
    gzipped file.
    :param filename:  An optionally gzipped file.
    """
    #print('in _count_comments')
    comments = 0
    fn_open = gzip.open if filename.endswith('.gz') else open
    with fn_open(filename) as fh:
        for line in fh:
            if line.startswith('#'):
                comments += 1
            else:
                break
    return comments

In [18]:
# Column names that my_dataframe assigns to the first ten columns it reads.
VCF_HEADER = [
    'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
    'FORMAT',
    # NOTE(review): presumably the sample name of the file being analysed;
    # confirm before reusing this header with a different VCF.
    '20',
]


def my_dataframe(filename, large=True):
    """Load an optionally gzipped VCF file into a pandas.DataFrame.

    Two modes are available:

    * large=True (default): the leading "#" lines are skipped and the first
      ten columns are read in a single pass, named after VCF_HEADER. The
      INFO column is left as one unsplit column.
    * large=False: the file is walked record by record through lines() and
      every key found in a record becomes its own column, with None stored
      wherever a record has no value for that key.

    Note: avoid large=False with large VCF files; it will be painfully slow.

    :param filename:    An optionally gzipped VCF file.
    :param large:       Use this with large VCF files to skip the ## lines and
                        leave the INFO fields unseparated as a single column.
    :return:            A pandas.DataFrame built from the file's records.
    """
    if not large:
        # NOTE(review): lines() is not defined in the cells visible here; it
        # is presumably a generator yielding one dict-like object per VCF
        # record. Confirm it is defined before using large=False.
        #
        # Maps column name -> list of values, one entry per record so far.
        columns = OrderedDict()
        for row_number, record in enumerate(lines(filename)):
            for column in record.keys():
                # First time this column shows up: back-fill None for every
                # record that came before it so all lists stay equal length.
                if column not in columns:
                    columns[column] = [None] * row_number
            # Every known column receives exactly one value for this record.
            for column in columns.keys():
                columns[column].append(record.get(column, None))
        return pd.DataFrame(columns)

    # Tell pandas explicitly whether the file is gzip-compressed.
    compression = None
    if filename.endswith('.gz'):
        compression = 'gzip'
    # Number of leading "#" lines that the parser has to skip.
    header_rows = _count_comments(filename)
    # Read a simple frame; the INFO column is not split.
    return pd.read_table(
        filename,
        compression=compression,
        skiprows=header_rows,
        names=VCF_HEADER,
        usecols=range(10),
    )

In [19]:
# Load the unfiltered VCF through the default (large=True) path and keep the
# frame under a name, so later cells do not have to rely on `_` / Out[N].
variants_raw = my_dataframe('./vcf/O15_B003776_unfilt.vcf')
# Preview only the first rows instead of rendering the whole frame.
variants_raw.head(10)



Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,20
0,chr1,14610,.,T,C,62.74,.,AC=2;AF=1.00;AN=2;DP=11;ExcessHet=3.0103;FS=0....,GT:AD:DP:GQ:PL,"1/1:0,2:2:6:90,6,0"
1,chr1,14640,.,C,T,56.74,.,AC=2;AF=1.00;AN=2;DP=38;ExcessHet=3.0103;FS=0....,GT:AD:DP:GQ:PL,"1/1:0,2:2:6:84,6,0"
2,chr1,629906,rs1578391,C,T,314.77,.,AC=2;AF=1.00;AN=2;DB;DP=253;ExcessHet=3.0103;F...,GT:AD:DP:GQ:PL,"1/1:0,10:10:30:343,30,0"
3,chr1,630026,rs6594029,C,T,60.94,.,AC=2;AF=1.00;AN=2;DB;DP=70;ExcessHet=3.0103;FS...,GT:AD:DP:GQ:PL,"1/1:0,2:2:6:81,6,0"
4,chr1,630161,rs6594031,A,G,56.74,.,AC=2;AF=1.00;AN=2;DB;DP=5;ExcessHet=3.0103;FS=...,GT:AD:DP:GQ:PL,"1/1:0,2:2:6:84,6,0"
5,chr1,630211,rs7416152,C,T,180.90,.,AC=2;AF=1.00;AN=2;DB;DP=44;ExcessHet=3.0103;FS...,GT:AD:DP:GQ:PL,"1/1:0,5:5:15:209,15,0"
6,chr1,630317,rs3021087,A,G,305.78,.,AC=2;AF=1.00;AN=2;DB;DP=72;ExcessHet=3.0103;FS...,GT:AD:DP:GQ:PL,"1/1:0,8:8:24:334,24,0"
7,chr1,630490,.,T,C,127.03,.,AC=2;AF=1.00;AN=2;DP=21;ExcessHet=3.0103;FS=0....,GT:AD:DP:GQ:PL,"1/1:0,4:4:12:155,12,0"
8,chr1,630557,rs7417504,T,C,15.65,.,AC=2;AF=1.00;AN=2;DB;DP=2;ExcessHet=3.0103;FS=...,GT:AD:DP:GQ:PL,"1/1:0,1:1:3:42,3,0"
9,chr1,630596,.,C,T,15.65,.,AC=2;AF=1.00;AN=2;DP=3;ExcessHet=3.0103;FS=0.0...,GT:AD:DP:GQ:PL,"1/1:0,1:1:3:42,3,0"
