In [18]:
import requests, sys

import pandas as pd

from pyensembl import EnsemblRelease

# release 75 uses human reference genome GRCh37
ensembl_data = EnsemblRelease(75)

In [59]:
import vcf


def split_vals(d, key, list_delim=";"):
    """
    Given a dictionary and key within it, split values into multiple keys

    Parameters
    ----------
    d : `dict`
        Dictionary
    key : `str`
        Key within `d` to split
    list_delim : `str`, optional (default=";")
        String to delimit list values in `d[key]` with

    Returns
    -------
    d : `dict`
        Dictionary with split values
    """
    assert list_delim in d[key], "Key {} does not contain delimiter {}".format(key, list_delim)
    vals = d[key].split(list_delim)  # list of split values
    d[key] = vals[0]  # original key gets first value
    for i, val in enumerate(vals[1:]):
        # add key1, key2, etc. and corresponding values
        d["{}{}".format(val, i+1)] = val


def unpack_dict(d, list_delim=";", verbose=False, indent=""):
    """
    Given a dictionary, unpack values such that lists become singular.
    Intended to prep dictionaries for pandas compatibility.

    Parameters
    ----------
    d : `dict`
        Dictionary to unpack
    list_delim : `str`, optional (default=";")
        String to delimit list values in `d` with
    verbose : `bool`, optional (default=`False`)
        If `True`, print information when values are unpacked

    Returns
    -------
    d_unpacked : `dict`
        Dictionary with unpacked values
    """
    d_unpacked = d.copy()
    for key in d:
        if isinstance(d[key], list):
            # if key is a list, do some unpacking
            if len(d[key]) > 1:
                # if value is list, join by list_delim
                d_unpacked[key] = [str(elem) for elem in d_unpacked[key]]
                d_unpacked[key] = list_delim.join(d_unpacked[key])
                if verbose:
                    print("{}Joining key {} using delimiter {}".format(indent, key, list_delim))
            elif len(d_unpacked[key]) == 1:
                # if value is list with one element, unpack
                d_unpacked[key] = d_unpacked[key][0]
                if verbose:
                    print("{}Unpacked 1 value from key {}: {}".format(indent, key, d_unpacked[key]))
            else:
                # if empty list (for some reason), remove the key
                del d_unpacked[key]
        elif isinstance(d[key], dict):
            # if key is another dict, do full unpacking
            if verbose:
                print("{}Unpacking key {} as dictionary with {} keys".format(indent, key, len(d_unpacked[key])))
            # generate new keys from dict
            unpacked_key = unpack_dict(d_unpacked[key], list_delim=list_delim, verbose=verbose, indent="{}\t".format(indent))
            # delete original key-value pair
            del d_unpacked[key]
            # merge the two dicts
            d_unpacked = {**d_unpacked, **unpacked_key}
            if verbose:
                print("{}\tAdded {} keys to dict".format(indent, len(unpacked_key)))
        elif d[key] is None:
            # if empty value, just remove it
            del d_unpacked[key]

    return d_unpacked


def extract_call(call_obj, format_str="GT:GL:GOF:GQ:NR:NV", format_delim=":"):
    """
    Given a `vcf.model._Call` object, unpack values into dictionary

    Parameters
    ----------
    call_obj : `vcf.model._Call`
        PyVCF `model._Call` object to extract information from
    format_str : `str`, optional (default="GT:GL:GOF:GQ:NR:NV")
        String describing format of `vcf.model._Call` object, and keys present
    format_delim : `str`, optional (default=":")
        String to delimit list values in `format_str` with

    Returns
    -------
    call_dict : `dict`
        Dictionary with unpacked values from `call_obj`
    """
    assert isinstance(call_obj, vcf.model._Call), "call_obj must be a vcf.model._Call object"
    call_dict = {}  # initialize empty dictionary
    keys = format_str.split(format_delim)  # list of expected keys in call_obj
    for key in keys:
        call_dict[key] = getattr(call_obj.data, key)

    return call_dict



---

In [38]:
server = "https://grch37.rest.ensembl.org"  # ensembl queries
counter = 0  # count number of records in vcf
out_df = pd.DataFrame()  # initialize empty dataframe

vcf_read = vcf.Reader(open("test_vcf_data.txt", "r"))
for record in vcf_read:
    if counter < 10:
        print(record, end = "\n\n")
    if counter == 10:
        print(". . .", end = "\n\n")

    # unpack record into dictionary of depth 1
    record_unpacked = unpack_dict(record.__dict__)
    # unpack model._Call object
    call_dict = unpack_dict(
        extract_call(record_unpacked["samples"], format_str=record_unpacked["FORMAT"]),
    )
    # remove old keys
    del record_unpacked["samples"]
    del record_unpacked["FORMAT"]
    # merge dicts
    record_unpacked = {**record_unpacked, **call_dict}
    
    # add to out_df
    out_df = pd.concat([out_df, pd.DataFrame(record_unpacked, index=[counter])])
    
    counter += 1
    
print("\nDone! Total records: {}".format(counter))

Record(CHROM=1, POS=1158631, REF=A, ALT=[G])

Record(CHROM=1, POS=1246004, REF=A, ALT=[G])

Record(CHROM=1, POS=1249187, REF=G, ALT=[A])

Record(CHROM=1, POS=1261824, REF=G, ALT=[C])

Record(CHROM=1, POS=1387667, REF=C, ALT=[G])

Record(CHROM=1, POS=1585597, REF=A, ALT=[G])

Record(CHROM=1, POS=1585642, REF=G, ALT=[T])

Record(CHROM=1, POS=1586752, REF=T, ALT=[C])

Record(CHROM=1, POS=1647686, REF=A, ALT=[C])

Record(CHROM=1, POS=1647722, REF=GCTGTGACA, ALT=[TCTAGGATG])

. . .


Done! Total records: 11765


In [34]:
out_df.columns

Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'start', 'end', 'alleles',
       'affected_start', 'affected_end', 'BRF', 'FR', 'HP', 'HapScore', 'MGOF',
       'MMLQ', 'MQ', 'NF', 'NR', 'PP', 'QD', 'SC', 'SbPval', 'Source', 'TC',
       'TCF', 'TCR', 'TR', 'WE', 'WS', 'sample', 'GT', 'GL', 'GOF', 'GQ', 'NV',
       'FILTER'],
      dtype='object')

In [39]:
record.is_deletion

False

In [40]:
record.is_indel

False

In [41]:
record.is_snp

True

In [42]:
record.var_type

'snp'

In [44]:
record.heterozygosity

0.5

In [46]:
record.var_subtype

'ts'

In [None]:
# calculate VAF
out_df["VAF"] = out_df["NV"].astype(int) / out_df["NR"].astype(int)

In [None]:
out_df = out_df.fillna("")

In [None]:
out_df.loc[out_df.NV.contains(";"),:]

In [32]:
type(out_df.ALT[0])

vcf.model._Substitution

In [33]:
out_df.ALT[0].__dict__

{'type': 'SNV', 'sequence': 'G'}

---

In [2]:
server = "https://grch37.rest.ensembl.org"  # ensembl queries

# initialize output as dict
out = {
    "chr":[],
    "pos":[],
    "ref":[],
    "alt":[],
    "var_type":[],
    "read_depth":[],
    "forward_depth":[],
    "reverse_depth":[],
    "variant_depth":[],
    "pct_variant":[],
    "pct_reference":[],
    "gene_symbol":[],
    "gene_id":[],
    "biotype":[],
    "consequence_terms":[],
}

counter = 0  # count number of records in vcf

vcf_read = vcf.Reader(open("test_vcf_data.txt", "r"))
for record in vcf_read:
    if counter < 10:
        print(record.var_type)
        print(record, end = "\n\n")
        
    # add attributes directly from vcf
    out["chr"].append(record.CHROM)
    out["pos"].append(record.POS)
    out["ref"].append(record.REF)
    out["alt"].append(record.ALT)
    out["var_type"].append(record.var_type)
    out["read_depth"].append(record.INFO["TR"])
    out["forward_depth"].append(record.INFO["NF"])
    out["reverse_depth"].append(record.INFO["NR"])
    
    # query ensembl API
    ext = "/vep/homo_sapiens/region"
    headers={"Content-Type":"application/json", "Accept":"application/json"}
    r = requests.post(
        server + ext,
        headers=headers,
        data='{{ "variants" : ["{} {} {} {} {} {} {} ." ] }}'.format(
            record.CHROM,
            record.POS,
            record.ID if record.ID is not None else ".",
            record.REF,
            record.ALT[0],
            record.QUAL,
            record.FILTER,
        )
    )
    
    # QC on API request
    if not r.ok:
        r.raise_for_status()
        sys.exit()
        
    # get dict of annotations
    decoded = r.json()
        
    # confirm gene symbol
    pyensembl_id = ensembl_data.genes_at_locus(contig=record.CHROM, position=record.POS)[0].gene_id
    api_id = decoded[0]['transcript_consequences'][0]["gene_id"]
    assert pyensembl_id == api_id, "Gene IDs don't match! pyensembl: {}; API: {}".format(pyensembl_id, api_id)
    
    # add ensembl annotations
    out["gene_id"].append(decoded[0]['transcript_consequences'][0]["gene_id"])
    out["gene_symbol"].append(decoded[0]['transcript_consequences'][0]["gene_symbol"])
    out["biotype"].append(decoded[0]['transcript_consequences'][0]["biotype"])
    out["consequence_terms"].append(";".join(decoded[0]['transcript_consequences'][0]["consequence_terms"]) if len(decoded[0]['transcript_consequences'][0]["consequence_terms"]) > 1 else decoded[0]['transcript_consequences'][0]["consequence_terms"][0])
    
    counter += 1
    
print("\nDone! Total records: {}".format(counter))

snp
Record(CHROM=1, POS=1158631, REF=A, ALT=[G])

snp
Record(CHROM=1, POS=1246004, REF=A, ALT=[G])



AssertionError: Gene IDs don't match! pyensembl: ENSG00000169972; API: ENSG00000127054

In [16]:
decoded[0].keys()

dict_keys(['regulatory_feature_consequences', 'most_severe_consequence', 'assembly_name', 'strand', 'id', 'end', 'start', 'transcript_consequences', 'input', 'seq_region_name', 'allele_string', 'colocated_variants'])

In [6]:
record_unpacked = unpack_dict(record.__dict__, verbose=True)

Unpacked 1 value from key ALT: G
Unpacking key INFO as dictionary with 20 keys
	Unpacked 1 value from key FR: 1.0
	Unpacked 1 value from key HapScore: 1
	Unpacked 1 value from key MGOF: 5
	Unpacked 1 value from key MQ: 59.5
	Unpacked 1 value from key NF: 101
	Unpacked 1 value from key NR: 47
	Unpacked 1 value from key PP: 2965.0
	Unpacked 1 value from key SbPval: 0.62
	Unpacked 1 value from key Source: Platypus
	Unpacked 1 value from key TR: 148
	Added 20 keys to dict
Joining key alleles using delimiter ;
Unpacked 1 value from key samples: Call(sample=sample, CallData(GT=1/1, GL=[-300.0, -41.24, 0.0], GOF=5.0, GQ=99, NR=152, NV=148))
Unpacking key _sample_indexes as dictionary with 1 keys
	Added 1 keys to dict


In [7]:
record_unpacked

{'CHROM': '1',
 'POS': 1246004,
 'ID': None,
 'REF': 'A',
 'ALT': G,
 'QUAL': 2965,
 'FORMAT': 'GT:GL:GOF:GQ:NR:NV',
 'start': 1246003,
 'end': 1246004,
 'alleles': 'A;G',
 'samples': Call(sample=sample, CallData(GT=1/1, GL=[-300.0, -41.24, 0.0], GOF=5.0, GQ=99, NR=152, NV=148)),
 'affected_start': 1246003,
 'affected_end': 1246004,
 'BRF': 0.09,
 'FR': 1.0,
 'HP': 6,
 'HapScore': 1,
 'MGOF': 5,
 'MMLQ': 32.0,
 'MQ': 59.5,
 'NF': 101,
 'NR': 47,
 'PP': 2965.0,
 'QD': 20.0,
 'SC': 'ACAGGTACGTATTTTTCCAGG',
 'SbPval': 0.62,
 'Source': 'Platypus',
 'TC': 152,
 'TCF': 101,
 'TCR': 51,
 'TR': 148,
 'WE': 1246012,
 'WS': 1245994,
 'sample': 0}

In [10]:
call_dict = unpack_dict(extract_call(record_unpacked["samples"], format_str=record_unpacked["FORMAT"]), verbose=True)

Joining key GL using delimiter ;


In [11]:
call_dict

{'GT': '1/1',
 'GL': '-300.0;-41.24;0.0',
 'GOF': 5.0,
 'GQ': 99,
 'NR': 152,
 'NV': 148}

In [64]:
record_unpacked["FORMAT"].split(":")

['GT', 'GL', 'GOF', 'GQ', 'NR', 'NV']

In [32]:
record_unpacked["ALT"].sequence

'G'

In [33]:
record_unpacked["ALT"].type

'SNV'

In [69]:
record_unpacked["samples"].data

CallData(GT='1/1', GL=[-300.0, -41.24, 0.0], GOF=5.0, GQ=99, NR=152, NV=148)

In [51]:
record.__dict__

{'CHROM': '1',
 'POS': 1246004,
 'ID': None,
 'REF': 'A',
 'ALT': [G],
 'QUAL': 2965,
 'FILTER': [],
 'INFO': {'BRF': 0.09,
  'FR': [1.0],
  'HP': 6,
  'HapScore': [1],
  'MGOF': [5],
  'MMLQ': 32.0,
  'MQ': [59.5],
  'NF': [101],
  'NR': [47],
  'PP': [2965.0],
  'QD': 20.0,
  'SC': 'ACAGGTACGTATTTTTCCAGG',
  'SbPval': [0.62],
  'Source': ['Platypus'],
  'TC': 152,
  'TCF': 101,
  'TCR': 51,
  'TR': [148],
  'WE': 1246012,
  'WS': 1245994},
 'FORMAT': 'GT:GL:GOF:GQ:NR:NV',
 'start': 1246003,
 'end': 1246004,
 'alleles': ['A', G],
 'samples': [Call(sample=sample, CallData(GT=1/1, GL=[-300.0, -41.24, 0.0], GOF=5.0, GQ=99, NR=152, NV=148))],
 '_sample_indexes': {'sample': 0},
 'affected_start': 1246003,
 'affected_end': 1246004}

In [11]:
out

{'chr': ['1', '1'],
 'pos': [1158631, 1246004],
 'ref': ['A', 'A'],
 'alt': [[G], [G]],
 'var_type': ['snp', 'snp'],
 'read_depth': [[156], [148]],
 'forward_depth': [[89], [101]],
 'reverse_depth': [[67], [47]],
 'variant_depth': [],
 'pct_variant': [],
 'pct_reference': [],
 'gene_symbol': ['SDF4'],
 'gene_id': ['ENSG00000078808'],
 'biotype': ['protein_coding'],
 'consequence_terms': ['synonymous_variant']}

In [10]:
decoded[0]['transcript_consequences']

[{'strand': -1,
  'transcript_id': 'ENST00000323275',
  'biotype': 'retained_intron',
  'gene_id': 'ENSG00000127054',
  'consequence_terms': ['downstream_gene_variant'],
  'gene_symbol': 'CPSF3L',
  'impact': 'MODIFIER',
  'hgnc_id': 26052,
  'variant_allele': 'G',
  'gene_symbol_source': 'HGNC',
  'distance': 976},
 {'gene_id': 'ENSG00000131584',
  'biotype': 'protein_coding',
  'consequence_terms': ['upstream_gene_variant'],
  'transcript_id': 'ENST00000353662',
  'strand': -1,
  'variant_allele': 'G',
  'distance': 2735,
  'gene_symbol_source': 'HGNC',
  'gene_symbol': 'ACAP3',
  'hgnc_id': 16754,
  'impact': 'MODIFIER'},
 {'consequence_terms': ['upstream_gene_variant'],
  'gene_id': 'ENSG00000131584',
  'biotype': 'protein_coding',
  'transcript_id': 'ENST00000354700',
  'strand': -1,
  'distance': 2606,
  'gene_symbol_source': 'HGNC',
  'variant_allele': 'G',
  'hgnc_id': 16754,
  'impact': 'MODIFIER',
  'gene_symbol': 'ACAP3'},
 {'gene_id': 'ENSG00000131584',
  'biotype': 'nonsen

---

In [None]:
err_counter=0
for i in a_comb.var_names:
    if a_comb.var.loc[i, "gene_id"] == "":
        try:
            a_comb.var.loc[i, "gene_id"] = data.gene_ids_of_gene_name(i)[0]
        except:
            err_counter += 1
print("{} errors in ENSEMBL mapping".format(err_counter))