In [1]:
import gzip
import pandas as pd
import json
import re, os
import pyBigWig
import glob

# Comment/header prefix character; not referenced by any of the cells visible here.
HEADER_HASH = '#'
# Module-level handle queried by get_gene_name() through .entries().
# Presumably a bigBed file of HGNC gene annotations — TODO confirm the field layout.
# The handle is opened once and is not closed anywhere in the visible cells.
hgnc = pyBigWig.open("project/aux/hgnc.bb")
    
def is_in_gene_set(pos,gene_list):
    """Tell whether a position falls inside any of the given regions.

    Parameters
    ----------
    pos : mapping
        Must provide ``'chromosome'`` and ``'position'``.
    gene_list : iterable of str
        Regions written as ``chr:start-end``; both ends are inclusive.

    Returns
    -------
    bool
        True as soon as one region on the same chromosome contains the
        position, otherwise False.
    """
    for region in gene_list:
        parts = region.split(':')
        # Regions on another chromosome are skipped before parsing coordinates.
        if pos['chromosome'] != parts[0]:
            continue
        bounds = parts[1].split('-')
        lower = bounds[0]
        upper = bounds[1]
        if pos['position'] >= int(lower) and pos['position'] <= int(upper):
            return True
    return False

def get_gene_name(chrom,pos):
    """Look up the annotation entry overlapping a single base in ``hgnc``.

    Parameters
    ----------
    chrom : str
        Chromosome name, passed unchanged to ``hgnc.entries``.
    pos : int
        Start of the one-base query interval ``[pos, pos + 1)``.

    Returns
    -------
    tuple
        Tab-separated fields 6 and 7 of the first overlapping entry's string
        (callers store them as gene symbol and gene name), or
        ``(None, None)`` when no entry overlaps the position.
    """
    # Query once and reuse the result for both fields.
    entries = hgnc.entries(chrom, pos, pos + 1)
    # pyBigWig returns None rather than an empty list when nothing overlaps.
    if not entries:
        return None, None
    fields = entries[0][2].split('\t')
    return fields[6], fields[7]
    
def parse_annotated_json(path, gene_subset):
    """Stream a gzipped annotated-variant JSON file and keep positions inside a region set.

    The file is read line by line, so it is never loaded whole. The layout
    this parser relies on is:

    * the first line carries the header object wrapped in a fixed prefix and
      suffix, which are sliced off;
    * each following line is one position JSON object, optionally ending in ',';
    * a line equal to ``],"genes":[`` switches to the gene section, where each
      line is kept verbatim apart from a trailing ',';
    * a line equal to ``]}`` ends parsing.

    Parameters
    ----------
    path : str
        Path to the gzip-compressed JSON file.
    gene_subset : iterable of str
        Regions in ``chr:start-end`` form, passed on to ``is_in_gene_set``.

    Returns
    -------
    dict
        ``{'header': str, 'positions': list of str, 'genes': list of str}``.
        Positions and genes are the raw JSON text of each kept line, not
        parsed objects.
    """
    gene_section_line = '],"genes":['
    end_line = ']}'

    header = ''
    positions = []
    genes = []
    section = 'header'  # moves header -> positions -> genes as the file is read

    # Open the file named by the `path` argument, not a notebook-level global.
    with gzip.open(path, 'rt') as f:
        for line in f:
            trim_line = line.strip()
            if section == 'header':
                ## only keep the "header" field content from the line
                header = trim_line[10:-14]
                section = 'positions'
            elif trim_line == gene_section_line:
                section = 'genes'
            elif trim_line == end_line:
                break
            elif section == 'positions':
                ## remove the trailing ',' if there is one
                trim_line = trim_line.rstrip(',')
                pos = json.loads(trim_line)
                if is_in_gene_set(pos, gene_subset):
                    positions.append(trim_line)
            else:
                ## gene section: remove the trailing ',' if there is one
                genes.append(trim_line.rstrip(','))
    return {'header':header, 'positions':positions, 'genes':genes}

def get_clingen_annot(records):
    """Return the most frequent ``clinicalInterpretation`` among the records.

    Parameters
    ----------
    records : iterable of mapping
        Each record must provide ``'clinicalInterpretation'``.

    Returns
    -------
    The label that occurs most often. When several labels tie, the one seen
    first in ``records`` wins, so the result is the same on every run.

    Raises
    ------
    ValueError
        If ``records`` is empty.
    """
    # A dict keeps insertion order, and max() returns the first maximal key,
    # which makes tie-breaking deterministic (iterating a set of strings is not).
    counts = {}
    for rec in records:
        label = rec['clinicalInterpretation']
        counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get)
def get_clinvar_annot(records):
    """Return the most frequent ``significance`` label across the records.

    Parameters
    ----------
    records : iterable of mapping
        Each record must provide ``'significance'``; assumed to be a list of
        label strings — TODO confirm against the annotation schema.

    Returns
    -------
    The label that occurs most often over all records. When several labels
    tie, the one seen first wins, so the result is the same on every run.

    Raises
    ------
    ValueError
        If no label is found at all.
    """
    # Count in one pass instead of concatenating lists and calling
    # list.count per label. Dict insertion order plus max() gives a
    # deterministic tie-break (iterating a set of strings does not).
    counts = {}
    for rec in records:
        for label in rec['significance']:
            counts[label] = counts.get(label, 0) + 1
    return max(counts, key=counts.get)

In [2]:
# Every gzipped annotated SV JSON found two directory levels below project/results/germline/.
files = glob.glob('project/results/germline/*/*/*.sv.vcf.annotated.json.gz')
# Gene table; its `pos` column is what the processing loop passes to parse_annotated_json.
# Presumably one 'chr:start-end' string per gene — TODO confirm against the CSV.
gene_set = pd.read_csv("project/aux/Mito-Lyso-Pesticide_PD_genes.csv")

In [3]:
# Create the output folder up front so to_csv cannot fail on a missing directory.
output_dir = 'germline_variants/sv'
os.makedirs(output_dir, exist_ok=True)

# For each annotated file: keep the positions inside the gene regions, flatten
# every variant into one row, and write one CSV per sample.
for file in files:
    records = []  # one flat dict per variant; converted to a DataFrame once per file
    # assumes paths look like project/results/germline/<dir>/<sample>/<file> — TODO confirm
    sample_name = file.split('/')[4]
    annotated_vcf = parse_annotated_json(file, gene_set.pos)
    for pos in annotated_vcf['positions']:
        x = json.loads(pos)
        for var_dict in x['variants']:
            # Look the gene up once per variant and reuse both fields.
            gene_symbol, gene_name = get_gene_name(var_dict['chromosome'], int(var_dict['begin']))
            build_dict = {'vid' : var_dict['vid'],
                          'chromosome' : var_dict['chromosome'],
                          'begin' : var_dict['begin'],
                          'end' : var_dict['end'],
                          'gene_symbol' : gene_symbol,
                          'gene_name' : gene_name,
                          'refAllele' : var_dict['refAllele'],
                          'altAllele' : var_dict['altAllele'],
                          'variantType' : var_dict['variantType'],
                          'quality' : x['quality'],
                          'samples' : x['samples']
                         }
            # Position-level optional fields.
            if 'svLength' in x:
                build_dict['svLength'] = x['svLength']
            if 'clingen' in x:
                build_dict['clingen'] = get_clingen_annot(x['clingen'])
            if 'clinvar' in x:
                build_dict['clinvar'] = get_clinvar_annot(x['clinvar'])
            # Variant-level optional fields copied as-is. A variant-level
            # 'clinvar' replaces the position-level summary set just above.
            # A lot of information; unsure how to simplify it into tabular
            # form, so everything is included for now.
            for key in ('clinvar', 'dbsnp', 'hgvsg'):
                if key in var_dict:
                    build_dict[key] = var_dict[key]
            if 'gnomad' in var_dict:
                build_dict.update({'gnomad_'+k: v for k, v in var_dict['gnomad'].items()})
            if 'regulatoryRegions' in var_dict:
                # Only the first regulatory region is flattened into columns.
                build_dict.update({'regulatoryRegions_'+k: v for k, v in var_dict['regulatoryRegions'][0].items()})
            for key in ('cosmic', 'isStructuralVariant'):
                if key in var_dict:
                    build_dict[key] = var_dict[key]
            records.append(build_dict)
    df = pd.DataFrame(records)
    df.to_csv(f'{output_dir}/{sample_name}_germline_sv.csv')

KeyboardInterrupt: 