In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Raise pandas' display limits so wide and long tables are shown in full in cell output.
for option_name, limit in (('display.max_columns', 500), ('display.max_rows', 300)):
    pd.set_option(option_name, limit)
import vcf

In [8]:
# Phenotype table: keep the first row seen for each subject and index by subject,
# leaving 'subject' available as a regular column too.
pheno = (
    pd.read_csv('../16p12.2_rnaseq_analysis/data/pheno_final.tsv', sep='\t')
    .drop_duplicates('subject')
    .set_index('subject', drop=False)
)

In [2]:
# Column names for the space-separated table, which is read without a header row.
cols = [
    'gene', 'gene_chrom', 'gene_start', 'gene_end', 'gene_strand',
    'vars_tested', 'distance',
    'var_id', 'chrom', 'start', 'end',
    'dof', 'dummy', 'param1', 'param2',
    'nom_p', 'beta', 'emp_p', 'adj_p', 'FDR', 'unk',
]

perm = pd.read_csv('../eqtl/permutations.significant.txt', sep=' ', header=None)
# Assigning (rather than passing names=) raises if the file's column count differs from `cols`.
perm.columns = cols

In [3]:
# general annotations
# Attach a gene description and biotype to every eQTL gene from the name-mapping table.
# low_memory=False makes pandas infer each column's dtype from the whole file at once;
# NOTE(review): added because the output left below this cell looks like the tail of a
# mixed-dtype DtypeWarning from this read — confirm.
mapp = pd.read_csv(
    '../16p12.2_rnaseq_analysis/data/gene_names_mapping_new.tsv',
    sep='\t',
    low_memory=False,
)
mapp = mapp.set_index('ensembl')

# The lookup key is the part of the gene ID before the first '.'.
perm['ensembl'] = perm.gene.apply(lambda s: s.split('.')[0])
perm = perm.set_index('ensembl', drop=False)

# 'gene' is overwritten with the mapped description; the lookup key stays in 'ensembl'.
# Both assignments align on the 'ensembl' index.
perm['gene'] = mapp.loc[perm.ensembl, 'Description']
perm['biotype'] = mapp.loc[perm.ensembl, 'biotype']

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# GTEx Annotations
# Flag every eQTL variant by whether the GTEx significant variant-gene pair table
# loaded below contains a variant at the same site.
gtex_eqtls = pd.read_csv('/data5/16p12_RNA/GTEx/data/GTEx_Analysis_v7_eQTL/Cells_EBV-transformed_lymphocytes.v7.signif_variant_gene_pairs.txt.gz', sep='\t')

# variant_id is split on '_': field 0 is stored as the chromosome, field 1 as the integer position.
variant_fields = gtex_eqtls.variant_id.str.split('_')
gtex_eqtls['chrom'] = variant_fields.str[0]
gtex_eqtls['pos'] = variant_fields.str[1].astype(int)


def _bare_chrom(chroms):
    """Return chromosome labels as strings with any leading 'chr' removed."""
    return chroms.astype(str).map(lambda c: c[3:] if c.startswith('chr') else c)


# Match on (chromosome, position). Comparing positions alone would also count a GTEx
# variant that sits at the same coordinate on a different chromosome.
# NOTE(review): the 'chr' prefix is stripped on both sides in case the two tables use
# different chromosome naming — confirm no other naming differences exist.
gtex_sites = set(zip(_bare_chrom(gtex_eqtls['chrom']), gtex_eqtls['pos']))
perm['is_in_gtex'] = [
    site in gtex_sites
    for site in zip(_bare_chrom(perm['chrom']), perm['start'])
]

# Tallies of variants found / not found in GTEx.
not_novel = int(perm['is_in_gtex'].sum())
novel = len(perm) - not_novel

In [5]:
# annovar annotations
# Copy the gnomAD genome allele frequency and the Ref/Alt alleles from the ANNOVAR
# table onto each variant, matching rows on (Chr, Start).
annovar = pd.read_csv('../eqtl/eqtls.discovered.perm.hg19_multianno.txt', sep='\t')

gnomad_freqs = []
ref_alleles = []
alt_alleles = []
for chrom, start in zip(perm['chrom'], perm['start']):
    # Filter once per variant and reuse the result.
    hits = annovar[(annovar.Chr == chrom) & (annovar.Start == start)]
    if hits.shape[0] != 1:
        # Report variants with zero or several ANNOVAR rows.
        print(chrom, start, hits.shape[0])

    if hits.empty:
        # No annotation row: leave the fields empty instead of failing on .iloc[0].
        gnomad_freqs.append(float('nan'))
        ref_alleles.append(None)
        alt_alleles.append(None)
        continue

    # With several matching rows the first one is used.
    anno = hits.iloc[0]
    # A '.' frequency is stored as 0.
    if anno['gnomAD_genome_ALL'] == '.':
        gnomad_freqs.append(0.)
    else:
        gnomad_freqs.append(float(anno['gnomAD_genome_ALL']))
    ref_alleles.append(anno['Ref'])
    alt_alleles.append(anno['Alt'])

# Lists are in perm's row order, so assign them positionally.
perm['gnomad'] = gnomad_freqs
perm['Ref'] = ref_alleles
perm['Alt'] = alt_alleles

In [9]:
# Variant annotations
# For every eQTL variant, copy the VCF FILTER field and each sample's genotype
# into per-subject columns of perm.
vcfr = vcf.VCFReader(filename='/data3/16p12_WGS/phasing/whatshap/combined.vcf.gz')

# Start with empty-string placeholders for the filter and for every subject in pheno.
perm['filter'] = ''
for s in pheno.subject.unique():
    perm[s] = ''

for i, row in perm.iterrows():
    chrom = row['chrom']
    start = row['start']
    end = row['end']

    n_records = 0
    # NOTE(review): start-1 presumably converts a 1-based position to the 0-based,
    # half-open interval that fetch expects — confirm against the PyVCF docs.
    for r in vcfr.fetch(chrom, start - 1, end):
        n_records += 1
        # If several records fall in the interval, later ones overwrite earlier ones.
        perm.at[i, 'filter'] = r.FILTER
        for samp in r.samples:
            perm.at[i, samp.sample] = samp['GT']

    if n_records != 1:
        # Report intervals with no record or with more than one,
        # in the same format as the ANNOVAR check.
        print(chrom, start, n_records)

In [15]:
# Keep only rows whose biotype is 'protein_coding'; copy so later edits
# do not act on a view of the unfiltered frame.
is_protein_coding = perm['biotype'] == 'protein_coding'
perm = perm.loc[is_protein_coding].copy()

In [16]:
# Write the annotated table as tab-separated text, without the index column.
perm.to_csv(
    'perm_annotated.tsv',
    sep='\t',
    index=False,
)