In [2]:
import pandas as pd
from kipoiseq.extractors import MultiSampleVCF
from tqdm import tqdm

In [2]:
import os

try:
    snakemake
except NameError:
    from snakemk_util import load_rule_args
    snakemake = load_rule_args(
        snakefile = os.getcwd() + '/../Snakefile',
        rule_name = 'prioritize_vep_variants',
        root=os.getcwd() + "/..",
        default_wildcards={'chrom': 'chr10'}
    )

In [4]:
df_metadata = pd.read_csv(snakemake.input['released_files'])

In [5]:
sample_mapping = {
    row['CGND_ID']: row['Participant_ID']
    for i, row in df_metadata[df_metadata['omic'] == 'genomics'].iterrows()
}

In [6]:
df_results = pd.read_csv(snakemake.input['results']).set_index('geneID')
df_genes = pd.read_csv(snakemake.input['genes']).set_index('geneID')
df_results = df_results.join(df_genes)

Unnamed: 0_level_0,sampleID,pValue,padjust,zScore,l2fc,rawcounts,normcounts,meanCorrected,theta,aberrant,AberrantBySample,AberrantByGene,padj_rank,Chromosome,Start,End,Strand,gene_name,gene_biotype
geneID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ENSG00000000003,CASE.NEUAA599TMX,1.036564e-17,9.877520e-13,-9.49,-4.48,30,40.89,936.79,19.46,True,5,3,2.0,X,100627108.0,100639991.0,-,TSPAN6,protein_coding
ENSG00000000003,CTRL.NEUHE723FGT,6.894534e-09,1.313973e-03,-5.45,-2.59,77,154.45,936.79,19.46,True,4,3,1.0,X,100627108.0,100639991.0,-,TSPAN6,protein_coding
ENSG00000000003,CASE.NEUBA169GXD,3.543101e-08,1.688127e-03,-5.01,-2.39,140,179.07,936.79,19.46,True,4,3,4.0,X,100627108.0,100639991.0,-,TSPAN6,protein_coding
ENSG00000003056,CTRL.NEUCV136DHM,4.702285e-13,4.480852e-09,6.78,0.50,3296,2658.65,1884.46,462.00,True,102,1,20.0,12,8940362.0,8949955.0,-,M6PR,protein_coding
ENSG00000004455,CASE.NEUGW340YEB,8.518985e-12,1.623564e-06,-7.12,-0.75,942,816.87,1374.93,241.90,True,12,1,1.0,1,33007939.0,33080996.0,-,AK2,protein_coding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000278771,CASE.NEUGW340YEB,3.340853e-11,3.183531e-06,5.25,2.22,4043,2203.72,471.14,10.14,True,12,1,2.0,14,49853615.0,49853914.0,-,Metazoa_SRP,misc_RNA
ENSG00000278777,CASE.NEUZV656DD1,2.250425e-06,2.859269e-02,4.67,2.44,6,8.22,0.45,22.53,True,17,1,15.0,GL000220.1,108279.0,108340.0,+,AL592188.12,miRNA
ENSG00000279203,CASE.NEUZP278MR4,1.152811e-08,8.788194e-05,4.55,1.10,268,124.23,57.45,56.39,True,32,1,25.0,19,15383201.0,15383832.0,-,AC005785.5,TEC
ENSG00000279500,CASE.NEUMD869UV7,6.831112e-08,3.254716e-03,-7.31,-6.41,0,0.00,94.13,6.55,True,5,1,4.0,12,128813185.0,128814750.0,-,RP11-21K12.2,TEC


In [7]:
df_results = df_results[df_results['Chromosome'] == snakemake.wildcards['chrom'].replace('chr', '')]

In [8]:
df = pd.read_parquet(snakemake.input['vep'])
df['variant'] = snakemake.wildcards['chrom'] + ':' + df['pos'].astype(str) + ':'+ df['ref'] + '>' + df['alt']
df = df.set_index('variant')

In [9]:
df_mmsplice = pd.read_csv(snakemake.input['mmsplice'])

df_mmsplice = df_mmsplice[df_mmsplice['delta_logit_psi'].abs() > 2] \
    .rename(columns={'gene_id': 'geneID', 'ID': 'variant'})

df_mmsplice['geneID'] = df_mmsplice['geneID'].str.split('.').str.get(0)
df_mmsplice = df_mmsplice[['geneID', 'variant', 'delta_logit_psi']].drop_duplicates()

df_mmsplice = df_mmsplice[df_mmsplice['variant'].str.startswith(snakemake.wildcards['chrom'])]
df_mmsplice = df_mmsplice.groupby(['geneID', 'variant']).agg(max).reset_index()

In [10]:
df = df.join(df_mmsplice.set_index('variant')[['delta_logit_psi']])

In [21]:
# if high scored mmsplice moderate 
df.loc[(~df['delta_logit_psi'].isna()) & (df['IMPACT'] != 'HIGH'), 'IMPACT'] = 'MODERATE'

In [23]:
df = df[df['AF'].fillna(0) < 0.001] # subset outliers
print(df.shape)
df = df[df['IMPACT'].isin({'HIGH', 'MODERATE'})] # subset for high impact variants
print(df.shape)
df = df[df['Gene'].isin(set(df_results.index))] # subset outliers
print(df.shape)

(2014821, 90)
(6656, 90)
(202, 90)


In [24]:
vcf = MultiSampleVCF(snakemake.input['vcf'])

In [25]:
df_mmsplice['samples'] = [
    [
        sample_mapping[k.replace('-b38', '')]
        for k, v in vcf.get_samples(vcf.get_variant(i)).items()
    ]
    for i in df_mmsplice['variant']
]
df_mmsplice = df_mmsplice[df_mmsplice['samples'].map(len) < 7].explode('samples')

In [28]:
df['samples'] = [
    [
        sample_mapping[k.replace('-b38', '')]
        for k, v in vcf.get_samples(vcf.get_variant(i)).items()
    ]
    for i in df.index
]

In [29]:
df = df[df['samples'].map(len) < 7].explode('samples')

In [50]:
df.to_csv(snakemake.output['prioritized'])