# Define variants

In [None]:
# Protein-level variant list: each entry is [transcript ID, amino-acid change, 12].
# (The trailing 12 is presumably the chromosome; it matches the first field of
# `mutations` below -- TODO confirm.)
mutations_dbNSFP = [
    ['ENST00000604567', protein_change, 12]
    for protein_change in (
        'M170T', 'G195V', 'P793R', 'A1129V',
        'R1748C', 'R1792W', 'R1825P', 'K1827R',
    )
]

# Single protein-level test variant. NOTE: this binding is replaced by the
# genomic-level `test` assigned immediately below.
test = [['ENST00000604567', 'V169A', 12]]

# Stop-codon experiment: turn the M170 codon (ATG) into TAG by rewriting the
# sequence starting at 121806068 from 'CATG' to 'CTAG' (only the first three
# bases, 'CAT' -> 'CTA', need to be specified).
test = [[12, 121806068, 'CAT', 'CTA']]

# Genomic-level variant list with every field kept as a string.
# (Fields look like chromosome, position, reference allele, alternate allele,
# and the entries appear to parallel `mutations_dbNSFP` -- TODO confirm.)
mutations = [
    ['12', '121806070', 'T', 'C'],
    ['12', '121808247', 'G', 'T'],
    ['12', '121814593', 'C', 'G'],
    ['12', '121817872', 'C', 'T'],
    ['12', '121825271', 'C', 'T'],
    ['12', '121827555', 'C', 'T'],
    ['12', '121827739', 'G', 'C'],
    ['12', '121827745', 'A', 'G'],
]

# Run dbNSFP

In [None]:
import sys, os
from pathlib import Path
import json

import pandas as pd

sys.path.append(os.fspath(Path.cwd() / 'mutation_classifier/programs'))

from dbNSFP.dbNSFP_ibex import dbNSFPIbex

In [None]:
# Results directory, relative to the current working directory. It is passed to
# dbNSFPIbex as `out_dir`, and the same path is later globbed for *.pkl files.
out_dir = Path('./results/dbnsfp')

In [None]:
# Configure the dbNSFP runner and launch it, keeping the value returned by
# run() as `jobid`.
# NOTE(review): `all_variants_list` is not defined in any visible cell (only
# `mutations`, `mutations_dbNSFP` and `test` are) -- confirm where it comes
# from, otherwise this cell fails on a fresh kernel.
exe = dbNSFPIbex(
    all_variants_list,
    out_dir=out_dir,
    time_per_command=0.5,
    jobname='dbnsfpibex',
    max_jobs=100,
)
jobid = exe.run()

# Check results

In [None]:
from pathlib import Path
import json

In [None]:
# Collect the pickle files found in the dbNSFP results directory.
# Path.glob yields entries in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make the row order of any frame built from them
# reproducible across runs and machines.
pkls = sorted(Path('./results/dbnsfp').glob('*.pkl'))
len(pkls)

In [None]:
# Load every pickled result frame, then stack them into one table.
# Unpickling can execute arbitrary code, so only point this at files produced
# by this pipeline.
dfs = [pd.read_pickle(pickle_path) for pickle_path in pkls]

all_variants_dbnsfp = pd.concat(dfs)

# Calculate the consensus of the deleteriousness predictions

In [None]:
def deleterious_prediction(var: pd.Series) -> bool:
    """Return True only if all five predictors call the variant damaging.

    The predictors consulted are SIFT, PolyPhen-2 HDIV, PolyPhen-2 HVAR, LRT
    and MutationTaster. Each ``*_pred`` field holds one or more
    ';'-separated single-letter calls; a predictor counts as damaging when at
    least one of its calls is a damaging code.

    Parameters
    ----------
    var : pd.Series
        One row of the dbNSFP table. Must provide the keys ``SIFT_pred``,
        ``Polyphen2_HDIV_pred``, ``Polyphen2_HVAR_pred``, ``LRT_pred`` and
        ``MutationTaster_pred``.

    Returns
    -------
    bool
        True when every predictor has at least one damaging call.

    Raises
    ------
    KeyError
        If any of the five prediction fields is absent from ``var``.
    """
    def _has_damaging_call(field, damaging=('D',)):
        value = var[field]
        # Missing predictions may arrive as NaN/None rather than a string;
        # treat them as "no damaging call" instead of crashing on .split().
        if not isinstance(value, str):
            return False
        return any(call in damaging for call in value.split(';'))

    # A tuple (not a generator) so that every field is looked up and a missing
    # column always surfaces as KeyError, regardless of earlier results.
    verdicts = (
        _has_damaging_call('SIFT_pred'),
        _has_damaging_call('Polyphen2_HDIV_pred'),
        _has_damaging_call('Polyphen2_HVAR_pred'),
        _has_damaging_call('LRT_pred'),
        # MutationTaster reports "A" (disease_causing_automatic) in addition
        # to "D" (disease_causing); both are damaging calls.
        _has_damaging_call('MutationTaster_pred', damaging=('D', 'A')),
    )
    return all(verdicts)

In [None]:
# Evaluate the consensus rule once per row (axis=1) and store the result as a
# new column on the combined table.
consensus_flags = all_variants_dbnsfp.apply(deleterious_prediction, axis=1)
all_variants_dbnsfp['deleterious_consensus'] = consensus_flags

# Conservation

```
151	GERP++_NR: GERP++ neutral rate
152	GERP++_RS: GERP++ RS score, the larger the score, the more conserved the site. Scores range from
		-12.3 to 6.17.
153	GERP++_RS_rankscore: GERP++ RS scores were ranked among all GERP++ RS scores in dbNSFP.
		The rankscore is the ratio of the rank of the score over the total number of GERP++ RS 
		scores in dbNSFP.
154	phyloP100way_vertebrate: phyloP (phylogenetic p-values) conservation score based on the
		multiple alignments of 100 vertebrate genomes (including human). The larger the score, 
		the more conserved the site. Scores range from -20.0 to 10.003 in dbNSFP.
155	phyloP100way_vertebrate_rankscore: phyloP100way_vertebrate scores were ranked among all
		phyloP100way_vertebrate scores in dbNSFP. The rankscore is the ratio of the rank of the 
		score over the total number of phyloP100way_vertebrate scores in dbNSFP.
156	phyloP30way_mammalian: phyloP (phylogenetic p-values) conservation score based on the
		multiple alignments of 30 mammalian genomes (including human). The larger the score, 
		the more conserved the site. Scores range from -20 to 1.312 in dbNSFP.
157	phyloP30way_mammalian_rankscore: phyloP30way_mammalian scores were ranked among all
		phyloP30way_mammalian scores in dbNSFP. The rankscore is the ratio of the rank of the 
		score over the total number of phyloP30way_mammalian scores in dbNSFP.
158	phyloP17way_primate: a conservation score based on 17way alignment primate set,
		the higher the score, the more conserved the site. Scores range from -13.362 to 0.756 in dbNSFP.
159	phyloP17way_primate_rankscore: the rank of the phyloP17way_primate score among
		all phyloP17way_primate scores in dbNSFP.
160	phastCons100way_vertebrate: phastCons conservation score based on the multiple alignments
		of 100 vertebrate genomes (including human). The larger the score, the more conserved 
		the site. Scores range from 0 to 1. 
161	phastCons100way_vertebrate_rankscore: phastCons100way_vertebrate scores were ranked among
		all phastCons100way_vertebrate scores in dbNSFP. The rankscore is the ratio of the rank 
		of the score over the total number of phastCons100way_vertebrate scores in dbNSFP.
162	phastCons30way_mammalian: phastCons conservation score based on the multiple alignments
		of 30 mammalian genomes (including human). The larger the score, the more conserved 
		the site. Scores range from 0 to 1. 
163	phastCons30way_mammalian_rankscore: phastCons30way_mammalian scores were ranked among
		all phastCons30way_mammalian scores in dbNSFP. The rankscore is the ratio of the rank 
		of the score over the total number of phastCons30way_mammalian scores in dbNSFP.
164	phastCons17way_primate: a conservation score based on 17way alignment primate set,
		The larger the score, the more conserved the site. Scores range from 0 to 1. 
165	phastCons17way_primate_rankscore: the rank of the phastCons17way_primate score among
		all phastCons17way_primate scores in dbNSFP.
166	SiPhy_29way_pi: The estimated stationary distribution of A, C, G and T at the site,
		using SiPhy algorithm based on 29 mammals genomes. 
167	SiPhy_29way_logOdds: SiPhy score based on 29 mammals genomes. The larger the score,
		the more conserved the site. Scores range from 0 to 37.9718 in dbNSFP.
168	SiPhy_29way_logOdds_rankscore: SiPhy_29way_logOdds scores were ranked among all
		SiPhy_29way_logOdds scores in dbNSFP. The rankscore is the ratio of the rank 
		of the score over the total number of SiPhy_29way_logOdds scores in dbNSFP.
169	bStatistic: Background selection (B) value estimates from doi.org/10.1371/journal.pgen.1000471.
		Ranges from 0 to 1000. It estimates the expected fraction (*1000) of neutral diversity present 
		at a site. Values close to 0 represent near complete removal of diversity as a result of 
		background selection and values near 1000 indicate the absence of background selection. 
		Data from CADD v1.4.
170	bStatistic_converted_rankscore: bStatistic scores were first converted to -bStatistic, then 
		ranked among all -bStatistic scores in dbNSFP. The rankscore is the ratio of the rank of 
		-bStatistic over the total number of -bStatistic scores in dbNSFP.
```

In [None]:
# Show the HGVSp_VEP column next to the conservation columns, i.e. every column
# from 'GERP++_NR' through 'bStatistic_converted_rankscore' (inclusive label slice).
#
# The columns are selected directly from the frame instead of merging the frame
# with itself on its index: an index-on-index merge multiplies rows whenever
# index labels repeat (e.g. after concatenating several frames without
# resetting the index), whereas plain column selection always keeps one output
# row per input row.
#
# NOTE(review): `dbnsfp_all` is not defined in any visible cell -- presumably it
# is the combined dbNSFP table (`all_variants_dbnsfp`); confirm and unify the name.
conservation_cols = list(
    dbnsfp_all.loc[:, 'GERP++_NR':'bStatistic_converted_rankscore'].columns
)
dbnsfp_all.loc[:, ['HGVSp_VEP'] + conservation_cols]

# Interpro domains

In [None]:
# Display the Interpro_domain column for every row of the table.
dbnsfp_all['Interpro_domain']