# Data Preparation for True Relationship Prediction

After predicting the co-occurence of candidates on the sentence level, the next step is to predict whether a candidate is a true relationship or just occured by chance. Through out this notebook the main events involved here are calculating summary statistics and obtaining the LSTM marginal probabilities.

In [14]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from collections import defaultdict
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from scipy.stats import fisher_exact
import scipy
from sqlalchemy import and_
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
#Set up the environment
username = "danich1"
password = "snorkel"
dbname = "pubmeddb"

#Path subject to change for different os
database_str = "postgresql+psycopg2://{}:{}@/{}?host=/var/run/postgresql".format(username, password, dbname)
os.environ['SNORKELDB'] = database_str

from snorkel import SnorkelSession
session = SnorkelSession()

In [16]:
from snorkel.models import Candidate, candidate_subclass
from snorkel.learning.disc_models.rnn import reRNN

In [17]:
DiseaseGene = candidate_subclass('DiseaseGene', ['Disease', 'Gene'])

# Count the Number of Sentences for Each Candidate

In [None]:
%%time
doc_counter = defaultdict(set)
sentence_counter = defaultdict(set)
offset = 0
chunk_size = 1e5

while True:
    cands = session.query(DiseaseGene).limit(chunk_size).offset(offset).all()
    
    if not cands:
        break
        
    for candidate in tqdm.tqdm(cands):
        sentence_counter[(candidate.Disease_cid, candidate.Gene_cid)].add(hash(candidate[0].get_parent()))
        doc_counter[(candidate.Disease_cid, candidate.Gene_cid)].add(hash(candidate[0].get_parent().document_id))
    
    offset+= chunk_size

In [None]:
candidate_df = pd.DataFrame(map(lambda x: [x[0], x[1], len(sentence_counter[x]), len(doc_counter[x])], sentence_counter), columns=["disease_id", "gene_id", "sentence_count", "doc_count"])

# Perform Fisher Exact Test on each Co-Occurence

In [6]:
def diffprop(obs):
    """
    `obs` must be a 2x2 numpy array.

    Returns:
    delta
        The difference in proportions
    ci
        The Wald 95% confidence interval for delta
    corrected_ci
        Yates continuity correction for the 95% confidence interval of delta.
    """
    n1, n2 = obs.sum(axis=1)
    prop1 = obs[0,0] / n1.astype(np.float64)
    prop2 = obs[1,0] / n2.astype(np.float64)
    delta = prop1 - prop2

    # Wald 95% confidence interval for delta
    se = np.sqrt(prop1*(1 - prop1)/n1 + prop2*(1 - prop2)/n2)
    ci = (delta - 1.96*se, delta + 1.96*se)

    # Yates continuity correction for confidence interval of delta
    correction = 0.5*(1/n1 + 1/n2)
    corrected_ci = (ci[0] - correction, ci[1] + correction)

    return delta, ci, corrected_ci

In [7]:
total = candidate_df["sentence_count"].sum()
odds = []
p_val = []
expected = []
lower_ci = []

for disease, gene in tqdm.tqdm(zip(candidate_df["disease_id"],candidate_df["gene_id"])):
    cond = (candidate_df["disease_id"] == disease) & (candidate_df["gene_id"] == gene)
    a = candidate_df[cond]["sentence_count"].values[0] + 1
                                        
    cond = (candidate_df["disease_id"] != disease) & (candidate_df["gene_id"] == gene)
    b = sum(candidate_df[cond]["sentence_count"].values) + 1
    
    cond = (candidate_df["disease_id"] == disease) & (candidate_df["gene_id"] != gene)
    c = sum(candidate_df[cond]["sentence_count"].values) +1
    
    cond = (candidate_df["disease_id"] != disease) & (candidate_df["gene_id"] != gene)
    d = sum(candidate_df[cond]["sentence_count"].values) + 1
    
    c_table = np.array([[a, b], [c, d]])
    
    # Gather confidence interval
    delta, ci, corrected_ci = diffprop(c_table)
    lower_ci.append(ci[0])
    
    # Gather corrected odds ratio and p_values
    odds_ratio, p_value = fisher_exact(c_table, alternative='greater')
    odds.append(odds_ratio)
    p_val.append(p_value)
    
    
    total_disease = candidate_df[candidate_df["disease_id"] == disease]["sentence_count"].sum()
    total_gene = candidate_df[candidate_df["gene_id"] == gene]["sentence_count"].sum()
    expected.append((total_gene * total_disease)/float(total))

candidate_df["p_value"] = p_val
candidate_df["odds_ratio"] = odds
candidate_df["expected_sen"] = expected
candidate_df["lower_ci"] = lower_ci

100%|██████████| 150280/150280 [2:41:42<00:00, 15.68it/s] 


In [None]:
candidate_df.sort_values("p_value", ascending=True).head(20)

# Combine sentence marginals

In [19]:
train_marginals_df = pd.read_csv("stratified_data/lstm_disease_gene_holdout/lstm_train_marginals.csv")
dev_marginals_df = pd.read_csv("stratified_data/lstm_disease_gene_holdout/lstm_dev_marginals.csv")
test_marginals_df = pd.read_csv("stratified_data/lstm_disease_gene_holdout/lstm_test_marginals.csv")

In [20]:
train_sentences_df = pd.read_csv("stratified_data/lstm_disease_gene_holdout/train_candidates_sentences.csv")
dev_sentences_df = pd.read_csv("stratified_data/lstm_disease_gene_holdout/dev_candidates_sentences.csv")
test_sentences_df = pd.read_csv("stratified_data/lstm_disease_gene_holdout/test_candidates_sentences.csv")

In [23]:
train_sentences_df["marginals"] = train_marginals_df["RNN_marginals"].values
dev_sentences_df["marginals"] = dev_marginals_df["RNN_marginals"].values
test_sentences_df["marginals"] = test_marginals_df["RNN_10_Marginals"].values

# Combine Marginal Probabilities

In [30]:
candidate_df = pd.read_csv("disease_gene_summary_stats.csv")

In [43]:
candidate_df.head(10)

Unnamed: 0,disease_id,gene_id,sentence_count,doc_num,p_value,odds_ratio,expected_sen,hetnet_labels,avg_marginal,final_model_pred,quantile_zero,quantile_twenty,quantile_forty,quantile_sixty,quantile_eighty,lower_ci
0,DOID:12849,3569,38,18,0.9995228,0.617773,62.795531,1,0.572253,0.173056,0.057443,0.246951,0.484126,0.773133,0.898218,-0.001252
1,DOID:3070,567,16,11,1.0,0.192395,86.200979,-1,0.08313,0.004616,0.076999,0.080302,0.081056,0.082545,0.085014,-0.025416
2,DOID:1324,6647,28,13,1.0,0.142479,194.551269,-1,0.371562,0.002645,0.073665,0.087599,0.095345,0.130062,0.896128,-0.044854
3,DOID:3121,2353,5,1,0.04840916,2.319352,2.589809,-1,0.08666,0.176227,0.079303,0.083052,0.084625,0.086648,0.09042,-0.000705
4,DOID:10652,5520,3,2,0.07655809,2.601331,1.54983,-1,0.567257,0.075174,0.093024,0.093384,0.473634,0.913429,0.913594,-0.022814
5,DOID:5041,319100,6,2,3.238321e-08,24.599058,0.301847,-1,0.086803,0.026659,0.083442,0.083687,0.084111,0.085787,0.089258,0.020276
6,DOID:2531,6400,4,1,0.8555222,0.674023,6.90929,-1,0.084754,0.00811,0.073178,0.077675,0.080208,0.080606,0.082958,-0.10229
7,DOID:1936,22846,1,1,0.3799695,1.528825,1.291256,-1,0.172828,0.012048,0.172828,0.172828,0.172828,0.172828,0.172828,-0.020213
8,DOID:2377,10804,2,2,0.02619467,4.934903,0.613036,-1,0.087173,0.023758,0.081853,0.083981,0.086109,0.088237,0.090365,-0.018946
9,DOID:14227,83844,13,8,5.753071e-42,4841.654246,0.0099,1,0.206593,0.150682,0.071549,0.083113,0.084878,0.086511,0.135542,0.53826


In [37]:
candidate_marginals = (
                        train_sentences_df[["disease_id", "gene_id", "marginals"]]
                        .append(dev_sentences_df[["disease_id", "gene_id", "marginals"]])
                        .append(test_sentences_df[["disease_id", "gene_id", "marginals"]])
                      )

In [42]:
group = candidate_marginals.groupby(["disease_id", "gene_id"])
quantile_zero = []
quantile_twenty = []
quantile_forty = []
quantile_sixty = []
quantile_eighty = []
avg_marginals = []

for i, cand in tqdm.tqdm(candidate_df[["disease_id", "gene_id"]].iterrows()):
    dg_series = group.get_group((cand["disease_id"], cand["gene_id"]))
    avg_marginals.append(dg_series["marginals"].mean())
    quantile_zero.append(dg_series["marginals"].quantile(0))
    quantile_twenty.append(dg_series["marginals"].quantile(0.2))
    quantile_forty.append(dg_series["marginals"].quantile(0.4))
    quantile_sixty.append(dg_series["marginals"].quantile(0.6))
    quantile_eighty.append(dg_series["marginals"].quantile(0.8))

# Save the evidence into a dataframe
candidate_df["quantile_zero"] = quantile_zero
candidate_df["quantile_twenty"] = quantile_twenty
candidate_df["quantile_forty"] = quantile_forty
candidate_df["quantile_sixty"] = quantile_sixty
candidate_df["quantile_eighty"] = quantile_eighty
candidate_df["avg_marginal"] = avg_marginals

150280it [02:32, 984.41it/s] 


## Save the data to a file

In [44]:
candidate_df.to_csv("disease_gene_summary_stats.csv", index=False)