In [3]:
from gensim.models import KeyedVectors

# Load previously saved KeyedVectors; mmap='r' asks gensim to memory-map the stored
# arrays read-only instead of reading them fully into RAM.
# NOTE(review): the path is hard-coded under the user's home directory — consider a
# configurable data directory so the notebook runs on other machines.
wv = KeyedVectors.load("~/ppi-core-genes/data/walking.output", mmap='r')

In [4]:
import numpy as np



In [18]:
# Pearson correlation matrix between the vectors stored for NDUFA1 and NDUFA2.
# NOTE(review): `.T` looks like a no-op here, presumably because each lookup is a
# 1-D vector (a later cell reports length 100) — confirm and drop it.
np.corrcoef(wv["NDUFA1"].T, wv["NDUFA2"])

array([[1.        , 0.74105393],
       [0.74105393, 1.        ]])

In [15]:
# Same correlation check for NDUFA1 against MTOR, for comparison with the
# NDUFA1/NDUFA2 pair.
# NOTE(review): `.T` looks like a no-op if the lookups are 1-D vectors — confirm.
np.corrcoef(wv["NDUFA1"].T, wv["MTOR"])

array([[1.       , 0.3051454],
       [0.3051454, 1.       ]])

In [12]:
# Length of the vector stored for the first key in the vocabulary.
len(wv[wv.index_to_key[0]])

100

In [5]:
import networkx as nx
import pandas as pd

In [7]:
# Read the tab-separated edge list; `names=` supplies the column names, so the
# first line of the file is treated as data, not as a header.
# NOTE(review): hard-coded path under the home directory — consider a configurable data dir.
edgelist = pd.read_csv("~/ppi-core-genes/data/ppi/bioplex_hgnc_only.tsv",sep="\t",names=["source","target"])
# Build an undirected graph from the "source"/"target" columns (the function's default column names).
graph = nx.from_pandas_edgelist(edgelist,create_using=nx.Graph)

In [9]:
# One-line summary of the graph ("Graph with N nodes and M edges").
# `nx.info` was deprecated in NetworkX 2.7 and removed in 3.0; `str(graph)` returns
# the same summary string and works on both old and new releases.
str(graph)

'Graph with 13957 nodes and 118162 edges'

In [1]:
import os
# Move the working directory two levels up so that the `src.` / `sandbox.` imports
# in later cells resolve (presumably to the repository root — confirm).
# NOTE(review): this is relative to the current directory, so re-running the cell
# without restarting the kernel moves up two more levels each time.
os.chdir("../../")

In [2]:
from src.datasets import Mapper, Preprocessor
from src.utils.config import Config
from src.utils.metrics import *
from src.utils.rank_metrics import *
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, RandomForestRegressor,BaggingRegressor
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np


In [3]:
# Project configuration object; its `input.*` fields are read in the next cell.
config = Config()

In [4]:
# Build the mappings from the configured gene sets and GWAS inputs.
mappings = Mapper(config.input.gene_sets,config.input.gwas).get_mappings(config.input.tag, field = config.input.field)
preprocessor = Preprocessor(mappings)
# Feature matrix X and labels y; the third return value is discarded.
# NOTE(review): presumably use_embeddings=True adds embedding columns to X — confirm in Preprocessor.get_data.
X, y, _ = preprocessor.get_data(use_embeddings=True)

In [5]:
# Keep only the first 100 feature columns of X.
# NOTE(review): the name suggests these are the non-embedding features, but the
# loaded embedding vectors also have length 100 — confirm the column layout
# produced by Preprocessor.get_data before relying on this slice.
X_noemb = X[:,:100]

In [6]:
# Shared metrics helper over the labels y; the CV cells register one mask per fold in `masks`.
# NOTE(review): the CV cells feed it log-transformed scores (always <= 0) while
# pred_cutoff is 0.5; if the cutoff is applied to those values directly, nothing is
# ever predicted positive, which would explain the recorded Recall/Precision/F1 of 0.0 — confirm in MetricsHelper.
metricHelper = MetricsHelper(y,masks={},pred_cutoff=0.5)

In [43]:
# 4-fold stratified cross-validation of a class-weighted random forest classifier on the full X.
# NOTE(review): the output saved for this cell is a TypeError about `class_weight`,
# but RandomForestClassifier accepts that argument; the traceback presumably
# belongs to an earlier version of the cell — re-run to confirm.
skf = StratifiedKFold(n_splits=4)
predicted_probas = []  # out-of-fold scores, in fold order
tests = []             # sample indices matching predicted_probas
for i, (train, test) in enumerate(skf.split(X, y)):
    clf = RandomForestClassifier(max_depth=100, random_state=0, oob_score=True, class_weight="balanced").fit(X[train], y[train])
    print(clf.oob_score_)
    mask_key = "split{}".format(i)
    # Register this fold's test indices under its own key before updating the helper.
    metricHelper.masks.update({mask_key: test})
    # Log of the second predict_proba column (+1e-32 avoids log(0)).
    predicted_proba = np.log(clf.predict_proba(X[test])+1e-32)[:,1]
    predicted_probas.extend(predicted_proba.tolist())
    tests.extend(test.tolist())
    metricHelper.update(predicted_proba, mask_key)
    probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
    print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, mask_key))

# Sort the pooled out-of-fold scores by sample index so they line up with y
# (assumes every sample appears in exactly one test fold).
sorted_indices, sorted_probas = zip(*sorted(zip(tests,predicted_probas)))
# NOTE(review): no "all" entry is added to metricHelper.masks in this notebook — confirm how update() treats this key.
metricHelper.update(np.asarray(sorted_probas), "all")
probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, "all"))
# Sample indices ordered from highest to lowest score, then the labels in that order.
ranking = np.argsort(sorted_probas)[::-1]
ordered_truth_val = np.take_along_axis(y, ranking, axis=0)
get_metrics([ordered_truth_val])


TypeError: __init__() got an unexpected keyword argument 'class_weight'

In [54]:
# 4-fold stratified cross-validation of a random forest regressor fitted on the labels, full X.
skf = StratifiedKFold(n_splits=4)
predicted_probas = []  # out-of-fold scores, in fold order
tests = []             # sample indices matching predicted_probas
for i, (train, test) in enumerate(skf.split(X, y)):
    clf = RandomForestRegressor(max_depth=10, random_state=0, oob_score=True,n_estimators=100).fit(X[train], y[train])
    # For a regressor, oob_score_ is an R^2 value, so values near 0 (as recorded) mean little explained variance.
    print(clf.oob_score_)
    mask_key = "split{}".format(i)
    # Register this fold's test indices under its own key before updating the helper.
    metricHelper.masks.update({mask_key: test})
    # Log of the regression output (+1e-32 avoids log(0)).
    predicted_proba = np.log(clf.predict(X[test])+1e-32)
    predicted_probas.extend(predicted_proba.tolist())
    tests.extend(test.tolist())
    metricHelper.update(predicted_proba, mask_key)
    probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
    print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, mask_key))

# Sort the pooled out-of-fold scores by sample index so they line up with y
# (assumes every sample appears in exactly one test fold).
sorted_indices, sorted_probas = zip(*sorted(zip(tests,predicted_probas)))
# NOTE(review): no "all" entry is added to metricHelper.masks in this notebook — confirm how update() treats this key.
metricHelper.update(np.asarray(sorted_probas), "all")
probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, "all"))
# Sample indices ordered from highest to lowest score, then the labels in that order.
ranking = np.argsort(sorted_probas)[::-1]
ordered_truth_val = np.take_along_axis(y, ranking, axis=0)
get_metrics([ordered_truth_val])

-0.0024651262563888388
Accuracy: 0.9678753954733512, Recall: 0.0, Precision: 0.0, AUROC: 0.691199777508553, AUPRC: 0.08119632681880311, F1: 0.0, MRR: 0.010949548395070226, Target: split0
0.003385120324312596
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.6423154512492634, AUPRC: 0.07094565670256417, F1: 0.0, MRR: 0.005925695586192437, Target: split1
0.003624660558230852
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.6112628530078486, AUPRC: 0.05752360455864895, F1: 0.0, MRR: 0.003867986755502078, Target: split2
0.00365368148398848
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.6414804975939898, AUPRC: 0.06675947085948448, F1: 0.0, MRR: 0.005360598945629266, Target: split3
Accuracy: 0.9680520903060914, Recall: 0.0, Precision: 0.0, AUROC: 0.6332571811726954, AUPRC: 0.06527769671736518, F1: 0.0, MRR: 0.0018437209702388976, Target: all


  total_after = np.sum(np.sum(r) for r in rs_filtered)


(0.0018437209702388976,
 0.0019765702400485873,
 6104.346666666666,
 5842.346666666666,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0019047619047619048,
 0.005714285714285714,
 0.005714285714285714,
 0.011428571428571429,
 0.10857142857142857)

In [59]:
from sklearn.svm import SVR
# 4-fold stratified cross-validation of a bagged SVR ensemble (100 estimators) on the full X.
skf = StratifiedKFold(n_splits=4)
predicted_probas = []  # out-of-fold scores, in fold order
tests = []             # sample indices matching predicted_probas
for i, (train, test) in enumerate(skf.split(X, y)):
    # NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator` and later removed the old name.
    clf = BaggingRegressor(base_estimator=SVR(), n_estimators=100, random_state=0).fit(X[train], y[train])
    mask_key = "split{}".format(i)
    # Register this fold's test indices under its own key before updating the helper.
    metricHelper.masks.update({mask_key: test})
    # NOTE(review): SVR outputs are unbounded regression values; np.log returns NaN for any negative prediction — confirm none occur.
    predicted_proba = np.log(clf.predict(X[test])+1e-32)
    predicted_probas.extend(predicted_proba.tolist())
    tests.extend(test.tolist())
    metricHelper.update(predicted_proba, mask_key)
    probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
    print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, mask_key))

# Sort the pooled out-of-fold scores by sample index so they line up with y
# (assumes every sample appears in exactly one test fold).
sorted_indices, sorted_probas = zip(*sorted(zip(tests,predicted_probas)))
# NOTE(review): no "all" entry is added to metricHelper.masks in this notebook — confirm how update() treats this key.
metricHelper.update(np.asarray(sorted_probas), "all")
probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, "all"))
# Sample indices ordered from highest to lowest score, then the labels in that order.
ranking = np.argsort(sorted_probas)[::-1]
ordered_truth_val = np.take_along_axis(y, ranking, axis=0)
get_metrics([ordered_truth_val])

Accuracy: 0.9678753954733512, Recall: 0.0, Precision: 0.0, AUROC: 0.5997020748089393, AUPRC: 0.06817807410334853, F1: 0.0, MRR: 0.012231965429480999, Target: split0
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.5767648712923739, AUPRC: 0.05394504848933844, F1: 0.0, MRR: 0.005632910586311973, Target: split1
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.555011929280385, AUPRC: 0.05001954201187257, F1: 0.0, MRR: 0.002818419867800991, Target: split2
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.6015082909938252, AUPRC: 0.06362872728424708, F1: 0.0, MRR: 0.0038770293652761927, Target: split3
Accuracy: 0.9680520903060914, Recall: 0.0, Precision: 0.0, AUROC: 0.583235149729995, AUPRC: 0.05424950329094309, F1: 0.0, MRR: 0.0016032998754804244, Target: all


  total_after = np.sum(np.sum(r) for r in rs_filtered)


(0.0016032998754804244,
 0.001686632868121468,
 6892.897142857143,
 6630.897142857143,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0038095238095238095,
 0.0038095238095238095,
 0.009523809523809525,
 0.10857142857142857)

In [64]:
from sklearn.svm import SVR
# Same bagged-SVR cross-validation as on the full X, but restricted to the X_noemb column slice.
skf = StratifiedKFold(n_splits=4)
predicted_probas = []  # out-of-fold scores, in fold order
tests = []             # sample indices matching predicted_probas
for i, (train, test) in enumerate(skf.split(X_noemb, y)):
    # NOTE(review): scikit-learn 1.2 renamed `base_estimator` to `estimator` and later removed the old name.
    clf = BaggingRegressor(base_estimator=SVR(), n_estimators=100, random_state=0).fit(X_noemb[train], y[train])
    mask_key = "split{}".format(i)
    # Register this fold's test indices under its own key before updating the helper.
    metricHelper.masks.update({mask_key: test})
    # NOTE(review): SVR outputs are unbounded regression values; np.log returns NaN for any negative prediction — confirm none occur.
    predicted_proba = np.log(clf.predict(X_noemb[test])+1e-32)
    predicted_probas.extend(predicted_proba.tolist())
    tests.extend(test.tolist())
    metricHelper.update(predicted_proba, mask_key)
    probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
    print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, mask_key))

# Sort the pooled out-of-fold scores by sample index so they line up with y
# (assumes every sample appears in exactly one test fold).
sorted_indices, sorted_probas = zip(*sorted(zip(tests,predicted_probas)))
# NOTE(review): no "all" entry is added to metricHelper.masks in this notebook — confirm how update() treats this key.
metricHelper.update(np.asarray(sorted_probas), "all")
probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, "all"))
# Sample indices ordered from highest to lowest score, then the labels in that order.
ranking = np.argsort(sorted_probas)[::-1]
ordered_truth_val = np.take_along_axis(y, ranking, axis=0)
get_metrics([ordered_truth_val])

Accuracy: 0.9678753954733512, Recall: 0.0, Precision: 0.0, AUROC: 0.553662727348923, AUPRC: 0.059093107251652496, F1: 0.0, MRR: 0.011727061472681401, Target: split0
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.5459809937675988, AUPRC: 0.048356044359869045, F1: 0.0, MRR: 0.005329050186394648, Target: split1
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.5317944593627095, AUPRC: 0.04314212378183076, F1: 0.0, MRR: 0.0024338278293824047, Target: split2
Accuracy: 0.9681110029211295, Recall: 0.0, Precision: 0.0, AUROC: 0.5836364439035907, AUPRC: 0.058350027136052876, F1: 0.0, MRR: 0.0036665974626101674, Target: split3
Accuracy: 0.9680520903060914, Recall: 0.0, Precision: 0.0, AUROC: 0.5525737275045799, AUPRC: 0.0481245251012677, F1: 0.0, MRR: 0.001540585457331143, Target: all


  total_after = np.sum(np.sum(r) for r in rs_filtered)


(0.001540585457331143,
 0.0016139647051052607,
 7380.657142857143,
 7118.657142857143,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0019047619047619048,
 0.0038095238095238095,
 0.0038095238095238095,
 0.011428571428571429,
 0.0838095238095238)

In [23]:
# Drop the TreeRankForest name from the kernel namespace (presumably to force a
# clean re-import while debugging). Raises NameError when the name is not
# currently defined, e.g. on a fresh kernel before the import cell has run.
del TreeRankForest

In [8]:
from sandbox.ranking.TreeRankForest import TreeRankForest
from sandbox.ranking.leafrank.DecisionTree import DecisionTree

# NOTE(review): this passes the DecisionTree class itself, whereas the
# cross-validation cell passes an instance (`DecisionTree()`) — confirm which form
# the TreeRankForest constructor expects.
rf = TreeRankForest(DecisionTree)

In [9]:
# 4-fold stratified cross-validation of a TreeRank forest on the X_noemb column slice.
skf = StratifiedKFold(n_splits=4)
predicted_probas = []  # out-of-fold scores, in fold order
tests = []             # sample indices matching predicted_probas
# -1/+1 relabelling of y for the ranking learner, built as a NEW array.
# `y[:]` is only a view of a NumPy array, so relabelling it in place silently
# rewrote the global `y` (and whatever else holds a reference to it) for every
# other cell in the notebook.
y_ranking = np.where(np.asarray(y) == 0, -1, y)
for i, (train, test) in enumerate(skf.split(X_noemb, y)):
    # Construct and fit in two steps so `rf` is the model itself, not whatever
    # learnModel() returns.
    rf = TreeRankForest(DecisionTree())
    rf.learnModel(X_noemb[train], y_ranking[train])
    mask_key = "split{}".format(i)
    # Register this fold's test indices under its own key before updating the helper.
    metricHelper.masks.update({mask_key: test})
    # Score with the model trained in this fold, on the same feature slice it was
    # trained on (previously a stale `clf` from another cell was used, on full X).
    # NOTE(review): assumes rf.predict returns one non-negative 1-D score per row,
    # as the regressors in the other CV cells do — confirm against TreeRankForest.
    predicted_proba = np.log(np.asarray(rf.predict(X_noemb[test]))+1e-32)
    predicted_probas.extend(predicted_proba.tolist())
    tests.extend(test.tolist())
    metricHelper.update(predicted_proba, mask_key)
    probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
    print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, mask_key))

# Sort the pooled out-of-fold scores by sample index so they line up with y
# (assumes every sample appears in exactly one test fold).
sorted_indices, sorted_probas = zip(*sorted(zip(tests,predicted_probas)))
metricHelper.update(np.asarray(sorted_probas), "all")
probability, prediction, accuracy, recall, precision,auroc, auprc, f1, mrr = metricHelper.get_metrics("probability", "prediction", "accuracy", "recall", "precision", "auroc", "auprc", "f1", "mrr")
print("Accuracy: {}, Recall: {}, Precision: {}, AUROC: {}, AUPRC: {}, F1: {}, MRR: {}, Target: {}".format(accuracy, recall, precision, auroc, auprc, f1, mrr, "all"))
# Sample indices ordered from highest to lowest score, then the original 0/1 labels in that order.
ranking = np.argsort(sorted_probas)[::-1]
ordered_truth_val = np.take_along_axis(y, ranking, axis=0)
get_metrics([ordered_truth_val])

Tree: 0 (0.000)

AttributeError: module 'Orange' has no attribute 'Preprocessor_addClassWeight'

In [30]:
def test_monotonic_increasing(array):
    for i in range(len(array)-1):
        try:
            assert array[i] < array[i+1]
        except AssertionError:
            print("Not monotonic increasing between indices {} and {}: {}, {}".format(i,i+1,array[i],array[i+1]))

In [31]:
# Sanity check on the pooled test indices from the most recently run CV cell:
# after sorting they should be strictly increasing, i.e. no sample index is
# repeated across folds. Prints nothing when the check passes.
test_monotonic_increasing(sorted_indices)

In [7]:
def get_metrics(rs, additional_truth=None, get_hits=[]):
    """Compute raw and filtered ranking metrics for binary relevance lists.

    Each entry of ``rs`` is one ranked list of relevance labels (index 0 is
    rank 1; 1 marks a relevant item, 0 an irrelevant one).  Every relevant
    item is scored individually and the scores are averaged over all relevant
    items of all lists:

    * *raw* metrics use the item's rank in the list as given;
    * *filtered* metrics use the rank the item has once all relevant items
      ranked above it in the same list have been removed, so a true positive
      is not penalised for being ranked below another true positive.

    Example (raw MRR; lists taken from
    http://en.wikipedia.org/wiki/Mean_reciprocal_rank):

    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> float(get_metrics(rs)[0])
    0.75

    Args:
        rs: Sequence of relevance lists (lists or 1-D numpy arrays) in rank
            order.  The lists may have different lengths.
        additional_truth: Optional sequence aligned with ``rs``.  Positions
            where ``additional_truth[i]`` is nonzero are deleted from
            ``rs[i]`` before the filtered metrics are computed; they must not
            coincide with relevant items of ``rs[i]``.  The raw metrics are
            not affected.
        get_hits: Unused; kept so existing callers keep working.

    Returns:
        A 12-tuple ``(mrr_raw, mrr_filtered, mean_rank_raw,
        mean_rank_filtered, hits@5_raw, hits@5_filtered, hits@10_raw,
        hits@10_filtered, hits@20_raw, hits@20_filtered, hits@50_raw,
        hits@500_filtered)``.  Note that the last pair uses different cutoffs
        (50 for raw, 500 for filtered).  All values are NaN when no list
        contains a relevant item.

    Raises:
        AssertionError: If ``additional_truth`` removes a relevant item, or
            if the relevance labels are not binary.
    """
    # Accept plain lists as well as arrays, and allow ragged input.
    rs = [np.asarray(r) for r in rs]

    def _pool(chunks):
        # Concatenate per-list position arrays; empty input gives an empty array.
        return np.concatenate(chunks) if chunks else np.empty(0, dtype=int)

    # --- raw metrics: 0-based positions of all relevant items as ranked ---
    raw_pos = _pool([r.nonzero()[0] for r in rs if np.sum(r) > 0])

    mrr_raw = np.mean(1. / (raw_pos + 1))
    mean_rank_raw = np.mean(raw_pos + 1)

    hitsat5_raw = np.mean(raw_pos < 5)
    hitsat10_raw = np.mean(raw_pos < 10)
    hitsat20_raw = np.mean(raw_pos < 20)
    hitsat50_raw = np.mean(raw_pos < 50)

    # Summed list by list: np.sum(rs) fails on ragged input, and np.sum over a
    # generator is deprecated in NumPy, so the builtin sum is used throughout.
    total_before = sum(int(np.sum(r)) for r in rs)

    if additional_truth is not None:
        # also remove all known true examples from the other sets
        rs_prefiltered = []
        for i, additional in enumerate(additional_truth):
            to_delete = np.asarray(additional).nonzero()[0]
            rs_prefiltered.append(np.delete(rs[i], to_delete))

        total_after = sum(int(np.sum(r)) for r in rs_prefiltered)
        assert total_before == total_after  # nothing lost filtering for out-of-sample edges
        rs = rs_prefiltered

    # --- filtered metrics ---
    # Removing the relevant items ranked above the j-th relevant item (j counted
    # from 0) shifts it up by exactly j places, so its filtered position is
    # `position - j`.  This equals repeatedly deleting the best remaining hit.
    filtered_chunks = []
    for r in rs:
        positions = r.nonzero()[0]
        filtered_chunks.append(positions - np.arange(positions.size))
    filtered_pos = _pool(filtered_chunks)

    # One filtered rank per relevant item; this also fails for non-binary labels.
    assert filtered_pos.size == total_before  # nothing lost in filtering

    mrr_filtered = np.mean(1. / (filtered_pos + 1))
    mean_rank_filtered = np.mean(filtered_pos + 1)

    hitsat5_filtered = np.mean(filtered_pos < 5)
    hitsat10_filtered = np.mean(filtered_pos < 10)
    hitsat20_filtered = np.mean(filtered_pos < 20)
    hitsat500_filtered = np.mean(filtered_pos < 500)

    return (mrr_raw, mrr_filtered, mean_rank_raw, mean_rank_filtered, hitsat5_raw, hitsat5_filtered, hitsat10_raw, hitsat10_filtered,
            hitsat20_raw, hitsat20_filtered, hitsat50_raw, hitsat500_filtered)