In [1]:
import random
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import GroupKFold
import pandas as pd
import math
import time

# Get the features and gold standard 

In [2]:
drugfeatfiles = ['features/drugs-fingerprint-sim.txt','features/drugs-se-sim.txt', 
                 'features/drugs-ppi-sim.txt', 'features/drugs-target-go-sim.txt','features/drugs-target-seq-sim.txt']
diseasefeatfiles =['features/diseases-hpo-sim.txt',  'features/diseases-pheno-sim.txt' ]

In [3]:
goldindfile = 'known_associations/predict-gold-standard-omim.txt'
drugDiseaseKnown = pd.read_csv(goldindfile,delimiter='\t') 
drugDiseaseKnown.Disease = drugDiseaseKnown.Disease.astype(str)
drugDiseaseKnown.head()

Unnamed: 0,Drug,Disease
0,DB00659,103780
1,DB00284,125853
2,DB00284,125850
3,DB00284,125851
4,DB00284,600496


# Merge feature matrix

In [4]:
def adjcencydict2matrix(df, name1, name2):
    df1 = df.copy()
    df1= df1.rename(index=str, columns={name1: name2, name2: name1})
    print (len(df))
    df =df.append(df1)
    print (len(df))
    return df.pivot(index=name1, columns=name2)

def mergeFeatureMatrix(drugfeatfiles, diseasefeatfiles):
    for i,featureFilename in enumerate(drugfeatfiles):
        df = pd.read_csv(featureFilename, delimiter='\t')
        cond = df.Drug1 > df.Drug2
        df.loc[cond, ['Drug1', 'Drug2']] = df.loc[cond, ['Drug2', 'Drug1']].values
        if i != 0:
            drug_df=drug_df.merge(df,on=['Drug1','Drug2'],how='inner')
            #drug_df=drug_df.merge(temp,how='outer',on='Drug')
        else:
            drug_df =df
    drug_df.fillna(0, inplace=True)
    
    drug_df = adjcencydict2matrix(drug_df, 'Drug1', 'Drug2')
    drug_df = drug_df.fillna(1.0)

    
    for i,featureFilename in enumerate(diseasefeatfiles):
        print (featureFilename)
        df=pd.read_csv(featureFilename, delimiter='\t')
        cond = df.Disease1 > df.Disease2
        df.loc[cond, ['Disease1','Disease2']] = df.loc[cond, ['Disease2','Disease1']].values
        if i != 0:
            disease_df = disease_df.merge(df,on=['Disease1','Disease2'], how='inner')
            #drug_df=drug_df.merge(temp,how='outer',on='Drug')
        else:
            disease_df = df
    disease_df.fillna(0, inplace=True)
    disease_df.Disease1 = disease_df.Disease1.astype(str)
    disease_df.Disease2 = disease_df.Disease2.astype(str)
    
    disease_df = adjcencydict2matrix(disease_df, 'Disease1', 'Disease2')
    disease_df = disease_df.fillna(1.0)
    
    return drug_df, disease_df

In [5]:
drug_df, disease_df = mergeFeatureMatrix(drugfeatfiles, diseasefeatfiles)

332520
665040


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


features/diseases-hpo-sim.txt
features/diseases-pheno-sim.txt
107880
215760


In [6]:
drug_df.head()

Unnamed: 0_level_0,CHEM-SIM,CHEM-SIM,CHEM-SIM,CHEM-SIM,CHEM-SIM,CHEM-SIM,CHEM-SIM,CHEM-SIM,CHEM-SIM,CHEM-SIM,...,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM,TARGETSEQ-SIM
Drug2,DB00014,DB00035,DB00091,DB00104,DB00122,DB00125,DB00130,DB00136,DB00145,DB00152,...,DB08899,DB08901,DB08906,DB08907,DB08910,DB08911,DB08912,DB08916,DB08918,DB08932
Drug1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
DB00014,1.0,0.74,0.57,0.66,0.32,0.36,0.33,0.4,0.22,0.45,...,0.029618,0.050605,0.032936,0.041449,0.048445,0.044138,0.046573,0.043068,0.043676,0.081918
DB00035,0.74,1.0,0.54,0.71,0.25,0.44,0.4,0.29,0.27,0.48,...,0.028017,0.031601,0.039931,0.020888,0.03973,0.041915,0.031694,0.036733,0.043548,0.12479
DB00091,0.57,0.54,1.0,0.54,0.38,0.31,0.42,0.41,0.29,0.38,...,0.032221,0.037581,0.045833,0.03481,0.054382,0.056235,0.041482,0.032043,0.03776,0.037173
DB00104,0.66,0.71,0.54,1.0,0.26,0.35,0.41,0.38,0.27,0.47,...,0.023592,0.047539,0.029199,0.047738,0.044011,0.030169,0.039007,0.030578,0.0453,0.162962
DB00122,0.32,0.25,0.38,0.26,1.0,0.31,0.27,0.28,0.33,0.37,...,0.037706,0.047971,0.043467,0.03872,0.066144,0.056779,0.03977,0.034736,0.038546,0.04105


In [7]:
disease_df.head()

Unnamed: 0_level_0,HPO-SIM,HPO-SIM,HPO-SIM,HPO-SIM,HPO-SIM,HPO-SIM,HPO-SIM,HPO-SIM,HPO-SIM,HPO-SIM,...,PHENO-SIM,PHENO-SIM,PHENO-SIM,PHENO-SIM,PHENO-SIM,PHENO-SIM,PHENO-SIM,PHENO-SIM,PHENO-SIM,PHENO-SIM
Disease2,100070,102100,102300,102400,102500,103100,103230,103285,103780,104130,...,608710,609135,612219,612227,612376,612416,613985,614266,615555,615688
Disease1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
100070,1.0,0.038424,0.035952,0.037063,0.322315,0.03755,0.037034,0.217807,0.025429,0.036986,...,0.188982,0.142009,0.2,0.234216,0.181444,0.204939,0.204656,0.305687,0.228665,0.187292
102100,0.038424,1.0,0.194242,0.393386,0.406072,0.328159,0.309476,0.403815,0.035506,0.285586,...,0.029161,0.059761,0.077152,0.090351,0.0,0.067763,0.055728,0.170783,0.079388,0.123856
102300,0.035952,0.194242,1.0,0.256683,0.213125,0.407773,0.255908,0.154389,0.271504,0.34628,...,0.156162,0.160019,0.15891,0.241926,0.16435,0.195402,0.223828,0.251259,0.239826,0.165821
102400,0.037063,0.393386,0.256683,1.0,0.468868,0.329036,0.288966,0.312454,0.034242,0.251431,...,0.0,0.03371,0.043519,0.050965,0.0,0.050965,0.031435,0.110096,0.089562,0.104796
102500,0.322315,0.406072,0.213125,0.468868,1.0,0.264019,0.290357,0.388354,0.117129,0.385163,...,0.068816,0.117525,0.144138,0.159913,0.115624,0.133261,0.15343,0.172726,0.239389,0.17659


# Generate positive and negative pairs

In [8]:
def generatePairs(drug_df, disease_df, drugDiseaseKnown):
    drugwithfeatures = set(drug_df.columns.levels[1])
    diseaseswithfeatures = set(disease_df.columns.levels[1])
    
    drugDiseaseDict  = set([tuple(x) for x in  drugDiseaseKnown[['Drug','Disease']].values])

    commonDrugs= drugwithfeatures.intersection( drugDiseaseKnown.Drug.unique())
    commonDiseases=  diseaseswithfeatures.intersection(drugDiseaseKnown.Disease.unique() )
    print ("commonDrugs: %d commonDiseases : %d"%(len(commonDrugs),len(commonDiseases)))

    #abridged_drug_disease = [(dr,di)  for  (dr,di)  in drugDiseaseDict if dr in drugwithfeatures and di in diseaseswithfeatures ]

    #commonDrugs = set( [ dr  for dr,di in  abridged_drug_disease])
    #commonDiseases  =set([ di  for dr,di in  abridged_drug_disease])

    print ("Gold standard, associations: %d drugs: %d diseases: %d"%(len(drugDiseaseKnown),len(drugDiseaseKnown.Drug.unique()),len(drugDiseaseKnown.Disease.unique())))
    print ("Drugs with features: %d Diseases with features: %d"%(len(drugwithfeatures),len(diseaseswithfeatures)))
    print ("commonDrugs: %d commonDiseases : %d"%(len(commonDrugs),len(commonDiseases)))

    pairs=[]
    classes=[]
    for dr in commonDrugs:
        for di in commonDiseases:
            cls = (1 if (dr,di) in drugDiseaseDict else 0)
            pairs.append((dr,di))
            classes.append(cls)
            
    return pairs, classes

In [9]:
pairs, classes = generatePairs(drug_df, disease_df, drugDiseaseKnown)

commonDrugs: 505 commonDiseases : 310
Gold standard, associations: 1933 drugs: 592 diseases: 313
Drugs with features: 816 Diseases with features: 465
commonDrugs: 505 commonDiseases : 310


# Balance negative samples/postives 

In [10]:
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedKFold
def balance_data(pairs, classes, n_proportion):
    classes = np.array(classes)
    pairs = np.array(pairs)
    
    indices_true = np.where(classes == 1)[0]
    indices_false = np.where(classes == 0)[0]

    np.random.shuffle(indices_false)
    indices = indices_false[:(n_proportion*indices_true.shape[0])]
    print ("+/-:", len(indices_true), len(indices), len(indices_false))
    pairs = np.concatenate((pairs[indices_true], pairs[indices]), axis=0)
    classes = np.concatenate((classes[indices_true], classes[indices]), axis=0) 
    
 
    return pairs, classes

In [11]:
n_proportion = 2
pairs, classes= balance_data(pairs, classes, n_proportion)

+/-: 1718 3436 154832


# Train-Test Splitting

In [12]:
pairs_train, pairs_test, classes_train, classes_test = model_selection.train_test_split(pairs, classes, stratify=classes, test_size=0.2, shuffle=True)

In [13]:
len(pairs_train), len(pairs_test)

(4123, 1031)

# Feature extraction (Best Combined similarity)

In [14]:
def geometricMean(drug, disease, knownDrugDisease, drugDF, diseaseDF):
    #print (drug, disease)
    a  = drugDF.loc[knownDrugDisease[:,0]][drug].values
    b  = diseaseDF.loc[knownDrugDisease[:,1]][disease].values
    #print (a,b)
    c = np.sqrt( np.multiply(a,b) )
    ix2 = (knownDrugDisease == [drug, disease])
    c[ix2[:,1]& ix2[:,0]]=0.0
    return float(np.max(c))


def createFeatureDF(pairs, classes, knownDrugDisease, drugDFs, diseaseDFs):
    totalNumFeatures = len(drugDFs)*len(diseaseDFs)
    #featureMatri x= np.empty((len(classes),totalNumFeatures), float)
    df =pd.DataFrame(list(zip(pairs[:,0], pairs[:,1], classes)), columns =['Drug','Disease','Class'])
    index = 0
    for i,drug_col in enumerate(drugDFs.columns.levels[0]):
        for j,disease_col in enumerate(diseaseDFs.columns.levels[0]):
            drugDF = drugDFs[drug_col]
            diseaseDF = diseaseDFs[disease_col]
            df["Feature_"+str(drug_col)+'_'+str(disease_col)] = df.apply(lambda row: geometricMean( row.Drug, row.Disease, knownDrugDisease, drugDF, diseaseDF), axis=1)
    return df

def calculateCombinedSimilarity(pairs_train, pairs_test, classes_train, classes_test, drug_df, disease_df, knownDrugDisease):
    train_df  = createFeatureDF(pairs_train, classes_train, knownDrugDisease, drug_df, disease_df)
    test_df = createFeatureDF(pairs_test, classes_test, knownDrugDisease, drug_df, disease_df)
    return train_df, test_df

In [15]:
knownDrugDisease= pairs_train[classes_train==1]
train_df, test_df = calculateCombinedSimilarity(pairs_train, pairs_test, classes_train, classes_test, drug_df, disease_df, knownDrugDisease)

# Model Training

In [16]:
from sklearn import tree, ensemble
from sklearn import svm, linear_model, neighbors

def trainModel(train_df, clf):
    features = list(train_df.columns.difference(['Drug','Disease','Class']))
    X = train_df[features]
    y = train_df['Class']
    print ('fiting classifier...')
    clf.fit(X, y)
    return clf

  from numpy.core.umath_tests import inner1d


In [17]:
n_seed = 100
clf = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, random_state=n_seed) 
clf = trainModel(train_df, clf)

fiting classifier...


# Evaulation 

In [18]:
from sklearn import metrics
import numbers
def multimetric_score(estimator, X_test, y_test, scorers):
    """Return a dict of score for multimetric scoring"""
    scores = {}
    for name, scorer in scorers.items():
        if y_test is None:
            score = scorer(estimator, X_test)
        else:
            score = scorer(estimator, X_test, y_test)

        if hasattr(score, 'item'):
            try:
                # e.g. unwrap memmapped scalars
                score = score.item()
            except ValueError:
                # non-scalar?
                pass
        scores[name] = score

        if not isinstance(score, numbers.Number):
            raise ValueError("scoring must return a number, got %s (%s) "
                             "instead. (scorer=%s)"
                             % (str(score), type(score), name))
    return scores

def evaluate(test_df, clf):
    features = list(train_df.columns.difference(['Drug','Disease','Class']))
    X_test =  test_df[features]
    y_test = test_df['Class']

    scoring = ['precision', 'recall', 'accuracy', 'roc_auc', 'f1', 'average_precision']
    scorers, multimetric = metrics.scorer._check_multimetric_scoring(clf, scoring=scoring)
    scores = multimetric_score(clf, X_test, y_test, scorers)
    return scores

In [19]:
scores = evaluate(test_df, clf)
print ("Test:",scores)

Test: {'f1': 0.760655737704918, 'average_precision': 0.8566906000076772, 'accuracy': 0.8583899127061105, 'recall': 0.6744186046511628, 'roc_auc': 0.8802152093700281, 'precision': 0.8721804511278195}


# 10-fold drug-disjoint cross-validation (PREDICT - CV scheme )

In [20]:
disjoint = True
n_fold = 10

if disjoint:
    print ('Disjoint')
    groups = pairs[:,0] # group by drug
    group_kfold = GroupKFold(n_splits=n_fold)
    cv = group_kfold.split(pairs, classes, groups)
else:
    print ('Non-disjoint')
    skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=n_seed)
    cv = skf.split(pairs, classes)

n_seed = 100
cv_results = pd.DataFrame()
clf = linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, random_state=n_seed) 
  
for i, (train, test) in enumerate(cv):
    print ('Fold',i+1)
    start_time = time.time()
    pairs_train = pairs[train]
    classes_train = classes[train] 
    pairs_test = pairs[test]
    classes_test = classes[test]
    knownDrugDisease= pairs_train[classes_train==1]
    
    train_df, test_df = calculateCombinedSimilarity(pairs_train, pairs_test, classes_train, classes_test, drug_df, disease_df, knownDrugDisease)
    elapsed_time = time.time() - start_time
    print ('Time elapsed to generate features:',time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

    clf = trainModel(train_df, clf)
    
    scores = evaluate(test_df, clf)
    print ("Scores:",scores)
    cv_results = cv_results.append(scores, ignore_index=True)

Disjoint
Fold 1
Time elapsed to generate features: 00:05:04
fiting classifier...
Scores: {'f1': 0.6846153846153846, 'average_precision': 0.7836834260939477, 'accuracy': 0.8410852713178295, 'recall': 0.55625, 'roc_auc': 0.8145979634831461, 'precision': 0.89}
Fold 2
Time elapsed to generate features: 00:05:10
fiting classifier...
Scores: {'f1': 0.7307692307692308, 'average_precision': 0.8337392888758675, 'accuracy': 0.8640776699029126, 'recall': 0.59375, 'roc_auc': 0.8577728873239437, 'precision': 0.95}
Fold 3
Time elapsed to generate features: 00:05:05
fiting classifier...
Scores: {'f1': 0.6199261992619927, 'average_precision': 0.8095055766497591, 'accuracy': 0.8, 'recall': 0.46408839779005523, 'roc_auc': 0.8326330763886591, 'precision': 0.9333333333333333}
Fold 4
Time elapsed to generate features: 00:05:01
fiting classifier...
Scores: {'f1': 0.6024096385542169, 'average_precision': 0.8064347698912219, 'accuracy': 0.8081395348837209, 'recall': 0.43859649122807015, 'roc_auc': 0.834960589

In [21]:
cv_results.mean()

accuracy             0.833336
average_precision    0.810000
f1                   0.685671
precision            0.918820
recall               0.549804
roc_auc              0.836133
dtype: float64

In [None]:
cv_results.to_csv('results/DCV_runs_transd.csv')