# Calculate edge circadian scores of drug-disease pairs in indication data

In [1]:
import numpy as np
import pandas as pd
import src.calculate_edge_circa_score as cal

In [2]:
# Load the processed CircaDB table (tab-separated) and preview its first rows.
circa_db = pd.read_csv(
    'data/circa_db_mapped.tsv',
    sep='\t',
)
circa_db.head()

Unnamed: 0,gene_id,Fat SQ_fdr,Fat Visceral_fdr,Aorta_fdr,Artery Coronary_fdr,Artery Tibial_fdr,Colon_fdr,Esophagus_fdr,Heart Atrial_fdr,Liver_fdr,...,Artery Coronary_exp,Artery Tibial_exp,Colon_exp,Esophagus_exp,Heart Atrial_exp,Liver_exp,Lung_exp,Nerve Tibial_exp,Pituitary_exp,Thyroid_exp
0,653635,0.6432,0.1526,0.8443,0.7712,0.9549,0.5059,0.2928,0.6953,0.9732,...,12.3,11.59,12.72,12.3033,5.369,5.406,13.68,19.48,15.84,19.255
1,79854,0.7652,0.2412,0.0473,0.0002,0.6314,0.8602,0.082,0.6481,0.6405,...,5.94,8.419,5.4407,4.6607,2.962,3.24,7.024,12.11,9.898,9.7615
2,643837,0.9075,0.0774,0.7887,0.2173,0.6017,0.3294,0.0696,0.2758,0.3438,...,9.039,11.52,7.1738,7.635,14.35,3.674,6.016,5.872,18.6,6.5845
3,26155,0.8656,0.457,0.5405,0.6391,0.8885,0.3902,0.6801,0.8984,0.3532,...,52.24,62.41,50.845,62.955,30.67,28.42,57.32,69.66,57.56,66.58
4,339451,0.8808,0.6235,0.436,0.4999,0.7446,0.778,0.6238,0.6645,0.9089,...,13.51,10.8,11.62,14.5633,4.87,5.979,15.6,9.224,21.42,15.445


In [3]:
# Load the metapath table (tab-separated, first line is the header) and preview it.
metapath_df = pd.read_csv(
    'data/rephetio_significant_metapaths.tsv',
    sep='\t',
    header=0,
)
metapath_df.head()

Unnamed: 0,abbreviation,length,delta_auroc,negative_log_p
0,CtDaGaD,3,12.70%,7.2485
1,CbGbCtD,3,21.70%,6.4962
2,CbGaD,2,14.50%,6.2137
3,CrCbGaD,3,8.22%,6.0926
4,CbGeAlD,3,8.43%,5.1873


In [4]:
# Metapath abbreviations as a plain list; passed to the scoring call as query_metapath.
metapaths = metapath_df['abbreviation'].tolist()
len(metapaths)

67

### Pre-process drug~disease indication data

In [5]:
# Load the drug~disease indication table from a commit-pinned URL
# (tab-separated, first line is the header) and preview it.
indication_url = 'https://github.com/dhimmel/learn/raw/7668c97b2a6f348479b70fa40c3d7db424584315/summary/indications.tsv'
indication_df = pd.read_csv(indication_url, sep='\t', header=0)
indication_df.head()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type
0,DB01048,Abacavir,DOID:635,acquired immunodeficiency syndrome,TREATS_CtD
1,DB05812,Abiraterone,DOID:10283,prostate cancer,TREATS_CtD
2,DB00659,Acamprosate,DOID:0050741,alcohol dependence,TREATS_CtD
3,DB00284,Acarbose,DOID:9352,type 2 diabetes mellitus,TREATS_CtD
4,DB01193,Acebutolol,DOID:10763,hypertension,TREATS_CtD


In [6]:
# Keep only the rows whose relationship type is 'TREATS_CtD'.
is_treatment = indication_df['rel_type'] == 'TREATS_CtD'
indication_df = indication_df.loc[is_treatment]
len(indication_df)

755

### Pre-process drug half-life data

In [7]:
# Load the curated drug half-life table from a commit-pinned URL
# (tab-separated, first line is the header) and preview it.
half_life_url = 'https://github.com/dhimmel/drugbank/raw/6b9ae386d6ba4a0eca2d66d4b0337a6e90fe81f4/data/drugbank_subset_halflife_curated.tsv'
half_life_df = pd.read_csv(half_life_url, sep='\t', header=0)
half_life_df.head()

Unnamed: 0,type,drugbank_id,name,half_life_hours_curated
0,small molecule,DB00477,Chlorpromazine,30.0
1,small molecule,DB06708,Lumefantrine,108.0
2,small molecule,DB00980,Ramelteon,1.8
3,small molecule,DB01035,Procainamide,3.5
4,biotech,DB00092,Alefacept,270.0


In [8]:
# Keep only drugs whose curated half-life is at most 24 hours
# (rows with a missing half-life compare as False and are dropped).
short_half_life = half_life_df['half_life_hours_curated'].le(24)
half_life_df = half_life_df.loc[short_half_life]
len(half_life_df)

841

### Combine indication and half-life dataframes

In [9]:
# Join indications to half-lives on the DrugBank identifier (default inner join,
# so only drugs present in both tables remain), then keep the columns of interest.
select_col = [
    'compound_id',
    'compound_name',
    'disease_id',
    'disease_name',
    'rel_type',
    'half_life_hours_curated',
]
drug_disease_df = indication_df.merge(
    half_life_df,
    left_on='compound_id',
    right_on='drugbank_id',
)
drug_disease_df = drug_disease_df[select_col]
drug_disease_df.head()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type,half_life_hours_curated
0,DB01048,Abacavir,DOID:635,acquired immunodeficiency syndrome,TREATS_CtD,1.54
1,DB05812,Abiraterone,DOID:10283,prostate cancer,TREATS_CtD,9.5
2,DB00284,Acarbose,DOID:9352,type 2 diabetes mellitus,TREATS_CtD,2.0
3,DB00819,Acetazolamide,DOID:1826,epilepsy syndrome,TREATS_CtD,6.0
4,DB00819,Acetazolamide,DOID:1686,glaucoma,TREATS_CtD,6.0


In [10]:
# Exclude hematologic cancer (DOID:2531): drug~disease pairs for it have tens of
# thousands of edges, which would make the scoring step take a long time to run.
not_hematologic_cancer = drug_disease_df['disease_id'].ne('DOID:2531')
drug_disease_df = drug_disease_df.loc[not_hematologic_cancer]
len(drug_disease_df)

377

### Calculate edge circadian scores of drug-disease pairs

In [11]:
# obtain names of tissues in CircaDB
# A column contributes a tissue when its name contains the '_amp' marker; the
# tissue name is the text before the first occurrence of that marker.
circa_cols = list(circa_db.columns)
tissue_prefixes = [col.split('_amp')[0] for col in circa_cols if '_amp' in col]
# np.unique drops duplicates and returns the names in sorted order
tissues = list(np.unique(tissue_prefixes))

In [12]:
# calculate edge circadian scores
# One entry is appended to each list per drug~disease pair, in row order, so the
# lists stay aligned with the rows of drug_disease_df.
indi_tissues_scores = []
indi_tissues_notes = []
indi_tissues_metapaths = []
indi_tissues_paths = []
# iterate drug~disease pair (positionally, independent of the dataframe index)
for dd, (drug, disease) in enumerate(zip(drug_disease_df['compound_id'],
                                         drug_disease_df['disease_id'])):
    # progress indicator: position of the pair currently being scored
    print(dd,'\n')
    # The helper returns four values per pair; judging by how they are used later,
    # these are the score(s), a note, a metapath count and a path count
    # (NOTE(review): confirm against src.calculate_edge_circa_score).
    score, note, n_meta, n_path = cal.calculate_edge_circa_score(drug, disease, tissues, circa_db,
                                                                 query_metapath = metapaths)
    indi_tissues_scores.append(score)
    indi_tissues_notes.append(note)
    indi_tissues_metapaths.append(n_meta)
    indi_tissues_paths.append(n_path)

0 

1 

2 

3 

4 

5 

6 

7 

8 

9 

10 

11 

12 

13 

14 

15 

16 

17 

18 

19 

20 

21 

22 

23 

24 

25 

26 

27 

28 

29 

30 

31 

32 

33 

34 

35 

36 

37 

38 

39 

40 

41 

42 

43 

44 

45 

46 

47 

48 

49 

50 

51 

52 

53 

54 

55 

56 

57 

58 

59 

60 

61 

62 

63 

64 

65 

66 

67 

68 

69 

70 

71 

72 

73 

74 

75 

76 

77 

78 

79 

80 

81 

82 

83 

84 

85 

86 

87 

88 

89 

90 

91 

92 

93 

94 

95 

96 

97 

98 

99 

100 

101 

102 

103 

104 

105 

106 

107 

108 

109 

110 

111 

112 

113 

114 

115 

116 

117 

118 

119 

120 

121 

122 

123 

124 

125 

126 

127 

128 

129 

130 

131 

132 

133 

134 

135 

136 

137 

138 

139 

140 

141 

142 

143 

144 

145 

146 

147 

148 

149 

150 

151 

152 

153 

154 

155 

156 

157 

158 

159 

160 

161 

162 

163 

164 

165 

166 

167 

168 

169 

170 

171 

172 

173 

174 

175 

176 

177 

178 

179 

180 

181 

182 

183 

184 



In [13]:
# Assemble the per-pair results collected by the scoring loop into one table,
# write it to disk, and preview it.
# score dataframe: columns are relabelled with the tissue names
indi_df = pd.DataFrame(indi_tissues_scores)
indi_df.columns = tissues
# note dataframe
note_df = pd.DataFrame(indi_tissues_notes)
note_df.columns = ['score_note']
# metapath count dataframe 
# NOTE(review): 'metapath_cout' looks like a typo of 'metapath_count'; it is an
# output column header, so renaming it would affect any consumer of the TSV.
meta_count_df = pd.DataFrame(indi_tissues_metapaths)
meta_count_df.columns = ['metapath_cout']
# path count dataframe 
# NOTE(review): 'path_cout' looks like a typo of 'path_count' (same caveat as above).
path_count_df = pd.DataFrame(indi_tissues_paths)
path_count_df.columns = ['path_cout']
# combine dataframes and output 
# drug_disease_df was row-filtered earlier, so its index is reset here to line up
# with the freshly built result frames when concatenating column-wise.
combine_df = pd.concat([drug_disease_df.reset_index(drop = True), indi_df, note_df, meta_count_df, path_count_df], 
                       axis=1)
# tab-separated output: missing values written as 'NA', floats with 4 decimals, no index column
combine_df.to_csv('data/indication_edge_circa_scores.tsv', sep = '\t', na_rep = 'NA', 
                  float_format = '%.4f', index = False)
combine_df.head()

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,rel_type,half_life_hours_curated,Aorta,Artery Coronary,Artery Tibial,Colon,...,Fat Visceral,Heart Atrial,Liver,Lung,Nerve Tibial,Pituitary,Thyroid,score_note,metapath_cout,path_cout
0,DB01048,Abacavir,DOID:635,acquired immunodeficiency syndrome,TREATS_CtD,1.54,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1,1.0
1,DB05812,Abiraterone,DOID:10283,prostate cancer,TREATS_CtD,9.5,,,,,...,,,,,,,,query drug and disease connected by genes not ...,1,0.0
2,DB00284,Acarbose,DOID:9352,type 2 diabetes mellitus,TREATS_CtD,2.0,0.0,0.219674,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3,9.0
3,DB00819,Acetazolamide,DOID:1826,epilepsy syndrome,TREATS_CtD,6.0,0.0,0.183,0.558949,0.0,...,0.0,0.0,0.0,0.0,0.741949,0.0,0.0,,2,6.0
4,DB00819,Acetazolamide,DOID:1686,glaucoma,TREATS_CtD,6.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,4,4.0
