/
input_output.py
251 lines (171 loc) · 7.79 KB
/
input_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
"""
This file is part of Scuba.
Copyright (C) 2016 by Guido Zampieri, Dinh Tran, Michele Donini.
Scuba is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Scuba is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with Scuba. If not, see <http://www.gnu.org/licenses/>.
@author: Guido Zampieri
"""
import pandas as pd
import numpy as np
def load_all_genes(kernel_path):
    """ Load the full list of genes with corresponding indices.

    Parameters:
        kernel_path -- Path of the file listing one kernel matrix path per line.

    Return:
        all_genes_dict -- Dictionary whose keys are the genes represented by the
                          kernels and whose values are the corresponding indices
                          (0, 1, 2, ... in order of first appearance across the
                          index files).

    Notes:
        For every listed path, the companion index file is located by taking
        the part of the path preceding '_Matrix' and appending '_Index.txt'.
        The first space-separated column of that file is read as the gene name.
        Blank lines in the list file are ignored; an empty list yields {}.
    """
    with open(kernel_path, 'r') as f:
        kernel_path_list = [line.rstrip() for line in f]
    # Skip blank lines: they would otherwise resolve to a bogus '_Index.txt' path
    kernel_path_list = [path for path in kernel_path_list if path]

    index_tables = [
        pd.read_table(path.split('_Matrix')[0] + '_Index.txt', sep=' ', header=None)
        for path in kernel_path_list
    ]
    if not index_tables:
        return {}

    # Concatenate once; pd.unique keeps genes in order of first appearance
    all_genes_list = list(pd.unique(pd.concat(index_tables)[0]))
    return {gene: index for index, gene in enumerate(all_genes_list)}
def load_kernels(kernel_path, all_genes_dict):
    """ Load the list of kernels.

    Parameters:
        kernel_path -- Path of the file containing kernels' paths (one per line).
        all_genes_dict -- Dictionary whose keys are genes represented by kernels
                          and whose values are corresponding indices.

    Return:
        kernel_name_list -- List of kernel file names.
        kernel_list -- List of pandas data frames, each one representing a
                       kernel matrix labelled by gene name on both axes.

    Notes:
        If the gene sets represented by the kernel matrices do not fully
        overlap, these are expanded to include the global set, with the mean
        of the kernel's own entries replacing missing values.
        Each expanded matrix is then divided by the mean of its diagonal, so
        that the average diagonal entry equals 1.
        Rows and columns are ordered by the index stored in all_genes_dict, so
        positional access matches those indices (this assumes the indices are
        0..n-1 without gaps -- confirm against the caller).
        Blank lines in the list file are ignored.
    """
    with open(kernel_path, 'r') as f:
        kernel_path_list = [line.rstrip() for line in f]
    kernel_path_list = [path for path in kernel_path_list if path]

    # Order genes by their assigned index rather than by dict iteration order,
    # which is arbitrary on Python 2 and would misalign positional lookups.
    ordered_genes = sorted(all_genes_dict, key=all_genes_dict.get)

    kernel_list = []
    for path in kernel_path_list:
        # Load kernel matrix
        kernel_matrix = pd.DataFrame(data=np.load(path))
        # Table mapping each matrix row index to its gene name
        gene_index_table = pd.read_table(
            path.split('_Matrix')[0] + '_Index.txt',
            sep=' ',
            names=['gene_name', 'gene_index'],
            index_col='gene_index',
        )
        # Mean kernel value, computed before the gene_name column is added;
        # float() avoids integer floor division for integer-typed kernels
        kernel_mean = float(kernel_matrix.values.mean())
        # Attach gene names (aligned on the row index) and use them as labels
        kernel_matrix['gene_name'] = gene_index_table.gene_name
        kernel_matrix = kernel_matrix.set_index('gene_name')
        kernel_matrix.columns = list(kernel_matrix.index)
        # 1) Reindex to all genes list (rows)
        # 2) Transpose matrix (rows and columns are inverted)
        # 3) Reindex to all genes list (former columns)
        # 4) Replace missing values with the kernel mean
        final_kernel_matrix = (
            kernel_matrix.reindex(ordered_genes)
            .transpose()
            .reindex(ordered_genes)
            .fillna(kernel_mean)
        )
        # Normalize so that the mean of the diagonal is 1
        mean_diagonal = np.mean(np.diag(final_kernel_matrix.values))
        final_kernel_matrix = final_kernel_matrix.divide(mean_diagonal)
        # Update the list of kernels
        kernel_list.append(final_kernel_matrix)

    kernel_name_list = [path.split('/')[-1] for path in kernel_path_list]
    return kernel_name_list, kernel_list
def extract_submatrices(row_indeces, column_indeces, kernel_list):
    """ Slice the same rows and columns out of every kernel.

    Parameters:
        row_indeces -- List of row positions to keep.
        column_indeces -- List of column positions to keep.
        kernel_list -- List of pandas data frames representing kernel matrices.

    Return:
        subkernel_list -- List of pandas data frames, one per input kernel, each
                          holding the requested rows and columns (selected by
                          position).
    """
    return [kernel.iloc[row_indeces, column_indeces] for kernel in kernel_list]
def load_multiple_indeces(path, all_genes_dict):
    """ Load several gene lists, one per file named in a list file.

    Parameters:
        path -- Path of the file listing gene-list paths, one per line.
        all_genes_dict -- Dictionary whose keys are genes represented by kernels
                          and whose values are corresponding indices.

    Return:
        genes -- List with one list of gene indices per listed file, in the
                 order the files appear.
    """
    with open(path, 'r') as list_file:
        gene_list_paths = [entry.rstrip() for entry in list_file]
    # Each listed file is resolved to indices by load_indeces
    return [
        load_indeces(gene_list_path, all_genes_dict)
        for gene_list_path in gene_list_paths
    ]
def load_indeces(path, all_genes_dict):
    """ Load a list of genes from file.

    Parameters:
        path -- Path of the list of genes (one gene name per line).
        all_genes_dict -- Dictionary whose keys are genes represented by kernels
                          and whose values are corresponding indices.

    Return:
        indeces -- List of indices of the genes found in all_genes_dict, in
                   file order.

    Notes:
        Genes absent from all_genes_dict are dropped; their count is reported
        on standard output.
    """
    with open(path, 'r') as f:
        genes = [line.rstrip() for line in f]
    # Test membership on the dict itself: constant time per gene, whereas
    # `.keys()` materialises a list that is scanned linearly on Python 2
    indeces = [all_genes_dict[gene] for gene in genes if gene in all_genes_dict]
    discarded_count = len(genes) - len(indeces)
    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function
    print('%s :  %d genes not present in kernels' % (path, discarded_count))
    return indeces
def load_targets(path, all_genes_dict):
    """ Load target genes from a file listing seed-list paths.

    Parameters:
        path -- Path of the file listing seed gene list paths, one per line.
        all_genes_dict -- Dictionary whose keys are genes represented by kernels
                          and whose values are corresponding indices.

    Return:
        target_names -- List of target gene names.
        target_indeces -- List of target gene indices.

    Notes:
        The name of each target gene is the last '/'-separated component of the
        corresponding seed file path. A target that is not a key of
        all_genes_dict raises KeyError.
    """
    with open(path, 'r') as list_file:
        seed_paths = [entry.rstrip() for entry in list_file]

    target_names = []
    for seed_path in seed_paths:
        # Keep only the file name: it doubles as the target gene name
        target_names.append(seed_path.rsplit('/', 1)[-1])

    target_indeces = []
    for name in target_names:
        target_indeces.append(all_genes_dict[name])

    return target_names, target_indeces
def save_rank(path, scores, genes, all_genes_dict):
    """ Save to file a gene rank.

    Parameters:
        path -- Output path; '_rank.txt' is appended to it.
        scores -- List of scores.
        genes -- List of gene indices, parallel to scores.
        all_genes_dict -- Dictionary whose keys are genes represented by kernels
                          and whose values are corresponding indices.

    Notes:
        One line per gene is written as '<gene name>\\t<score>', sorted by
        decreasing score (ties broken by decreasing gene index).
        ValueError is raised if a gene index is not a value of all_genes_dict.
    """
    # Invert the mapping once instead of scanning keys/values for every gene;
    # dict views are also not indexable on Python 3. setdefault keeps the first
    # gene seen for an index, matching a first-match search.
    index_to_gene = {}
    for gene_name, gene_index in all_genes_dict.items():
        index_to_gene.setdefault(gene_index, gene_name)

    rank = sorted(zip(scores, genes), reverse=True)
    with open(path + '_rank.txt', 'w') as f:
        for score, gene in rank:
            if gene not in index_to_gene:
                raise ValueError('%r is not an index in all_genes_dict' % (gene,))
            f.write('%s\t%s\n' % (index_to_gene[gene], str(score)))
def save_details(path, best_lambda, kernel_names, weights):
    """ Write the learned parameters to '<path>_details.txt'.

    Parameters:
        path -- Output path; '_details.txt' is appended to it.
        best_lambda -- Learned value of lambda.
        kernel_names -- List of kernels' names.
        weights -- List of learned linear coefficients for kernels, parallel to
                   kernel_names.
    """
    details_path = path + "_details.txt"
    with open(details_path, 'w') as out:
        out.write("Regularization hyper-parameter value:\n\t" + str(best_lambda) + "\n\n")
        out.write("Weights of kernels:\n")
        # One tab-indented '<kernel name>\t<weight>' line per weight
        for position, weight in enumerate(weights):
            kernel_name = kernel_names[position]
            out.write("\t" + kernel_name + "\t" + str(weight) + "\n")
def save_results(path, results):
    """ Write the overall performance to '<path>_results.txt'.

    Parameters:
        path -- Output path; '_results.txt' is appended to it.
        results -- Dictionary containing the results for each quantity computed.

    Notes:
        One line per measure is written as '<measure>\\t\\t<value>', with
        measures in sorted order.
    """
    results_path = path + "_results.txt"
    with open(results_path, 'w') as out:
        for measure in sorted(results):
            value_text = str(results[measure])
            out.write(measure + "\t\t" + value_text + "\n")