12
12
import numpy as np
13
13
import math
14
14
from tqdm import tqdm
15
+ import sys
15
16
16
17
from canine import Orchestrator
17
18
from canine .utils import ArgumentHelper
23
24
from .samplers .PhosphoSampler import *
24
25
25
26
from .mapping .mapper import GPmapper
26
- from .utils import hill , parse_resmap , wap
27
- from .utils import get_distance_matrix , transform_distance_matrix , get_pdb_muts_overlap , map_pos_with_weights
27
+ from .utils import hill , parse_resmap , wap , fwap
28
+ from .utils import get_distance_matrix , transform_distance_matrix , get_pdb_muts_overlap , map_pos_with_weights , transform_distance_matrix2
28
29
from .utils import mkdir
29
30
30
31
def main ():
@@ -162,7 +163,7 @@ def main():
162
163
args .tumor_type = None
163
164
164
165
if args .tumor_type and args .pancan_factor != 1.0 :
165
- print ('WARNING: args.pancan_factor is not 1 althought args.tumor_type is set. Correcting to args.pancan_factor=1' )
166
+ print ('WARNING: args.pancan_factor is not 1 althought args.tumor_type is set. Correcting to args.pancan_factor=1' , file = sys . stderr )
166
167
args .pancan_factor = 1.0
167
168
168
169
args .mut_types = set (args .mut_types )
@@ -229,7 +230,7 @@ def main():
229
230
# CLUMPS
230
231
#----------------------------------------
231
232
if args .sampler == 'CoverageSampler' or args .sampler == 'MutspecCoverageSampler' :
232
- print ("Building mapper..." )
233
+ print ("Building mapper..." , file = sys . stderr )
233
234
gpm = GPmapper (hgfile = args .hgfile , spfile = args .fasta , mapfile = args .gpmaps )
234
235
235
236
# Load mutational frequencies
@@ -259,14 +260,29 @@ def main():
259
260
# TODO
260
261
#
261
262
pdbch = pdbch .split ('-' )
263
+
262
264
#splits res map into two lists of number, and a dict of numbers and booleans
263
265
#number: boolean dict represents when numbers are identical...?
264
266
ur ,pr ,prd = parse_resmap (resmap )
265
267
268
+
266
269
if len (ur ) < 5 :
267
- print ("Bad mapping for {}." .format (ur ))
270
+ print ("Bad mapping for {}." .format (ur ), file = sys .stderr )
271
+ continue
272
+
273
+ # Skip structure if there are any negative UniProt -> PDB mappings
274
+ # (cause unknown, but likely an unusably bad structure)
275
+ if (pr < 0 ).any ():
276
+ print (f"WARNING: skipping structure { u1 } ({ pdbch } ) due to negative UniProt -> PDB mappings!" , file = sys .stderr )
268
277
continue
269
278
279
+ # Remove non-unique UniProt -> PDB mappings (likely due to wonky homology modeling)
280
+ nuidx = np .flatnonzero (np .bincount (pr ) > 1 )
281
+ if len (nuidx ):
282
+ rmidx = np .isin (pr , nuidx )
283
+ pr = pr [~ rmidx ]
284
+ ur = ur [~ rmidx ]
285
+ print (f"WARNING: removed { rmidx .sum ()} residues with non-unique UniProt -> PDB mappings!" , file = sys .stderr )
270
286
271
287
# Load Protein file
272
288
protein_muts = map_pos_with_weights (args .muts , u1 , mfreq , args .tumor_type , args .mut_types , args .use_provided_values , args .mut_freq )
@@ -276,35 +292,38 @@ def main():
276
292
## mv: normalized mutation count at each residue
277
293
## mt: cancer types contributing mutations
278
294
mi ,mv ,mt = get_pdb_muts_overlap (ur , protein_muts , args .hill_exp , args .use_provided_values )
295
+ mv = np .c_ [mv ]
279
296
280
297
# Load AA residue coordinates
281
298
if len (mi ) > 0 :
282
299
try :
283
300
D ,x ,pdb_resnames = get_distance_matrix (pdbch , args .pdb_dir , pdb_resids = pr )
284
- DDt = transform_distance_matrix (D , ur , args .xpo )
301
+ #DDt = transform_distance_matrix(D, ur, args.xpo)
302
+ DDt2 = np .tril (transform_distance_matrix2 (D , args .xpo ), - 1 )
285
303
except :
286
- print ("Unable to load PDB..." )
304
+ print ("Unable to load PDB..." , file = sys . stderr )
287
305
continue
288
306
289
307
# print("Sampling {} | {} - {}".format(u1, pdbch, mi))
290
308
291
309
# Compute matrix
292
310
## matrix that holds mv[i]*mv[j] values (sqrt or not)
293
- Mmv = []
294
- mvcorr = range (len (mv ))
295
-
296
- for i in range (len (mi )):
297
- mrow = np .zeros (len (mi ), np .float64 )
298
- for j in range (len (mi )):
299
- #mrow[j] = np.sqrt(mv[i]*mv[j]) ## geometric mean; actually does not perform better in most cases
300
- if args .pancan_factor == 1.0 :
301
- mrow [j ] = mv [i ]* mv [j ]
302
- else :
303
- mrow [j ] = (args .pancan_factor + (1.0 - args .pancan_factor )* (len (mt [i ] & mt [j ])> 0 )) * mv [i ]* mv [j ] ## product
304
- Mmv .append (mrow )
311
+ # Mmv = []
312
+ # mvcorr = range(len(mv))
313
+
314
+ # for i in range(len(mi)):
315
+ # mrow = np.zeros(len(mi), np.float64)
316
+ # for j in range(len(mi)):
317
+ # #mrow[j] = np.sqrt(mv[i]*mv[j]) ## geometric mean; actually does not perform better in most cases
318
+ # if args.pancan_factor == 1.0:
319
+ # mrow[j] = mv[i]*mv[j]
320
+ # else:
321
+ # mrow[j] = (args.pancan_factor + (1.0-args.pancan_factor)*(len(mt[i] & mt[j])>0)) * mv[i]*mv[j] ## product
322
+ # Mmv.append(mrow)
305
323
306
324
# Compute WAP score
307
- wap_obs = wap (mi , mvcorr , Mmv , DDt )
325
+ #wap_obs = wap(mi, mvcorr, Mmv, DDt)
326
+ wap_obs = fwap (mi , mv , DDt2 )
308
327
309
328
# Create Null Sampler
310
329
rnd = 0
@@ -329,7 +348,7 @@ def main():
329
348
# test sampler
330
349
_ = sam .sample (mireal )
331
350
except :
332
- print ("Error initializing {} for {} {} {}." .format (args .sampler , u1 , u2 , pdbch ))
351
+ print ("Error initializing {} for {} {} {}." .format (args .sampler , u1 , u2 , pdbch ), file = sys . stderr )
333
352
continue
334
353
335
354
STARTTIME = time .time ()
@@ -364,8 +383,9 @@ def booster():
364
383
## some samplers will fail to yield a sample in some (small number of) of runs due to combinatorics
365
384
x = sam .sample (mireal )
366
385
367
- mi ,mvcorr = x
368
- r = wap (mi , mvcorr , Mmv , DDt )
386
+ mi_perm , mut_perm_idx = x
387
+ #r = wap(mi, mvcorr, Mmv, DDt)
388
+ r = fwap (mi_perm , mv [mut_perm_idx ], DDt2 )
369
389
370
390
for rr in range (len (args .xpo )):
371
391
wap_rnd [rr ] += r [rr ]
0 commit comments