Skip to content

Commit 13f8c6d

Browse files
committed
resolving merge issues
2 parents ca335c5 + 7daef08 commit 13f8c6d

File tree

3 files changed

+80
-36
lines changed

3 files changed

+80
-36
lines changed

Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@ RUN apt-get update && \
1212
apt-get install -y openjdk-8-jre-headless && \
1313
apt-get clean
1414

15-
COPY . /build
15+
# Install prerequisites
16+
RUN pip install twobitreader statsmodels scipy pyopenssl prody mkl-random mkl-fft lxml jpype1 canine biopython tqdm
1617

1718
# Install clumps
19+
COPY . /build
1820
RUN python3 -m pip install -e .
1921

2022
# Test

clumps/clumps.py

Lines changed: 43 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import numpy as np
1313
import math
1414
from tqdm import tqdm
15+
import sys
1516

1617
from canine import Orchestrator
1718
from canine.utils import ArgumentHelper
@@ -23,8 +24,8 @@
2324
from .samplers.PhosphoSampler import *
2425

2526
from .mapping.mapper import GPmapper
26-
from .utils import hill, parse_resmap, wap
27-
from .utils import get_distance_matrix, transform_distance_matrix, get_pdb_muts_overlap, map_pos_with_weights
27+
from .utils import hill, parse_resmap, wap, fwap
28+
from .utils import get_distance_matrix, transform_distance_matrix, get_pdb_muts_overlap, map_pos_with_weights, transform_distance_matrix2
2829
from .utils import mkdir
2930

3031
def main():
@@ -162,7 +163,7 @@ def main():
162163
args.tumor_type = None
163164

164165
if args.tumor_type and args.pancan_factor != 1.0:
165-
print('WARNING: args.pancan_factor is not 1 althought args.tumor_type is set. Correcting to args.pancan_factor=1')
166+
print('WARNING: args.pancan_factor is not 1 althought args.tumor_type is set. Correcting to args.pancan_factor=1', file = sys.stderr)
166167
args.pancan_factor = 1.0
167168

168169
args.mut_types = set(args.mut_types)
@@ -229,7 +230,7 @@ def main():
229230
# CLUMPS
230231
#----------------------------------------
231232
if args.sampler == 'CoverageSampler' or args.sampler == 'MutspecCoverageSampler':
232-
print("Building mapper...")
233+
print("Building mapper...", file = sys.stderr)
233234
gpm = GPmapper(hgfile=args.hgfile, spfile=args.fasta, mapfile=args.gpmaps)
234235

235236
# Load mutational frequencies
@@ -259,14 +260,29 @@ def main():
259260
# TODO
260261
#
261262
pdbch = pdbch.split('-')
263+
262264
#splits res map into two lists of number, and a dict of numbers and booleans
263265
#number: boolean dict represents when numbers are identical...?
264266
ur,pr,prd = parse_resmap(resmap)
265267

268+
266269
if len(ur) < 5:
267-
print("Bad mapping for {}.".format(ur))
270+
print("Bad mapping for {}.".format(ur), file = sys.stderr)
271+
continue
272+
273+
# Skip structure if there are any negative UniProt -> PDB mappings
274+
# (cause unknown, but likely an unusably bad structure)
275+
if (pr < 0).any():
276+
print(f"WARNING: skipping structure {u1} ({pdbch}) due to negative UniProt -> PDB mappings!", file = sys.stderr)
268277
continue
269278

279+
# Remove non-unique UniProt -> PDB mappings (likely due to wonky homology modeling)
280+
nuidx = np.flatnonzero(np.bincount(pr) > 1)
281+
if len(nuidx):
282+
rmidx = np.isin(pr, nuidx)
283+
pr = pr[~rmidx]
284+
ur = ur[~rmidx]
285+
print(f"WARNING: removed {rmidx.sum()} residues with non-unique UniProt -> PDB mappings!", file = sys.stderr)
270286

271287
# Load Protein file
272288
protein_muts = map_pos_with_weights(args.muts, u1, mfreq, args.tumor_type, args.mut_types, args.use_provided_values, args.mut_freq)
@@ -276,35 +292,38 @@ def main():
276292
## mv: normalized mutation count at each residue
277293
## mt: cancer types contributing mutations
278294
mi,mv,mt = get_pdb_muts_overlap(ur, protein_muts, args.hill_exp, args.use_provided_values)
295+
mv = np.c_[mv]
279296

280297
# Load AA residue coordinates
281298
if len(mi) > 0:
282299
try:
283300
D,x,pdb_resnames = get_distance_matrix(pdbch, args.pdb_dir, pdb_resids=pr)
284-
DDt = transform_distance_matrix(D, ur, args.xpo)
301+
#DDt = transform_distance_matrix(D, ur, args.xpo)
302+
DDt2 = np.tril(transform_distance_matrix2(D, args.xpo), -1)
285303
except:
286-
print("Unable to load PDB...")
304+
print("Unable to load PDB...", file = sys.stderr)
287305
continue
288306

289307
# print("Sampling {} | {} - {}".format(u1, pdbch, mi))
290308

291309
# Compute matrix
292310
## matrix that holds mv[i]*mv[j] values (sqrt or not)
293-
Mmv = []
294-
mvcorr = range(len(mv))
295-
296-
for i in range(len(mi)):
297-
mrow = np.zeros(len(mi), np.float64)
298-
for j in range(len(mi)):
299-
#mrow[j] = np.sqrt(mv[i]*mv[j]) ## geometric mean; actually does not perform better in most cases
300-
if args.pancan_factor == 1.0:
301-
mrow[j] = mv[i]*mv[j]
302-
else:
303-
mrow[j] = (args.pancan_factor + (1.0-args.pancan_factor)*(len(mt[i] & mt[j])>0)) * mv[i]*mv[j] ## product
304-
Mmv.append(mrow)
311+
#Mmv = []
312+
#mvcorr = range(len(mv))
313+
314+
# for i in range(len(mi)):
315+
# mrow = np.zeros(len(mi), np.float64)
316+
# for j in range(len(mi)):
317+
# #mrow[j] = np.sqrt(mv[i]*mv[j]) ## geometric mean; actually does not perform better in most cases
318+
# if args.pancan_factor == 1.0:
319+
# mrow[j] = mv[i]*mv[j]
320+
# else:
321+
# mrow[j] = (args.pancan_factor + (1.0-args.pancan_factor)*(len(mt[i] & mt[j])>0)) * mv[i]*mv[j] ## product
322+
# Mmv.append(mrow)
305323

306324
# Compute WAP score
307-
wap_obs = wap(mi, mvcorr, Mmv, DDt)
325+
#wap_obs = wap(mi, mvcorr, Mmv, DDt)
326+
wap_obs = fwap(mi, mv, DDt2)
308327

309328
# Create Null Sampler
310329
rnd = 0
@@ -329,7 +348,7 @@ def main():
329348
# test sampler
330349
_ = sam.sample(mireal)
331350
except:
332-
print("Error initializing {} for {} {} {}.".format(args.sampler, u1, u2, pdbch))
351+
print("Error initializing {} for {} {} {}.".format(args.sampler, u1, u2, pdbch), file = sys.stderr)
333352
continue
334353

335354
STARTTIME=time.time()
@@ -364,8 +383,9 @@ def booster():
364383
## some samplers will fail to yield a sample in some (small number of) of runs due to combinatorics
365384
x = sam.sample(mireal)
366385

367-
mi,mvcorr = x
368-
r = wap(mi, mvcorr, Mmv, DDt)
386+
mi_perm, mut_perm_idx = x
387+
#r = wap(mi, mvcorr, Mmv, DDt)
388+
r = fwap(mi_perm, mv[mut_perm_idx], DDt2)
369389

370390
for rr in range(len(args.xpo)):
371391
wap_rnd[rr] += r[rr]

clumps/utils.py

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def parse_resmap(resmap):
9595
pr.append(int(y))
9696
prd[int(y)] = True
9797

98-
return ur,pr,prd
98+
return np.r_[ur],np.r_[pr],prd
9999

100100
@contextlib.contextmanager
101101
def gunzipper(gz_file):
@@ -141,14 +141,19 @@ def get_distance_matrix(pdbch, pdb_structures_dir, point='centroid', pdb_resids=
141141
yy = aa.getCoords()
142142
zz = aa.getResnames()
143143

144-
pdb_resids = None
144+
# if list of PDB residues is not provided, look them up
145145
if pdb_resids is None:
146146
pdb_resids = {}
147147
for i in range(len(xx)):
148148
if zz[i] in AMINO_ACID_MAP:
149149
pdb_resids[xx[i]] = True
150150
pdb_resids = sorted(pdb_resids.keys())
151151

152+
# otherwise, perform sanity check that provided residue list comprises valid amino acids
153+
else:
154+
if len(set(pdb_resids) - set(xx[np.r_[[z in AMINO_ACID_MAP for z in zz]]])):
155+
raise ValueError("Invalid PDB residues specified!")
156+
152157
mapped_pdb_to_aa = defaultdict(set)
153158
for idx,resnum in enumerate(xx):
154159
if resnum in pdb_resids:
@@ -163,24 +168,20 @@ def get_distance_matrix(pdbch, pdb_structures_dir, point='centroid', pdb_resids=
163168
coords[xx[i]].append(yy[i]) ## add coordinates of an atom belonging to this residue
164169

165170
## Euclidean distance matrix
166-
D = []
167-
for i in range(len(pdb_resids)):
168-
D.append(sp.zeros(i, dtype=sp.float32))
169-
170171
if point == 'centroid':
171172
## distance between centroids
172173
## calculate residue centroid positions
173174
centroids = {}
174175
for k in coords:
175176
centroids[k] = np.mean(np.array(coords[k]), 0)
176177

177-
co = [centroids[i] for i in pdb_resids] ## pdb residue coordinates
178-
179-
for i in range(len(pdb_resids)):
180-
for j in range(i):
181-
D[i][j] = euclidean(co[i], co[j])
178+
co = np.c_[[centroids[i] for i in pdb_resids]] ## pdb residue coordinates
179+
co2 = (co**2).sum(1, keepdims = True)
180+
D = co2 + co2.T - 2*co@co.T
182181

183182
elif point == 'min':
183+
D = np.zeros(len(pdb_resids)*np.r_[1, 1])
184+
184185
## min-distance (atom pairs)
185186
co = [coords[i] for i in pdb_resids] ## pdb atom coordinates
186187
for i in range(len(pdb_resids)):
@@ -192,6 +193,7 @@ def get_distance_matrix(pdbch, pdb_structures_dir, point='centroid', pdb_resids=
192193
if e < m:
193194
m = e
194195
D[i][j] = m
196+
D = D**2 # TODO: optimize this using fast method employed in centroid method
195197
else:
196198
raise Exception('Unknown setting for point: %s' % point)
197199

@@ -213,12 +215,25 @@ def transform_distance_matrix(D, ur, XPO):
213215
for i in range(len(ur)):
214216
mrow = sp.zeros(i, dtype=sp.float32)
215217
for j in range(i):
216-
mrow[j] = sp.exp(-(D[i][j]**2)/den)
218+
mrow[j] = sp.exp(-(D[i][j])/den)
217219
m.append(mrow)
218220
DDt.append(m)
219221

220222
return DDt
221223

224+
def transform_distance_matrix2(D, XPO):
    """
    Transform distance matrix (vectorized).
    ---------------------------------------
    Applies the kernel exp(-D / (2 * xpo**2)) element-wise to the whole
    distance matrix, once for every soft-threshold value in XPO.

    Args:
        * D: array-like matrix of pairwise distance values. The values are
            used as given; D is NOT squared inside this function.
            NOTE(review): callers presumably pass already-squared distances
            -- confirm against get_distance_matrix.
        * XPO: iterable of soft-threshold values, one output matrix each.

    Returns:
        * list of np.ndarray, each with the same shape as D, in the same
            order as XPO (an empty list when XPO is empty).
    """
    # Coerce once so nested Python lists are accepted as well as ndarrays
    # (unary minus is not defined for plain lists).
    D = np.asarray(D)

    return [np.exp(-D / (2.0 * xpo**2)) for xpo in XPO]
236+
222237
def load_prot_file(protein_dir, uniprot):
223238
"""
224239
Load Protein File
@@ -315,8 +330,15 @@ def wap(mut_indices, mvcorr, Mmv, DDt):
315330
dcol = d[mut_indices[i]]
316331
for j in range(i):
317332
s[mat] += Mmv[mvcorr[i]][mvcorr[j]] * dcol[mut_indices[j]]
333+
318334
return s
319335

336+
def fwap(mi, mv, DDt):
    """
    Fast WAP score.
    ---------------
    For every transformed distance matrix in DDt, evaluates the quadratic
    form mv^T * M[mi, mi] * mv, where M[mi, mi] is the sub-matrix of M
    restricted to the rows and columns listed in mi.

    Args:
        * mi: integer indices (list or 1-D array) selecting rows/columns of
            each matrix in DDt.
        * mv: one weight per entry of mi; either a 1-D vector or an
            (n, 1) column vector is accepted.
        * DDt: sequence (list or 3-D array) of square matrices, one per
            soft-threshold value.

    Returns:
        * 1-D np.ndarray of length len(DDt) holding one score per matrix.
    """
    # Flatten so the product below is a true scalar: with a column-vector
    # mv the result would be a (1, 1) array, and storing that into a scalar
    # slot relies on an implicit conversion that recent NumPy deprecates.
    weights = np.asarray(mv).ravel()

    scores = np.zeros(len(DDt))
    for xpo_idx, ddt in enumerate(DDt):
        sub = ddt[mi, :][:, mi]
        scores[xpo_idx] = weights @ sub @ weights
    return scores
341+
320342
def get_fragment_annot(pdb, ch, pdb_dir):
321343
"""
322344
Get pdb-fragment annotation.

0 commit comments

Comments
 (0)