Skip to content

Commit

Permalink
resolving merge issues
Browse files Browse the repository at this point in the history
  • Loading branch information
ajdunford committed Mar 6, 2023
2 parents ca335c5 + 7daef08 commit 13f8c6d
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 36 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
Expand Up @@ -12,9 +12,11 @@ RUN apt-get update && \
apt-get install -y openjdk-8-jre-headless && \
apt-get clean

COPY . /build
# Install prerequisites
RUN pip install twobitreader statsmodels scipy pyopenssl prody mkl-random mkl-fft lxml jpype1 canine biopython tqdm

# Install clumps
COPY . /build
RUN python3 -m pip install -e .

# Test
Expand Down
66 changes: 43 additions & 23 deletions clumps/clumps.py
Expand Up @@ -12,6 +12,7 @@
import numpy as np
import math
from tqdm import tqdm
import sys

from canine import Orchestrator
from canine.utils import ArgumentHelper
Expand All @@ -23,8 +24,8 @@
from .samplers.PhosphoSampler import *

from .mapping.mapper import GPmapper
from .utils import hill, parse_resmap, wap
from .utils import get_distance_matrix, transform_distance_matrix, get_pdb_muts_overlap, map_pos_with_weights
from .utils import hill, parse_resmap, wap, fwap
from .utils import get_distance_matrix, transform_distance_matrix, get_pdb_muts_overlap, map_pos_with_weights, transform_distance_matrix2
from .utils import mkdir

def main():
Expand Down Expand Up @@ -162,7 +163,7 @@ def main():
args.tumor_type = None

if args.tumor_type and args.pancan_factor != 1.0:
print('WARNING: args.pancan_factor is not 1 althought args.tumor_type is set. Correcting to args.pancan_factor=1')
print('WARNING: args.pancan_factor is not 1 althought args.tumor_type is set. Correcting to args.pancan_factor=1', file = sys.stderr)
args.pancan_factor = 1.0

args.mut_types = set(args.mut_types)
Expand Down Expand Up @@ -229,7 +230,7 @@ def main():
# CLUMPS
#----------------------------------------
if args.sampler == 'CoverageSampler' or args.sampler == 'MutspecCoverageSampler':
print("Building mapper...")
print("Building mapper...", file = sys.stderr)
gpm = GPmapper(hgfile=args.hgfile, spfile=args.fasta, mapfile=args.gpmaps)

# Load mutational frequencies
Expand Down Expand Up @@ -259,14 +260,29 @@ def main():
# TODO
#
pdbch = pdbch.split('-')

#splits res map into two lists of number, and a dict of numbers and booleans
#number: boolean dict represents when numbers are identical...?
ur,pr,prd = parse_resmap(resmap)


if len(ur) < 5:
print("Bad mapping for {}.".format(ur))
print("Bad mapping for {}.".format(ur), file = sys.stderr)
continue

# Skip structure if there are any negative UniProt -> PDB mappings
# (cause unknown, but likely an unusably bad structure)
if (pr < 0).any():
print(f"WARNING: skipping structure {u1} ({pdbch}) due to negative UniProt -> PDB mappings!", file = sys.stderr)
continue

# Remove non-unique UniProt -> PDB mappings (likely due to wonky homology modeling)
nuidx = np.flatnonzero(np.bincount(pr) > 1)
if len(nuidx):
rmidx = np.isin(pr, nuidx)
pr = pr[~rmidx]
ur = ur[~rmidx]
print(f"WARNING: removed {rmidx.sum()} residues with non-unique UniProt -> PDB mappings!", file = sys.stderr)

# Load Protein file
protein_muts = map_pos_with_weights(args.muts, u1, mfreq, args.tumor_type, args.mut_types, args.use_provided_values, args.mut_freq)
Expand All @@ -276,35 +292,38 @@ def main():
## mv: normalized mutation count at each residue
## mt: cancer types contributing mutations
mi,mv,mt = get_pdb_muts_overlap(ur, protein_muts, args.hill_exp, args.use_provided_values)
mv = np.c_[mv]

# Load AA residue coordinates
if len(mi) > 0:
try:
D,x,pdb_resnames = get_distance_matrix(pdbch, args.pdb_dir, pdb_resids=pr)
DDt = transform_distance_matrix(D, ur, args.xpo)
#DDt = transform_distance_matrix(D, ur, args.xpo)
DDt2 = np.tril(transform_distance_matrix2(D, args.xpo), -1)
except:
print("Unable to load PDB...")
print("Unable to load PDB...", file = sys.stderr)
continue

# print("Sampling {} | {} - {}".format(u1, pdbch, mi))

# Compute matrix
## matrix that holds mv[i]*mv[j] values (sqrt or not)
Mmv = []
mvcorr = range(len(mv))

for i in range(len(mi)):
mrow = np.zeros(len(mi), np.float64)
for j in range(len(mi)):
#mrow[j] = np.sqrt(mv[i]*mv[j]) ## geometric mean; actually does not perform better in most cases
if args.pancan_factor == 1.0:
mrow[j] = mv[i]*mv[j]
else:
mrow[j] = (args.pancan_factor + (1.0-args.pancan_factor)*(len(mt[i] & mt[j])>0)) * mv[i]*mv[j] ## product
Mmv.append(mrow)
#Mmv = []
#mvcorr = range(len(mv))

# for i in range(len(mi)):
# mrow = np.zeros(len(mi), np.float64)
# for j in range(len(mi)):
# #mrow[j] = np.sqrt(mv[i]*mv[j]) ## geometric mean; actually does not perform better in most cases
# if args.pancan_factor == 1.0:
# mrow[j] = mv[i]*mv[j]
# else:
# mrow[j] = (args.pancan_factor + (1.0-args.pancan_factor)*(len(mt[i] & mt[j])>0)) * mv[i]*mv[j] ## product
# Mmv.append(mrow)

# Compute WAP score
wap_obs = wap(mi, mvcorr, Mmv, DDt)
#wap_obs = wap(mi, mvcorr, Mmv, DDt)
wap_obs = fwap(mi, mv, DDt2)

# Create Null Sampler
rnd = 0
Expand All @@ -329,7 +348,7 @@ def main():
# test sampler
_ = sam.sample(mireal)
except:
print("Error initializing {} for {} {} {}.".format(args.sampler, u1, u2, pdbch))
print("Error initializing {} for {} {} {}.".format(args.sampler, u1, u2, pdbch), file = sys.stderr)
continue

STARTTIME=time.time()
Expand Down Expand Up @@ -364,8 +383,9 @@ def booster():
## some samplers will fail to yield a sample in some (small number of) of runs due to combinatorics
x = sam.sample(mireal)

mi,mvcorr = x
r = wap(mi, mvcorr, Mmv, DDt)
mi_perm, mut_perm_idx = x
#r = wap(mi, mvcorr, Mmv, DDt)
r = fwap(mi_perm, mv[mut_perm_idx], DDt2)

for rr in range(len(args.xpo)):
wap_rnd[rr] += r[rr]
Expand Down
46 changes: 34 additions & 12 deletions clumps/utils.py
Expand Up @@ -95,7 +95,7 @@ def parse_resmap(resmap):
pr.append(int(y))
prd[int(y)] = True

return ur,pr,prd
return np.r_[ur],np.r_[pr],prd

@contextlib.contextmanager
def gunzipper(gz_file):
Expand Down Expand Up @@ -141,14 +141,19 @@ def get_distance_matrix(pdbch, pdb_structures_dir, point='centroid', pdb_resids=
yy = aa.getCoords()
zz = aa.getResnames()

pdb_resids = None
# if list of PDB residues is not provided, look them up
if pdb_resids is None:
pdb_resids = {}
for i in range(len(xx)):
if zz[i] in AMINO_ACID_MAP:
pdb_resids[xx[i]] = True
pdb_resids = sorted(pdb_resids.keys())

# otherwise, perform sanity check that provided residue list comprises valid amino acids
else:
if len(set(pdb_resids) - set(xx[np.r_[[z in AMINO_ACID_MAP for z in zz]]])):
raise ValueError("Invalid PDB residues specified!")

mapped_pdb_to_aa = defaultdict(set)
for idx,resnum in enumerate(xx):
if resnum in pdb_resids:
Expand All @@ -163,24 +168,20 @@ def get_distance_matrix(pdbch, pdb_structures_dir, point='centroid', pdb_resids=
coords[xx[i]].append(yy[i]) ## add coordinates of an atom belonging to this residue

## Euclidean distance matrix
D = []
for i in range(len(pdb_resids)):
D.append(sp.zeros(i, dtype=sp.float32))

if point == 'centroid':
## distance between centroids
## calculate residue centroid positions
centroids = {}
for k in coords:
centroids[k] = np.mean(np.array(coords[k]), 0)

co = [centroids[i] for i in pdb_resids] ## pdb residue coordinates

for i in range(len(pdb_resids)):
for j in range(i):
D[i][j] = euclidean(co[i], co[j])
co = np.c_[[centroids[i] for i in pdb_resids]] ## pdb residue coordinates
co2 = (co**2).sum(1, keepdims = True)
D = co2 + co2.T - 2*co@co.T

elif point == 'min':
D = np.zeros(len(pdb_resids)*np.r_[1, 1])

## min-distance (atom pairs)
co = [coords[i] for i in pdb_resids] ## pdb atom coordinates
for i in range(len(pdb_resids)):
Expand All @@ -192,6 +193,7 @@ def get_distance_matrix(pdbch, pdb_structures_dir, point='centroid', pdb_resids=
if e < m:
m = e
D[i][j] = m
D = D**2 # TODO: optimize this using fast method employed in centroid method
else:
raise Exception('Unknown setting for point: %s' % point)

Expand All @@ -213,12 +215,25 @@ def transform_distance_matrix(D, ur, XPO):
for i in range(len(ur)):
mrow = sp.zeros(i, dtype=sp.float32)
for j in range(i):
mrow[j] = sp.exp(-(D[i][j]**2)/den)
mrow[j] = sp.exp(-(D[i][j])/den)
m.append(mrow)
DDt.append(m)

return DDt

def transform_distance_matrix2(D, XPO):
"""
Transform distance matrix.
--------------------------
Transforms distance matrix.
"""
DDt = [] ## array of transformed distance matrices
for soft_thresh_idx in range(len(XPO)):
den = 2.0 * XPO[soft_thresh_idx]**2
DDt.append(np.exp(-D/den))

return DDt

def load_prot_file(protein_dir, uniprot):
"""
Load Protein File
Expand Down Expand Up @@ -315,8 +330,15 @@ def wap(mut_indices, mvcorr, Mmv, DDt):
dcol = d[mut_indices[i]]
for j in range(i):
s[mat] += Mmv[mvcorr[i]][mvcorr[j]] * dcol[mut_indices[j]]

return s

def fwap(mi, mv, DDt):
scores = np.zeros(len(DDt))
for xpo_idx in range(len(DDt)):
scores[xpo_idx] = mv.T@DDt[xpo_idx][mi, :][:, mi]@mv
return scores

def get_fragment_annot(pdb, ch, pdb_dir):
"""
Get pdb-fragment annotation.
Expand Down

0 comments on commit 13f8c6d

Please sign in to comment.