In [1]:
import os
import sys

from Bio.Seq import Seq
import gzip
from twobitreader import TwoBitFile
import pandas as pd
from collections import defaultdict

In [2]:
from mapper import GPmapper, make_gp_file, split_muts_file
from spectra import calc_muts_spectra, calc_muts_freq

### Clumps Pipeline

In [3]:
# Input MAF for this run; the commented-out path is an alternative dataset.
input_maf = './inputs/PMBL.clumps.maf'
#inMaf = './inputs/brca_prosp_v2.1_CPTAC2_BRCA_prospective.v1.4.somatic.variants.070918.maf'

# Intermediate file written by make_gp_file and then consumed by split_muts_file.
gp_file = 'muts.gp'

# Step 1: build the mapper and write the GP file from the MAF.
gpm = GPmapper()
make_gp_file(gpm, input_maf, output_file=gp_file)

# Step 2: split the GP file
# (presumably into the per-protein files later read from MUTURL -- TODO confirm).
split_muts_file(gp_file)

# Step 3: per-sample mutational spectra
# (presumably the source of 'sampleMutSpectra.txt' -- TODO confirm).
calc_muts_spectra(input_maf)

# Step 4: per-sample mutation frequencies
# (presumably the source of 'sampleMutFreq.txt' -- TODO confirm).
calc_muts_freq(input_maf)

ONO 654 2




dir exists.
SHOULD BE A MAF WITH ALL CODING MUTATIONS (NOT JUST MISSENSES)
Tumor Types: 1
	PMBL | 37 samples


In [None]:
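# TODO: create the huniprot2pdb chunk files (see the commented-out chunked MAPSFILE path below).
# Open question: do the source files need to be downloaded first?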

In [3]:
import random
import time

from gzip import GzipFile
#from lib import *
from multiprocessing import Process, Queue
from samplers.UniformSampler import *
from samplers.CoverageSampler import *
from samplers.MutspecCoverageSampler import *

In [4]:
# --- Run identity ---------------------------------------------------------
RESFLAG = 'pmbl.rerun'
TTYPE = 'PMBL'      # tumour type; the value 'PanCan' is mapped to None further down
NUMCORES = 1

# --- Inputs ---------------------------------------------------------------
# Directory of per-protein mutation files (joined with the UniProt id when scanning).
MUTURL = './splitByProtein/' # 'http://compute-a-16-139:9014/p='
# Mutation classes to keep; 'M' is presumably missense -- TODO confirm.
MUTTYPES = {'M'}
SAMPLEMUTFREQWEIGHT = 'sampleMutFreq.txt'
PATMUTSPECTRA = 'sampleMutSpectra.txt'
# Settings carried over from the original script, currently disabled:
#MAPURL = '../res/huniprot2pdb.run18.split/'#'http://compute-a-17-138:9000/s='
#COVERAGETRACK = '/home/ajd27/clumps2/dat/WEx_cov.fwb'
#SAMPLER = 'MutspecCoverageSampler'

# --- Algorithm parameters -------------------------------------------------
MAXRAND = 10_000_000
HILLEXP = 3
PANCANFACTOR = 1.0
XPO = [3, 4.5, 6, 8, 10]  ## 'soft threshold' parameter for the truncated Gaussian fct.
USEPROVIDEDVALUES = 0
CLUMPSMUTSAMPLEFILT = 3

In [5]:
# Mapping table scanned by the structure loop: one tab-separated record per line.
# The chunked variant is kept for reference.
#MAPSFILE = './dat/huniprot2pdb_chunks/huniprot2pdb_chunk_00000.gz'
MAPSFILE = './dat/huniprot2pdb.run18.filt.txt.gz'

# SHARD only makes sense for the chunked file naming scheme, so it stays disabled.
#SHARD = MAPSFILE.rsplit('.',1)[0].rsplit('_',1)[1]  ## chunk identifier of the mapsfile
LINEIDX = 0

# 'PanCan' is normalised to None (presumably "no tumour-type restriction" -- TODO confirm).
TTYPE = None if TTYPE == 'PanCan' else TTYPE

# With a specific tumour type set, PANCANFACTOR is forced back to 1.0.
if PANCANFACTOR != 1.0 and TTYPE:
    print('WARNING: pancanfactor is not 1 althought TTYPE is set. Correcting to pancanfactor=1')
    PANCANFACTOR = 1.0

In [6]:
# Force a fresh import of the project-local `utils` module so that edits to it
# are picked up without restarting the kernel. `pop` with a default is a no-op
# when the module has not been imported yet, which removes the need for the
# previous bare `except: pass` (that also swallowed KeyboardInterrupt and any
# unrelated error).
sys.modules.pop('utils', None)

from utils import *

In [8]:
xpol = len(XPO)  # number of soft-threshold exponents in XPO

num_pdbs = 0
# Scan the mapping table and stop at the first record whose protein has at
# least one mutation overlapping the mapped residues (len(mi) > 0).
with gzip.open(MAPSFILE, 'r') as f:
    for idx,line in enumerate(f):
        # Five tab-separated fields per record; the last one (resmap) may itself contain tabs.
        u1,u2,pdbch,alidt,resmap = line.decode('utf-8').strip('\n').split('\t', 4)

        # Only consider proteins that have a mutation file under MUTURL.
        if os.path.isfile(os.path.join(MUTURL,u1)):
            pdbch = pdbch.split('-')

            ur,pr,prd = parse_resmap(resmap)

            if len(ur) < 5:
                ## number of mapped residues (between uniprot and pdb) is too small.
                ## The stand-alone script terminated here with exit(0); inside a
                ## notebook that shuts down the kernel, so skip this record instead.
                continue

            # Get AA residue Coordinates
            D,x = get_distance_matrix(pdbch, pdb_resids=pr)

            # Transform distance matrix
            DDt = transform_distance_matrix(D, ur, XPO)

            # Load mutational frequencies
            mfreq = load_mut_freqs(SAMPLEMUTFREQWEIGHT)

            # Load Protein file
            protein_muts = map_pos_with_weights(MUTURL, u1, mfreq, TTYPE, MUTTYPES, USEPROVIDEDVALUES, SAMPLEMUTFREQWEIGHT)

            # mi: index of mutated residue
            # mv: normalized mutation count at each residue
            # mt: cancer types contributing mutations
            mi,mv,mt = get_pdb_muts_overlap(ur, protein_muts, HILLEXP, USEPROVIDEDVALUES)

            if len(mi) > 0:
                break

@> 1120 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 2447 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 1019 atoms and 20 coordinate set(s) were parsed in 0.06s.
@> 1492 atoms and 20 coordinate set(s) were parsed in 0.08s.
@> 876 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 951 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 1029 atoms and 1 coordinate set(s) were parsed in 0.01s.
@> 6305 atoms and 1 coordinate set(s) were parsed in 0.08s.
@> 896 atoms and 15 coordinate set(s) were parsed in 0.04s.
@> 2828 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 2971 atoms and 1 coordinate set(s) were parsed in 0.03s.
@> 7749 atoms and 1 coordinate set(s) were parsed in 0.08s.


In [11]:
#pth = '../res/clumps/%s-%d_%s_%s_%s-%s_%s' % (SHARD, LINEIDX, u1, u2, pdbch[0], pdbch[1], resmap.split(':',1)[0])

In [9]:
# Display the overlap left behind by the scan loop: mutated residue indices (mi),
# their normalized mutation counts (mv) and the contributing cancer types (mt).
mi,mv,mt

([53, 866],
 [0.0002698639871710061, 0.0002698639871710061],
 [{'PMBL'}, {'PMBL'}])

In [10]:
# Globals used by the scoring step below.
# vcorr: identity index 0..len(mv)-1 (presumably passed to wap() as `mvcorr` -- TODO confirm).
# cm: number of mutated residues; read as a global inside wap().
vcorr = range(len(mv))  ## correspondence between mi and mv
cm = len(mi)   ## mutated residue count

(range(0, 2), 2)

In [12]:
# Pairwise weight matrix: Mmv[i][j] holds the product mv[i]*mv[j], optionally
# scaled by a PANCANFACTOR-dependent term when PANCANFACTOR differs from 1.0.
# (A geometric mean, sp.sqrt(mv[i]*mv[j]), was tried and did not perform
# better in most cases.)
n_mut = len(mi)
Mmv = []
for i in range(n_mut):
    mrow = sp.zeros(n_mut, sp.float64)
    for j in range(n_mut):
        if PANCANFACTOR != 1.0:
            # Scale factor is 1.0 when residues i and j share at least one
            # cancer type, PANCANFACTOR otherwise.
            shares_ttype = len(mt[i] & mt[j]) > 0
            mrow[j] = (PANCANFACTOR + (1.0-PANCANFACTOR)*shares_ttype) * mv[i]*mv[j]
        else:
            mrow[j] = mv[i]*mv[j]
    Mmv.append(mrow)

def wap(mi,mvcorr):
    """Score a set of mutated residues against every transformed distance matrix.

    For each matrix ``DDt[mat]`` the score is the sum, over all unordered pairs
    ``(i, j)`` with ``j < i``, of
    ``Mmv[mvcorr[i]][mvcorr[j]] * DDt[mat][mi[i]][mi[j]]``.
    (The name presumably stands for "weighted average proximity" -- TODO confirm.)

    Reads the notebook globals ``DDt``, ``xpol``, ``cm``, ``Mmv`` and ``sp``.

    Parameters
    ----------
    mi : sequence of int
        Residue indices into the distance matrices; the first ``cm`` entries are used.
    mvcorr : sequence of int
        For each position in ``mi``, the row/column of ``Mmv`` holding its weight.

    Returns
    -------
    array of float64 with ``len(DDt)`` entries, one score per matrix.
    """
    s = sp.zeros(len(DDt), sp.float64)
    # `range` replaces the Python 2-only `xrange`, which raises NameError on Python 3.
    for mat in range(xpol):
        d = DDt[mat]
        for i in range(cm):
            dcol = d[mi[i]]
            # j < i: each pair is counted once and self-pairs are excluded.
            for j in range(i):
                s[mat] += Mmv[mvcorr[i]][mvcorr[j]] * dcol[mi[j]]
    return s

In [16]:
# Display the mutated residue indices next to the identity index over mv.
mi, range(len(mv))

([53, 866], range(0, 2))