In [None]:
import pandas, os, sys, re, time, collections, Bio, random, glob, importlib, pickle

sys.path.append('/Users/dp/pma/')
import sameRiver
import sameRiver.biotypeLookupFileMaker

from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
from Bio import SeqIO
from pprint import pprint as pp
from functools import reduce
import numpy as np
from typing import List, Mapping, Union


import tcgaParser
import tcgaAnnotater
importlib.reload(tcgaParser)
importlib.reload(tcgaAnnotater)


In [None]:
# Study descriptions from https://www.cbioportal.org/webservice.do?cmd=getCancerTypes
# Build the study_info object from the study-list file (fname) and the
# study-description file (study_desc_fname). Both paths are relative to the
# notebook's working directory.

study_info = tcgaParser.currated_set_of_nonredundant_studies(
        fname='./all_TCGA_data/currated_set_of_nonredundant_studies_list.txt',
        study_desc_fname='cancerLists/tcga_study_ids_and_descriptions.do')

In [None]:
def do_all(
    nonredundant_study_ids: List, study_info: tcgaParser.currated_set_of_nonredundant_studies) -> tcgaParser.dataLoader:
    """Create a tcgaParser.dataLoader and feed it every study file found on disk.

    For each study id, two candidate files are looked up (the Jan2020 census
    output and the Dec2019 all-RBPs output); only paths that exist are kept.

    Args:
        nonredundant_study_ids: Study ids used to fill the file name templates.
        study_info: Passed unchanged to ``tcgaParser.dataLoader``.

    Returns:
        The loader after ``add_studies`` has been called with the set of
        existing file paths.
    """
    path_templates = (
        'all_TCGA_data/Jan2020_census_api_outputs/census_{}.txt',
        'all_TCGA_data/Dec2019_api_outputs/all_rbps_{}.txt',
    )

    # Despite the name, this holds file paths rather than study ids.
    set_of_studies = set()
    for template in path_templates:
        for study in nonredundant_study_ids:
            candidate = template.format(study)
            if os.path.exists(candidate):
                set_of_studies.add(candidate)

    loader = tcgaParser.dataLoader(study_info=study_info)
    loader.add_studies(set_of_studies)

    return loader


#all_dl = do_all(study_info.nonredundant_study_ids, study_info)

#all_an = tcgaAnnotater.tcgaAnnotater(all_dl)

class tcgaConvolve():
    """Find the most frequently mutated position ("hotspot") of each gene.

    ``for_each_mut`` tallies patients per 0-based residue position for every
    gene, smooths each tally with a box filter and records where the smoothed
    profile peaks and how high the peak is.

    Attributes:
        max_positions: gene -> 0-based index of the peak, or None when no
            single peak could be called.
        val_at_max: gene -> ``box_width`` times the smoothed value at the peak
            (0 when ``max_positions[gene]`` is None).
    """

    # A run of digits flanked by one word character on each side,
    # e.g. the "100" in "L100Q".
    _POSITION_RE = re.compile(r'\w(\d+)\w')

    def __init__(self):
        # Defined here so the attributes exist even before for_each_mut runs.
        self.max_positions = {}
        self.val_at_max = {}

    @staticmethod
    def smooth(y, box_pts):
        """Return ``y`` smoothed by a moving average over ``box_pts`` points.

        Adapted from
        https://stackoverflow.com/questions/20618804/how-to-smooth-a-curve-in-the-right-way

        Note: ``np.convolve(..., mode='same')`` returns an array of length
        ``max(len(y), box_pts)``, so the result is longer than ``y`` whenever
        ``box_pts`` exceeds ``len(y)``.
        """
        box = np.ones(box_pts)/box_pts
        return np.convolve(y, box, mode='same')

    def for_each_mut(self, by_mutation, box_width=1):
        """Compute the peak position and peak height for every gene.

        Args:
            by_mutation: Mapping of ``(gene, mutation)`` to a sized collection
                (only its ``len`` is used). Mutations containing ``*`` or
                ``?``, or without a recognisable residue number, are ignored.
            box_width: Width of the smoothing box handed to ``smooth``.

        Side effects:
            Replaces ``self.max_positions`` and ``self.val_at_max``.
        """
        # gene -> {0-based position -> patient count}
        counts_by_gene = {}
        for (gene, mut), patients in by_mutation.items():
            if '*' in mut or '?' in mut:
                continue

            match = self._POSITION_RE.search(mut)
            if match is None:
                continue

            pos = int(match.group(1)) - 1
            if pos < 0:
                # A residue number of 0 has no valid 0-based index. Report it
                # and skip it instead of building a negative-length array or
                # counting it at the end of the gene via index -1.
                print(gene, mut, pos)
                continue

            gene_counts = counts_by_gene.setdefault(gene, {})
            gene_counts[pos] = gene_counts.get(pos, 0) + len(patients)

        self.max_positions = {}
        self.val_at_max = {}

        for gene, gene_counts in counts_by_gene.items():
            # One allocation per gene, just long enough for the last position.
            profile = np.zeros(max(gene_counts) + 1)
            for pos, n_patients in gene_counts.items():
                profile[pos] = n_patients

            smoothed = self.smooth(profile, box_width)
            peak = self.highest_point(smoothed)

            self.max_positions[gene] = peak
            if peak is None:
                self.val_at_max[gene] = 0
            else:
                # Multiplying by box_width turns the box average back into
                # the sum over the window.
                self.val_at_max[gene] = box_width * smoothed[peak]

    def highest_point(self, arr):
        """Return the index of the single highest peak of ``arr``, or None.

        When the maximum occurs at several indexes they count as one peak only
        if each is within 3 indexes of the previous one (gaps of up to two
        positions); the middle of that run is returned. Returns None for an
        empty array, for maxima that are further apart, or when no element
        compares equal to the maximum (e.g. a NaN maximum).
        """
        if len(arr) == 0:
            return None

        arr = np.asarray(arr)
        peak_indexes = np.flatnonzero(arr == np.max(arr))
        if len(peak_indexes) == 0:
            return None

        # Reject if there are nonadjacent maxima.
        if np.any(np.diff(peak_indexes) > 3):
            return None

        # Take the peak as the middle point (a single maximum maps to itself).
        return (int(peak_indexes[0]) + int(peak_indexes[-1])) // 2

# Build the loader only when this kernel does not already hold one, so the
# cell also works after Restart & Run All (its construction earlier in this
# cell is commented out).
if 'all_dl' not in globals():
    all_dl = do_all(study_info.nonredundant_study_ids, study_info)

# Spot-check the loaded data: one gene, one specific mutation, the last study.
print(all_dl.by_gene['PCBP1'])
print(all_dl.by_mutation[('PCBP1', 'L100Q')])
print(all_dl.studies[-1])

In [None]:
# Persist the loader; the context manager guarantees the file is flushed and
# closed even if pickling fails part-way.
with open("outputs/tcgaLoaderObj.p", "wb") as loader_file:
    pickle.dump(all_dl, loader_file)

In [None]:


# Total patients (hard-coded).
n_total_patients = 35313

# Tally patients per position with a box of width 1.
width_1_convolver = tcgaConvolve()
width_1_convolver.for_each_mut(all_dl.by_mutation, box_width=1)

# Alternative: a wider smoothing box.
#width_n_convolver = tcgaConvolve()
#width_n_convolver.for_each_mut(all_dl.by_mutation, box_width=5)

convolver = width_1_convolver
#print(convolver.max_positions['PCBP1'])
#print(convolver.val_at_max['SMAD2'])

cutoff = 1.8

# Keep only the genes whose peak value reaches the cutoff.
above = {}
for gene, peak_value in convolver.val_at_max.items():
    if peak_value >= cutoff:
        above[gene] = peak_value

#for k in ['RBFOX1', 'DDX3X', 'MSI2', 'CELF1', 'SRSF1', 'SRSF2', 'EIF4H', 'TIAL1']:
#    print(k, convolver.val_at_max[k])

# Ascending by peak value, so the strongest hotspots are printed last.
sorted_above = sorted(above, key=above.get)
print('{} above {}'.format(len(above), cutoff))
for k in sorted_above[-50:]:
    print(k, above[k], convolver.max_positions[k])

In [None]:
# NOTE(review): gene symbols left in this cell without context; as bare
# expressions they raised NameError when the cell ran, so they are kept as
# comments. Presumably genes of interest from the ranking above -- confirm.
# RPL10L
# DIS3
# DNMT3B
# ESS2