In [None]:
import pandas, os, sys, re, time, collections, Bio, pprint, random

sys.path.append('/Users/dp/pma/')
import sameRiver
import sameRiver.biotypeLookupFileMaker

from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
from Bio import SeqIO
from pprint import pprint as pp
from functools import reduce


In [None]:
# UniProt identifier -> gene name lookup; both sides are upper-cased so that
# later lookups are case-insensitive.
df = pandas.read_csv('uniprot_id_to_gene_name.tab', sep='\t', index_col=False)
df.columns = ['Uniprot', 'Gene']
uniprot_to_gene_name = {
    uniprot_id.upper(): gene_name.upper()
    for uniprot_id, gene_name in zip(df.Uniprot, df.Gene)
}

# Gene names from the AmiGO RNA-binding export ('#' lines in the file are skipped).
amigo = (
    pandas.read_csv(
        'listsOfRbps/rna_binding_genes_from_amigo.txt', sep='\t', comment='#')
    ['Gene name']
    .tolist()
)


In [None]:
def get_nonrbps():
    """Return the gene symbols that should be excluded as non-RBPs.

    The exclusion list is currently disabled, so a new empty list is returned
    on every call.
    """
    # Disabled exclusion candidates, kept for reference:
    #   'CDKN2A', 'HIST1H1C', 'HLA-A', 'HLA-B'
    return list()

def remove_nonrbps(x):
    """Return the items of ``x`` that are not listed by get_nonrbps().

    Args:
        x: Iterable of gene symbols.

    Returns:
        A new list with the surviving items in their original order.
    """
    excluded = get_nonrbps()
    kept = []
    for gene in x:
        if gene in excluded:
            continue
        kept.append(gene)
    return kept

In [None]:
# Get protein lengths.

# Longest sequence length recorded per gene key; keys never seen read as 0.
protein_length = collections.defaultdict(int)
gene_lines = []

# Raw string so that \w reaches the regex engine instead of being treated as an
# (invalid) string escape. Compiled once because it is applied to every record.
gene_name_pattern = re.compile(r' GN=(\w+) ')

for seq_record in SeqIO.parse('./uniprot-proteome%3AUP000005640.fasta', "fasta"):
    # Fallback key: the part of the last '|' field that precedes the first '_'.
    gene = seq_record.id.split('|')[-1].split('_')[0].upper()

    # Prefer the GN= annotation from the header when one is present.
    m = gene_name_pattern.search(seq_record.description)
    if m is not None:
        gene = m.group(1)

    # The "(Fragment)" marker is part of the protein name, which is in the
    # description; the id is only the first whitespace-delimited token, so
    # testing the id never skipped anything.
    if 'Fragment' in seq_record.description:
        continue

    seq_len = len(seq_record.seq)
    if gene in uniprot_to_gene_name:
        mapped_gene = uniprot_to_gene_name[gene]
        protein_length[mapped_gene] = max(protein_length[mapped_gene], seq_len)
        # Record the same maximum under the unmapped key as well.
        protein_length[gene] = max(protein_length[mapped_gene], seq_len)
    else:
        protein_length[gene] = max(protein_length[gene], seq_len)

    gene_lines.append(gene + '\n')

# One gene key per line, in file order (joined once instead of growing a string).
li = ''.join(gene_lines)

with open('uniprot_gene_ids.txt', 'w') as f:
    f.write(li)


In [None]:

# Cancer-driver table ('Table S1' sheet), restricted to genes on the AmiGO list.
bailey = pandas.read_excel('cancerLists/bailey_mh_cell_cancerDrivers.xlsx', sheet_name='Table S1')
bailey_amigo = bailey.loc[bailey['Gene'].isin(amigo), :]

# NOTE(review): despite the name, this keeps the rows whose 'Cancer' is 'PANCAN'.
bailey_scc = bailey_amigo.loc[bailey_amigo['Cancer'] == 'PANCAN', :]

# One row per gene (first occurrence wins).
bailey_amigo_uniq = bailey_amigo.drop_duplicates('Gene')
print(bailey_amigo_uniq.shape)
print('\n'.join(bailey_scc['Gene'].tolist()))

bailey_amigo_uniq.to_excel('outputs/bailey_uniq_amigo.xlsx')
bailey_amigo.to_excel('outputs/bailey_amigo.xlsx')

In [None]:
# OncoKB cancer gene list, restricted to genes on the AmiGO list.
oncokb = pandas.read_csv('cancerLists/cancerGeneList.txt', sep='\t')
print(oncokb.head(1))
print(oncokb.shape)

# sort_values replaces the removed DataFrame.sort_index(by=...); sorting returns
# a new frame instead of mutating a slice of `oncokb` in place.
oncokb_amigo = (
    oncokb
    .loc[oncokb['Hugo Symbol'].isin(amigo), :]
    .sort_values(by='# of occurrence within resources (Column D-J)', ascending=False)
)
print(oncokb_amigo.shape)
print(oncokb_amigo.head(2))
oncokb_amigo.to_excel('outputs/oncokb_amigo.xlsx')

In [None]:
# Single cell SCC sequencing data (Andrew Ji)
sc_tss_vs_norm_basal = pandas.read_excel('cancerLists/pt7new_de_tss_0.01.xlsx')
print(sc_tss_vs_norm_basal.head(1))

# Keep AmiGO genes with p_val < 0.01 and |avg_logFC| >= 0.7.
df = sc_tss_vs_norm_basal.loc[sc_tss_vs_norm_basal['Gene'].isin(amigo), :]
df = df.loc[df['p_val'] < 0.01, :]
df = df.loc[df['avg_logFC'].abs() >= 0.7, :]

# sort_values replaces the removed DataFrame.sort_index(by=...); ascending order.
df = df.sort_values(by='avg_logFC')
sc_tss_vs_norm_basal_amigo = df
print(sc_tss_vs_norm_basal_amigo)

In [None]:
# Mutation hotspot table, restricted to genes on the AmiGO list.
hotspots = pandas.read_csv('cancerLists/hotspots.txt', sep='\t')
print(hotspots.head(1))
print(hotspots.shape)

hotspots_amigo = hotspots.loc[hotspots['Gene'].isin(amigo), :]
print(hotspots_amigo.shape)

# Collapse to one row per gene (first occurrence wins).
hotspots_amigo = hotspots_amigo.drop_duplicates('Gene')
hotspot_gene_names = hotspots_amigo['Gene'].tolist()
print(len(set(hotspot_gene_names)))
print(hotspots_amigo)
print('\n'.join(hotspot_gene_names))

In [None]:
# COSMIC census table with an added 'SCC?' flag column.
cosmic = pandas.read_csv('cancerLists/Census_allThu May 30 21_44_43 2019.tsv', sep='\t')

# 'Y' when the somatic tumour-type text contains 'SCC', otherwise 'N'
# (str() makes missing cells searchable too).
cosmic['SCC?'] = [
    'Y' if 'SCC' in str(tumour_types) else 'N'
    for tumour_types in cosmic['Tumour Types(Somatic)']
]
print(cosmic.shape)

cosmic_amigo = cosmic.loc[cosmic['Gene Symbol'].isin(amigo), :]
print(cosmic_amigo.shape)

scc_cos_amigo = cosmic_amigo.loc[cosmic_amigo['SCC?'] == 'Y', :]
print(scc_cos_amigo)

cosmic_amigo.to_excel('outputs/cosmic_amigo.xlsx')
print(cosmic_amigo.head(2))


In [None]:
# Mutated-gene counts, normalised by protein length.
skin_tcga = pandas.read_csv('Mutated_Genes.tsv', sep='\t')

# Genes without a recorded protein length get the placeholder 1E6.
skin_tcga['length'] = [protein_length.get(gene, 1E6) for gene in skin_tcga['Gene']]
skin_tcga = skin_tcga.loc[skin_tcga['length'] > 0, :].copy()

# Value of the '#' column per 1000 residues of protein length.
skin_tcga['Patients/length'] = [
    1000 * count / length
    for count, length in zip(skin_tcga['#'], skin_tcga['length'])
]
skin_tcga = skin_tcga.sort_values(by='Patients/length', ascending=False)
print(skin_tcga.head(2))
print(skin_tcga.shape)

skin_tcga_amigo = skin_tcga.loc[skin_tcga['Gene'].isin(amigo), :]
print(skin_tcga_amigo.shape)
print(skin_tcga_amigo.head(10))


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Per-gene patient counts from the SCC exome workbook, normalised by protein length.
exomes = pandas.read_excel('cancerLists/SCC exomes and targeted.100 integrated exomes.xlsx')
exomes['Gene'] = [str(x).upper() for x in exomes.Gene]
# Genes without a recorded protein length get the placeholder 1E6.
exomes['length'] = [protein_length.get(x, 1E6) for x in exomes['Gene']]
# .copy() so the column assignment below writes to an independent frame, not a slice.
exomes = exomes.loc[exomes['length']>0, :].copy()
exomes['Patients/length'] = [1000*x/y for x, y in zip(exomes['# of Patients Mutated'], exomes.length)]
exomes = exomes.sort_values(by='Patients/length', ascending=False)
exomes.index = range(0, exomes.shape[0])
print(exomes.head(1))
print(exomes.shape)

exome_rbps = exomes.loc[[bool(x in amigo) for x in exomes['Gene']], :].copy()
print(exome_rbps.shape)

# Gather the codes before the string cast: astype(str) turns a missing cell into
# the literal 'nan', which would otherwise be counted as one more patient.
patient_code_cells = exome_rbps['Patient codes'].dropna().astype(str).tolist()
exome_rbps['Patient codes'] = exome_rbps['Patient codes'].astype(str)

# Each cell holds '_'-separated codes. str.join also handles an empty table,
# where reduce() without an initial value raises TypeError.
patients = '_'.join(patient_code_cells)
patients = set(patients.split('_')) - set([''])
n_patients = len(patients)
print(n_patients)
print(exome_rbps.loc[:, ['Gene', #'# of Genes Mutated', 
               '# of Patients Mutated', 'length', 'Patients/length']])
# First with any real chance of being an RBP: HELZ(18 patients)
# ADAD1: adenosine deaminase. Normally only expressed in testes. Occasionally expressed in some other
# cancers, but typically not in a mutated form.

# Distribution of Patients/length for all genes, then for RBPs only.
arr_len = np.array(exomes['Patients/length'])
# NOTE(review): this threshold is applied to Patients/length, not to the 1E6
# placeholder in 'length' - confirm which one was intended.
arr_len = arr_len[arr_len<1E6]
plt.hist(arr_len, bins=50)
plt.title('All genes')
plt.show()
plt.clf()
plt.hist(exome_rbps['Patients/length'].tolist(), bins=50)
plt.title('RBPs')
plt.show()
plt.clf()
print(np.median(arr_len), np.median(exome_rbps['Patients/length']))

In [None]:
import glob
import tcgaParser
import tcgaAnnotater

def do_all():
    """Load the studies found under the Jan2020 census API output path.

    Returns:
        The tcgaParser.dataLoader instance, after do_a_set_of_studies() has
        been called on it with the set of matched paths.
    """
    # NOTE(review): the pattern has no wildcard, so it can match at most the
    # directory itself - confirm that dataLoader expects the directory path.
    set_of_studies = set(glob.glob('./all_TCGA_data/Jan2020_census_api_outputs/'))

    dl = tcgaParser.dataLoader()
    dl.do_a_set_of_studies(set_of_studies)
    return dl

#all_dl = do_all()
#all_an = tcgaAnnotater.tcgaAnnotater(all_dl)
import numpy as np
class tcgaConvolve():
    """Find the most frequently mutated position of each gene.

    After for_each_mut() has run, two dicts keyed by gene are available:

    max_positions: 0-based index of the peak of the smoothed per-position
        counts, or None when highest_point() finds no single peak.
    val_at_max: box_width times the smoothed value at that peak, or 0 when
        there is no peak.
    """

    # One word character, the residue number, one word character (e.g. 'A123T').
    # Raw string so that \w and \d are not treated as string escapes.
    _POSITION_PATTERN = re.compile(r'\w(\d+)\w')

    @staticmethod
    def smooth(y, box_pts):
        """Return ``y`` convolved with a flat box of ``box_pts`` points.

        The box is normalised to sum to 1 and mode='same' is used, so the
        result has the same length as ``y`` when len(y) >= box_pts.
        """
        #https://stackoverflow.com/questions/20618804/how-to-smooth-a-curve-in-the-right-way
        box = np.ones(box_pts)/box_pts
        return np.convolve(y, box, mode='same')

    def for_each_mut(self, by_mutation, box_width=1):
        """Build per-gene count arrays and record the peak of each.

        Args:
            by_mutation: Mapping of (gene, mutation label) to a sized collection
                of patients; only its len() is used.
            box_width: Width of the smoothing box passed to smooth().

        Sets self.max_positions and self.val_at_max; returns None.
        """
        gene_array = {}
        for (gene, mut), num_patients in by_mutation.items():
            # Labels containing '*' or '?' are excluded
            # (presumably truncating / ambiguous calls - TODO confirm).
            if '*' in mut or '?' in mut:
                continue

            match = self._POSITION_PATTERN.search(mut)
            if match is None:
                continue

            pos = int(match.group(1)) - 1  # 0-based array index
            if pos < 0:
                # A residue number of 0 has no valid slot: report it and skip it.
                # Otherwise np.zeros(-1) raises for a new gene, and for a known
                # gene index -1 would silently add to the last position.
                print(gene, mut, pos)
                continue

            counts = gene_array.get(gene)
            if counts is None:
                counts = np.zeros(pos + 1)
            elif pos >= len(counts):
                # Grow the array so that `pos` becomes its last valid index.
                counts = np.append(counts, np.zeros(1 + pos - len(counts)))
            counts[pos] += len(num_patients)
            gene_array[gene] = counts

        smoothed = {}
        self.max_positions = {}
        self.val_at_max = {}

        for gene, counts in gene_array.items():
            smoothed[gene] = self.smooth(counts, box_width)
            peak = self.highest_point(smoothed[gene])
            self.max_positions[gene] = peak

            if peak is None:
                self.val_at_max[gene] = 0
            else:
                # Multiplying by box_width undoes the 1/box_width normalisation,
                # giving the summed count inside the box.
                self.val_at_max[gene] = box_width * smoothed[gene][peak]

    def highest_point(self, arr):
        """Return the index of the maximum of ``arr``, or None.

        None is returned for an empty array and when the maximum occurs at
        positions that are more than 3 apart. When several maxima lie close
        together, the midpoint between the first and the last is returned.
        """
        if len(arr) == 0:
            return None

        indexes_of_maxes = np.flatnonzero(np.asarray(arr) == np.max(arr))

        if len(indexes_of_maxes) == 1:
            return int(indexes_of_maxes[0])

        for previous, current in zip(indexes_of_maxes, indexes_of_maxes[1:]):
            # Allow gaps of up to 2 positions between neighbouring maxima;
            # reject if there are nonadjacent maxima.
            if current > previous + 3:
                return None

        # Take the peak as the middle point.
        return int((indexes_of_maxes[0] + indexes_of_maxes[-1]) // 2)

# Total number of patients (the denominator quoted in the output column names).
n_total_patients = 35313

# The `all_dl = do_all()` call is commented out, so on a fresh kernel the name
# does not exist. Load it on demand; an already-loaded all_dl is reused.
if 'all_dl' not in globals():
    all_dl = do_all()

# Raw per-position counts (box width 1) and a 5-position box convolution.
width_1_convolver = tcgaConvolve()
width_1_convolver.for_each_mut(all_dl.by_mutation, box_width=1)

width_n_convolver = tcgaConvolve()
width_n_convolver.for_each_mut(all_dl.by_mutation, box_width=5)

convolver = width_n_convolver
# Spot checks; .get() prints None for a gene that is absent instead of raising.
print(convolver.max_positions.get('EIF1AX'))
print(convolver.val_at_max.get('YTHDC2'))

# Minimum peak value for a gene to be reported.
cutoff = 1.8

above = {
    gene: value
    for gene, value in convolver.val_at_max.items()
    if value >= cutoff
}

for k in ['RBFOX1', 'DDX3X', 'MSI2', 'CELF1', 'SRSF1', 'SRSF2', 'EIF4H', 'TIAL1']:
    print(k, convolver.val_at_max.get(k))

# Ascending by peak value, so the last 100 entries are the strongest peaks.
sorted_above = sorted(above.keys(), key=lambda x: above[x])
print('{} above {}'.format(len(above), cutoff))
for k in sorted_above[-100:]:
    print(k, above[k], convolver.max_positions[k])


In [None]:
# Per-gene exome summary, indexed by gene so that rows can be looked up by name.
all_exomes = pandas.read_excel('cancerLists/updated_100WES_for_cari.xlsx')
all_exomes.index = all_exomes['Gene'].tolist()

# The mask must come from all_exomes itself; it was previously built from `df`,
# an unrelated frame left over from another cell, whose length and row order
# do not correspond to all_exomes.
exomes = all_exomes.loc[all_exomes['Gene'].isin(amigo), :]

print(exomes.head(2))
# Sort into a new frame rather than in place on a slice of all_exomes.
exomes = exomes.sort_values(by='mutations_normalized', ascending=False)
ex_cols = ['Gene', 'Non/Syn', 'NumberOfSCCs_filtered', 'mutations_normalized'] 
print(exomes.head(1))
print(exomes[ex_cols].head(20))

# Genes whose 'Non/Syn' value is at least 3.
non_over_syn = exomes.loc[exomes['Non/Syn'] >= 3., :]
print(non_over_syn[ex_cols].head(20))


In [None]:
# Table of high-frequency mutations, indexed by (Gene, Mutation) pairs.
df = pandas.read_excel('RBP_high_freq_mutations.xlsx')
gene_mutation_pairs = list(zip(df['Gene'].tolist(), df['Mutation']))
df.index = gene_mutation_pairs

# Genes that have a row in all_exomes.
scc_genes = set(all_exomes.index)

def add_if_there(x, col):
    """Return all_exomes.loc[x, col] when ``x`` is in scc_genes, otherwise ''.

    Args:
        x: Row label to look up.
        col: Column of all_exomes to read.
    """
    return all_exomes.loc[x, col] if x in scc_genes else ''
import numpy as np

def yes_if_in(name, a_set):
    """Return 'Yes' when ``name`` is a member of ``a_set``, otherwise 'No'."""
    return 'Yes' if name in a_set else 'No'
def expressed(x):
    """Return the mean of the day_0/day_3/day_6 values for row ``x``.

    NaN values are replaced by 0 before averaging. The string 'Not found' is
    returned when ``x`` is not in scc_genes.
    """
    if x in scc_genes:
        day_values = all_exomes.loc[x, ['day_0', 'day_3', 'day_6']]
        return np.average(np.nan_to_num(day_values))
    return 'Not found'

# Columns copied from all_exomes ('' where the gene has no row there).
for new_col, source_col in (
    ('cSCC Nonsyn./Synom.', 'Non/Syn'),
    ('cSCC # Patients/100 with ANY non_synon. mutation in gene', 'NumberOfSCCs_filtered'),
):
    df[new_col] = [add_if_there(gene, source_col) for gene in df.Gene]

df['cSCC expression'] = [expressed(gene) for gene in df.Gene]

# Peak value and 0-based peak position from the width-1 convolver
# ('' for genes it has no entry for).
peak_values = width_1_convolver.val_at_max
peak_positions = width_1_convolver.max_positions
df['# TCGA patients (out of 35313) with ANY non-truncating mutation at a.a. X'] = [
    peak_values.get(gene, '') for gene in df.Gene
]
df['a.a. X'] = [peak_positions.get(gene, '') for gene in df.Gene]
def add_one(val):
    """Return ``val + 1``, or '' when ``val`` does not support adding an int.

    Used to turn 0-based positions into 1-based ones while passing through
    placeholders such as '' or None as an empty string.
    """
    try:
        return val + 1
    except TypeError:
        # Only the "cannot add 1" case is a placeholder; a bare except would
        # also swallow KeyboardInterrupt and unrelated errors.
        return ''
# Convert the 0-based peak index to a 1-based position ('' stays '').
df['a.a. X'] = [add_one(x) for x in df['a.a. X']]
df['Convolved # patients (out of 35313) with ANY non-truncating mutation near a.a. Y'] = [
    width_n_convolver.val_at_max.get(x, '') for x in df.Gene
]
df['a.a. Y'] = [
    add_one(width_n_convolver.max_positions.get(x, '')) for x in df.Gene
]

# Yes/No membership flags. Each reference collection is converted to a set once,
# instead of rebuilding a list for every row and scanning it linearly.
for flag_col, reference_genes in (
    ('COSMIC cancer gene?', cosmic['Gene Symbol']),
    ('MH Bailey cancer driver?', bailey['Gene']),
    ('OncoKB cancer gene?', oncokb['Hugo Symbol']),
    ('Cancer mutation hotspot?', hotspots['Gene']),
    ('Amigo RBP?', amigo),
):
    reference_set = set(reference_genes)
    df[flag_col] = [yes_if_in(x, reference_set) for x in df.Gene]

# Working columns that should not appear in the exported sheet.
cols_to_remove = [
    '# cases in all TCGA', 
    '# cases with ANY nonsyn mutation in this gene in 100 exomes cSCC Lee',
    #'Amigo RBP?', 
    'URL', 'notes', 'notes2', 'notes3', 'MGC with CDS',
    #'Ordered or have already',
    # str() so that a non-string column label cannot break the substring test.
] + [x for x in df.columns if 'Unnamed' in str(x)]
for col in set(cols_to_remove):
    if col in df.columns:
        del df[col]

# Human-readable description of each exported column (written to its own sheet).
col_explanation = {
    '# cases/10,161': 'From original TCGA analysis by Amin',
    'Mutation rank/1000': 'From original TCGA analysis by Amin',
    'Old dataset?': 'Rationale for inclusion includes the TCGA analysis by Amin',
    'Rationale': 'Reason for inclusion in this list',
    'Amigo RBP?': 'Amigo GO term for RNA-binding',
    'Polyphen summary': 'Predicted mutation impact',
    'Polyphen score': 'Higher number means a negative predicted impact on protein',
    'Ordered or have already': 'Whether this protein will be included in our CLIP study',
    # Key corrected to match the actual column name, which ends with a period.
    'cSCC Nonsyn./Synom.': 'From Cari Lee, the ratio of nonsynon/synonymous mutations in cSCC',
    'cSCC # Patients/100 with ANY non_synon. mutation in gene': 'From Cari Lee data',
    'cSCC expression': "From Lee lab data",
    '# TCGA patients (out of 35313) with ANY non-truncating mutation at a.a. X': "I downloaded data for every RBP (by Amigo term) for every TCGA dataset for this count.",
    "a.a. X": "Amino acid position X",
    'Convolved # patients (out of 35313) with ANY non-truncating mutation near a.a. Y': 'Box convolution with width 5 nt for missense mutations. This leads to a peak at a.a. Y, and the total number of mutated patients in the 5 nt around Y in this column.',
    'a.a. Y': "Amino acid position Y",
    'COSMIC cancer gene?': 'In COSMIC for any cancer type.',
    'MH Bailey cancer driver?': "In M.H. Bailey, 'Comprehensive Characterization of Cancer Driver Genes and Mutations.' (2018) as a cancer driver",
    'OncoKB cancer gene?': 'In OncoKB for any cancer',
    'Cancer mutation hotspot?': 'In http://www.cancerhotspots.org (Memorial Sloan Kettering)',
}
col_explanation_df = pandas.DataFrame.from_dict(col_explanation, orient='index')
col_explanation_df.columns = ['Description']
col_explanation_df.index.name = 'Column'
print(col_explanation_df.head(1))

import xlsxwriter  # imported so a missing engine fails here, before the writer is opened

# The context manager saves and closes the workbook, even if a sheet write
# fails; ExcelWriter.save() no longer exists in pandas 2.0+.
with pandas.ExcelWriter('outputs/Recurrently_mutated_RBPs.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Mutations', index=False)
    col_explanation_df.to_excel(writer, sheet_name='Column explanations', index=True)

print(df.head(2))