In [None]:
"""
Analysis of all TCGA data for all RBPs.
The data were downloaded iteratively from TCGA's API, using the AmiGO RBP term
to define the set of RBPs.
The analysis ends with
mutations_in_rbps_all_tcga_downloaded

"""
import pandas, os, sys, re, time, collections, Bio, pprint, random, glob, importlib, json, pickle, matplotlib

from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq
from Bio import SeqIO
import seaborn as sns
import matplotlib.pyplot as plt

import proteinLengths
import tcgaParser
import tcgaAnnotater
importlib.reload(tcgaAnnotater)
importlib.reload(proteinLengths)
importlib.reload(tcgaParser)

from typing import List, Mapping, Union

# Project root on the author's machine; the figure output paths further down are built from it.
# NOTE(review): absolute, user-specific path — the notebook is not portable as written.
pma_dir = '/Users/dp/pma/'

# Font type 42 makes matplotlib embed TrueType fonts in PDF/PS output.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rc('font', **{'family':'sans-serif','sans-serif':['Arial']})

# Names used to subset the data: compared against the COMMON column in
# just_lolipop and against the mutation labels in lolipop_only.
lolipop_proteins = [
    'SF3B1', 'SMAD4', 'U2AF1', 'SRSF2', 'PCBP1', 'CNOT9',
    'RARS2', 'A1CF', 'EEF1B2', 'EIF1AX', 'KHDRBS2', 'FUBP1',
    'RBM11', 'HNRNPCL1', 'DDX3X', 'DICER1', 'PABPC4L', 'RPL5',
    'CRNKL1', 'RBM39', 'DCP1B', 'RBFOX1', 'TDRKH', 'NOVA1',
    'DDX50', 'YTHDC2', 'KPNB1', 'NUFIP1', 'CNOT1']


In [None]:
def just_lolipop(directory, proteins=None):
    """Write copies of the study tables in `directory` restricted to selected proteins.

    Every file matching ``{directory}/*txt`` is read as a tab-separated table
    (lines starting with '#' are skipped; all columns are read as strings).
    Rows whose COMMON value is one of `proteins` are written, tab-separated
    and without the index, to ``{directory}/subset_to_lolipop/<file name>``.
    Files without a COMMON column are reported on stdout and skipped.

    Args:
        directory: Folder containing the per-study ``*txt`` tables.
        proteins: Iterable of names to keep. Defaults to the module-level
            ``lolipop_proteins`` list.

    Returns:
        None; the subset tables are written to disk.
    """
    if proteins is None:
        proteins = lolipop_proteins
    wanted = set(proteins)

    out_dir = os.path.join(directory, "subset_to_lolipop")
    os.makedirs(out_dir, exist_ok=True)

    for study_fname in glob.glob(f"{directory}/*txt"):
        df = pandas.read_csv(study_fname, sep='\t', comment='#', dtype='str')
        if 'COMMON' not in df.columns:
            print(f"Could not find column COMMON in {study_fname}: {df.head()}")
            continue
        # Vectorised membership test instead of a per-row scan of the list.
        subset = df.loc[df['COMMON'].isin(wanted), :]
        subset.to_csv(
            os.path.join(out_dir, os.path.basename(study_fname)),
            sep='\t', index=False)

# Run the subsetting on one download directory; the alternative directories
# are left commented out.
just_lolipop("./all_TCGA_data/Jan2020_census_api_outputs/")
#just_lolipop("./all_TCGA_data/Dec2019_api_outputs/")
#just_lolipop("./all_TCGA_data/Dec2019_api_outputs/our_rbps/")

In [None]:
# Reload the local helper modules (same reloads as in the first cell) so that
# edits to them are picked up without restarting the kernel.
importlib.reload(tcgaAnnotater)
importlib.reload(proteinLengths)
importlib.reload(tcgaParser)

#amigo = pandas.read_csv('listsOfRbps/rna_binding_genes_from_amigo.txt', sep='\t', comment='#')['Gene name'].tolist()

# The helper object is replaced by whatever its load() returns; the same name
# is reused for both.
protein_lengths = proteinLengths.proteinLengths()
protein_lengths = protein_lengths.load()

#print(protein_lengths['SMAD4'])

# Study descriptions from https://www.cbioportal.org/webservice.do?cmd=getCancerTypes

study_info = tcgaParser.currated_set_of_nonredundant_studies(
        fname='./all_TCGA_data/currated_set_of_nonredundant_studies_list.txt',
        study_desc_fname='cancerLists/tcga_study_ids_and_descriptions.do')

# Spot check: category entry for one study name, then the study_info object itself.
a = study_info.study_categories['MSK-IMPACT Clinical Sequencing Cohort (MSKCC, Nat Med 2017)']
print(a)
print(study_info)

In [None]:
# NOTE(review): `all_dl` is first assigned in later cells (the pickle load and
# the do_all call), so this cell raises NameError on a fresh top-to-bottom run;
# it only works with leftover kernel state.
census_dl = all_dl

In [None]:
patient_data_fname = "all_TCGA_data/msk_impact_2017_clinical_data.tsv"

# Load the cached loader object. The context manager closes the file handle
# deterministically (the bare open() call left it to the garbage collector).
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles written by this project.
with open('outputs/tcgaLoaderObj.p', 'rb') as pickle_fh:
    all_dl = pickle.load(pickle_fh)

In [None]:
def do_all(
    nonredundant_study_ids: List, study_info: tcgaParser.currated_set_of_nonredundant_studies) -> tcgaParser.dataLoader:
    """Load every existing per-study subset file into a new tcgaParser.dataLoader.

    For each study id, the two subset_to_lolipop locations listed below are
    checked and every path that exists on disk is collected. The collected
    paths are handed to the loader's do_a_set_of_studies(), the loader's
    n_total_patients is printed, and the loader is returned.

    Args:
        nonredundant_study_ids: Study ids substituted into the path templates.
        study_info: Passed through to tcgaParser.dataLoader(study_info=...).

    Returns:
        The populated tcgaParser.dataLoader instance.
    """
    # Path strings are kept exactly as before (including the doubled slash),
    # because the strings themselves are what the loader receives.
    path_templates = (
        'all_TCGA_data/Jan2020_census_api_outputs/subset_to_lolipop//{}.txt',
        'all_TCGA_data/Dec2019_api_outputs/our_rbps/subset_to_lolipop/{}.txt',
    )

    set_of_studies = set()
    for template in path_templates:
        for study in nonredundant_study_ids:
            candidate = template.format(study)
            if os.path.exists(candidate):
                set_of_studies.add(candidate)

    loader = tcgaParser.dataLoader(study_info=study_info)
    loader.do_a_set_of_studies(set_of_studies)
    print("Total patients: {}".format(loader.n_total_patients))

    return loader
#print(study_info.nonredundant_study_ids)
# Build the loader from the subset files. This rebinds `all_dl`, replacing any
# object a previous cell loaded from the pickle.
all_dl = do_all(study_info.nonredundant_study_ids, study_info)

# presumably populates the per-study / per-category count tables read later
# (e.g. by_study_cat1_n) — verify in tcgaParser.
all_dl.map_mutations_to_the_number_of_patients_by_study()
all_an = tcgaAnnotater.tcgaAnnotater(all_dl)
# Spot check of the study id -> study name mapping for one hard-coded id.
print(study_info.study_id_to_study_name['cllsll_icgc_2011'])
#print(study_info.get_study_name_from_study_id('all_TCGA_data/Dec2019_api_outputs/all_rbps_cllsll_icgc_2011.txt'))
#print(all_dl.mutation_to_studies)
#all_dl.by_study_cat1_n

In [None]:

# Scratch cell: manual spot checks of the loader's contents.
#all_dl.get_study_cat_from_study_name('')
#all_dl.map_mutations_to_the_number_of_patients_by_study()
a = study_info.get_study_name_from_study_id('all_TCGA_data/Jan2020_census_api_outputs/census_skcm_broad.txt')
print(list(all_dl.by_study_cat1_n.values())[:10])
print(a)
print('---')
print(all_dl.__dict__.keys())

# NOTE(review): the f-string below has no placeholders, and the two lookups use
# hard-coded (protein, mutation) keys — they may raise KeyError if the key is
# absent, depending on the mapping type (not visible here).
print(f"A1CF=")
print(all_dl.mutation_to_studies[('A1CF', 'E34K')])
print(all_dl.mutation_to_studies[('SMAD4', 'R380S')])
#study_info.study_categories 

In [None]:
# One-letter residue codes appended when a mutation label names no substituted
# residue (the string also contains the codes B and Z).
aas = 'ARNDBCEQZGHILKMFPSTWYV'

def interpret_mut(protein, mut):
    """Expand a mutation label into a list of (protein, mutation) lookup keys.

    If `mut` consists of letters followed by digits and nothing else
    (e.g. 'K700'), one key is returned per character of `aas`, with that
    character appended ('K700A', 'K700R', ...). Any other label (e.g. 'K700E')
    is returned unchanged as a single-element list.

    Args:
        protein: Protein name, used as the first element of every key.
        mut: Mutation label.

    Returns:
        List of (protein, mutation) tuples.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about.
    if re.match(r'([A-Za-z]+)(\d+)$', mut) is not None:
        return [(protein, mut + aa) for aa in aas]

    return [(protein, mut)]

# Sheet 'Proteins' of the sequencing workbook; its Mutations column is what
# get_mut_and_study_cats_dict iterates over.
# NOTE(review): this absolute path repeats the prefix stored in `pma_dir`
# (first cell) — consider building the path from pma_dir.
studied_rbps = pandas.read_excel('/Users/dp/pma/RBP missense mutations/Sequencing_schemes_and_results.xlsx', sheet_name='Proteins')
#print(study_id_to_study_name)

def get_mut_and_study_cats_dict() -> List[Mapping[str, Union[str, int]]]:
    """Collect patient counts per category for every mutation in `studied_rbps`.

    The non-missing entries of ``studied_rbps.Mutations`` are visited in
    reverse order. Each entry is split on a single space into protein and
    mutation and expanded by interpret_mut() into one or more lookup keys.
    The counts found in ``all_dl.by_study_cat1_n`` for all of those keys are
    summed per category into one record, which also carries the original
    label under the key 'Mutation'.

    Returns:
        One dict per mutation: 'Mutation' -> label, plus category -> count.
    """
    mut_and_study_cats = []
    for protein_mut in studied_rbps.Mutations.tolist()[::-1]:

        if pandas.isna(protein_mut):
            continue

        keys = interpret_mut(*protein_mut.split(' '))

        # Work on a copy. Appending the dict stored in all_dl and then
        # writing into it (as was done before) added a 'Mutation' key to the
        # loader's own data and made the counts grow on every re-run.
        record = dict(all_dl.by_study_cat1_n[keys[0]])
        record['Mutation'] = protein_mut

        for key in keys[1:]:
            for cancer, n_patients in all_dl.by_study_cat1_n[key].items():
                # Skip the label key in case the loader's dicts were already
                # contaminated by an earlier run of the old code.
                if cancer == 'Mutation':
                    continue
                record[cancer] = record.get(cancer, 0) + n_patients

        mut_and_study_cats.append(record)
    return mut_and_study_cats


def mut_and_study_cats_dict_to_dataframes(
        _mut_and_study_cats: List[Mapping[str, Union[str, int]]]) -> List[pandas.DataFrame]:
    """Turn per-mutation count records into a count table and a fraction table.

    Args:
        _mut_and_study_cats: Records as produced by
            get_mut_and_study_cats_dict(): each has a 'Mutation' label plus
            category -> count entries.

    Returns:
        ``[counts, fractions]``. Both frames are indexed by the 'Mutation'
        label with one column per category. Missing counts are 0 and rows
        whose counts sum to 0 are dropped. In `fractions` each row of
        `counts` is divided by its row total (with a floor of 1 on the
        divisor, as before).
    """
    # Re-index by mutation label and treat missing counts as zero.
    counts = pandas.DataFrame(_mut_and_study_cats).set_index('Mutation').fillna(0)

    # Remove mutations with a zero total. A positional boolean mask is used so
    # that duplicate mutation labels cannot cause alignment problems.
    observed = (counts.sum(axis=1) > 0).to_numpy()
    counts = counts.loc[observed, :]

    # Row-wise division in one vectorised step. The previous per-row loop used
    # positional `sums[n]` on a label-indexed Series (deprecated), wrote floats
    # into integer columns, and failed when a label occurred more than once.
    row_totals = counts.sum(axis=1)
    fractions = counts.div(row_totals.clip(lower=1), axis=0)

    return [counts, fractions]

# List of per-mutation count records. Note that this name is rebound to a
# DataFrame a few lines further down in the same cell.
mut_and_study_cats = get_mut_and_study_cats_dict()

#with open('./outputs/cancer_type_graph_data_GO_term.json', 'w') as fp:
#    json.dump(mut_and_study_cats, fp, indent=4)

#with open('./outputs/cancer_type_graph_data_census_only.json', 'r') as fp:
#    census = json.load(fp)
#    print('census json:')
#    print(census)
def lolipop_only(a_df):
    """Return the rows of `a_df` whose index label names a protein in `lolipop_proteins`.

    Index labels are expected to look like '<protein> <mutation>', which is
    how get_mut_and_study_cats_dict splits them. The protein token is compared
    for equality. The previous substring test (`x in mut`) also kept rows whose
    label merely contained a listed name, e.g. 'CNOT1' inside 'CNOT10 ...'.

    Args:
        a_df: DataFrame indexed by mutation label.

    Returns:
        The selected rows of `a_df` with all columns.
    """
    keep = [str(mut).split(' ')[0] in lolipop_proteins for mut in a_df.index]
    return a_df.loc[keep, :]

# Convert the records into a count table and a per-mutation fraction table,
# then restrict both to the lolipop proteins. `mut_and_study_cats` changes
# type here: list of dicts -> DataFrame.
[mut_and_study_cats, mut_and_study_cats_fractions] = mut_and_study_cats_dict_to_dataframes(mut_and_study_cats)
mut_and_study_cats = lolipop_only(mut_and_study_cats)
mut_and_study_cats_fractions = lolipop_only(mut_and_study_cats_fractions)

print(mut_and_study_cats.head())
#mut_and_study_cats_fractions.loc[(mut_and_study_cats_fractions > 0.2).any(axis=1), :]
# Keep only the columns in which at least one mutation has a fraction above 0.05.
mut_and_study_cats_fractions = mut_and_study_cats_fractions.loc[:, (mut_and_study_cats_fractions > 0.05).any(axis=0)]
print(mut_and_study_cats_fractions.head())

# Heatmap of fractions; the dendrogram axes are hidden but rows/columns are
# still ordered by the clustering.
cm = sns.clustermap(mut_and_study_cats_fractions, xticklabels=True, yticklabels=True)
cm.ax_row_dendrogram.set_visible(False)
cm.ax_col_dendrogram.set_visible(False)

# presumably saves the clustermap figure as the current pyplot figure — confirm
# (cm.savefig would make this explicit).
plt.savefig(f'{pma_dir}/dataAndScripts/clip/figs/heatmap_missense_freq_by_fraction.pdf')
plt.show()
plt.clf(); plt.close()

# Heatmap of absolute counts; vmax=3 caps the colour scale at 3.
cm = sns.clustermap(mut_and_study_cats, vmax=3, xticklabels=True, yticklabels=True)
cm.ax_row_dendrogram.set_visible(False)
cm.ax_col_dendrogram.set_visible(False)

plt.savefig(f'{pma_dir}/dataAndScripts/clip/figs/heatmap_missense_freq_by_absolute_count.pdf')
plt.show()
plt.clf(); plt.close()
print("Final dataframe plotted:", mut_and_study_cats)
#print(all_dl.by_study_cat1_n[('SF3B1', 'R625C')])
#print(all_dl.mutation_to_studies_n[('SF3B1', 'K700E')])
#print(all_dl.by_mutation[('SF3B1', 'K700E')])
#print(all_dl.n_by_mutation[('SF3B1', 'K700E')])
#print(all_an.top_snp_df(all_dl.n_by_mutation).head(60))
# Export the annotater's top-SNP table to Excel.
all_an.top_snp_df(all_dl.n_by_mutation).to_excel('./outputs/top_snps.xlsx')

In [None]:
# NOTE(review): `top_snp_df` and `n_by_mutation_all` are not defined anywhere
# in the visible notebook, so this cell fails on a fresh kernel. It may predate
# the all_an.top_snp_df(all_dl.n_by_mutation) call used in the heatmap cell —
# confirm and update or delete.
#print(df_by_mutation_scc.head(10))
#snp_scc = top_snp_df(n_by_mutation_scc)
snp_all = top_snp_df(n_by_mutation_all)
print(snp_all.head(40))
#snp_scc.head(30)

In [None]:

# NOTE(review): near-duplicate of the previous cell and dependent on the same
# names (`top_snp_df`, `n_by_mutation_all`) that are not defined in the visible
# notebook — confirm whether this cell is still needed.
snp_all = top_snp_df(n_by_mutation_all)
print(snp_all.head(20))
# Names of the first 20 rows, one per line.
print('\n'.join(snp_all['Gene'].tolist()[:20]))
#print('\n'.join(df_all['Gene'].tolist()[:20]))

In [None]:
# NOTE(review): `df_by_mutation_all` and `df_by_gene_all` are not defined in
# the visible notebook; this cell depends on state created elsewhere.
print(df_by_mutation_all.head(2))
df_by_gene_all.head(2)

In [None]:
import numpy as np
import scipy as sp
import scipy.stats as stats
import matplotlib.pyplot as plt
import lifelines
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

# NOTE(review): `kmf` is not used by any code visible in this notebook.
kmf = KaplanMeierFitter()

def cf_survival(df, col='Overall Survival (Months)'):
    """Log-rank comparison of `col` between rows flagged in df['Mut'] and the rest.

    Rows with a missing value in `col` are dropped before the two groups are
    formed.

    Args:
        df: DataFrame with a boolean 'Mut' column and the duration column `col`.
        col: Name of the duration column.

    Returns:
        The result object returned by lifelines' logrank_test.
    """
    # notna() also copes with None / object-dtype columns, where calling
    # np.isnan on each value raises TypeError.
    with_duration = df.loc[df[col].notna(), :]
    is_mutant = with_duration['Mut']

    # NOTE(review): no event-observed indicators are passed, so every duration
    # is treated as an observed event (no censoring) — confirm this is intended.
    res = logrank_test(
        with_duration.loc[is_mutant, col],
        with_duration.loc[~is_mutant, col])
    #res.print_summary()
    return res

# Clinical table: re-index by patient id and keep only the survival column.
clinical = pandas.read_csv('./all_TCGA_data/scc_3816_samples_clinical_data.txt', sep='\t')
print(clinical.tail())
clinical.index = [sample_id_to_patient_id(str(x)) for x in clinical['Patient ID']]
del clinical['Patient ID']

clinical = clinical.loc[:, ['Overall Survival (Months)']]

# NOTE(review): `sample_id_to_patient_id`, `df_by_gene_all` and `by_gene_scc`
# are not defined in the visible notebook; this cell depends on state created
# elsewhere.
# Has patient IDS: by_mutation_scc
pvals = []
# Slice instead of range(100) + .iloc[n], which raised IndexError whenever the
# frame had fewer than 100 rows.
for gene in df_by_gene_all['Gene'].iloc[:100]:
    if gene not in by_gene_scc:
        continue

    patients = by_gene_scc[gene]
    # Skip small groups before doing the per-row membership work.
    if len(patients) < 3:
        continue

    clinical['Mut'] = [bool(x in patients) for x in clinical.index]
    res = cf_survival(clinical)

    pvals.append({
        'Gene': gene,
        'pvalue': res.p_value,
        't': res.test_statistic,
        'Patients': ','.join(patients)
    })

# Explicit columns keep sort_values from raising KeyError when no gene qualified.
# NOTE(review): p-values are not corrected for multiple testing.
df = pandas.DataFrame(pvals, columns=['Gene', 'pvalue', 't', 'Patients']).sort_values(by='pvalue')
print(df)
print('\n'.join(df.loc[df['pvalue']<0.05, 'Gene']))

In [None]:
cds = """ATG ATG GAA GCC GAT CGA CCT GAA AAA CTT TTT ATT GGG GGT CTC AAT CTC AAG ACG GAT GAG AAG GCC CTC AAA GCA GAG TTC GGA AAG TAT GGC CAC ATT ATC AAG GTC TTT CTT ATG AAA GAC CGC AAG ACG AAC AAA TCT AGG GGT TTC GCA TTC GTA ACA TTT GAG TCC CCA GCC GAC GCG AAG GCA GCG GCG CGC GAT ATG AAC GGG AAA TAC CTC GAT GGC AAG GCG ATT ATG GTT GCG CAG ACT ATC AAG CCG GCC TTC AAA TCC AGT CGA TGG GTG CCG CCG ACC CCT GGC TCA GGT TCA AGA TCC AGG TTC AGC CAC AGG ACC AGA GGT GGA GGA TCC AGT CCG CAA CGC CCC CCA TCA CAG GGG CGG CCG GAC GAC GGT AGA GGA TAT GCA GGG TAT TTC GAT CTC TGG CCG TAT CGC GCC CCT ATG CCC CGC AAA CGC GGT CCC CCT CCA CGA CAC TGG GCA TCT CCA CCT CAT AAA CGG GCC ACG CCT TCC TCA TTG GCC CAT TCT GTA GGG TGC GGT ATG CGA GGA AAG GCC CCG ACA GTA AGT GGT CAA GAT GGA TAT TCA GGA TTG CAG CCC CGA AGG TGG GCA GGT CCG CCC CAT AAA CGA GCA GTC CCG CGC TCC TCT TTG GCC CGG ATC GGT GGA TCA GGC ATG CCA GGG AAG GCC CCA GCA GTG TGG GGG CAG GAT GGT TAC TCA GGA CCC CGA GTA CGA GAG CCA CTT CCG CCC TGC CGA GAC CCC GGC GAC TTT GTT CCA GCA CTG AGG GAT TAT AGC AGG CGC TAT TAC GGC CAT TCA TCT GTC CCC GAC TAT CGG CCG CTT AGG GGG GAT GGC AAC CAA AAC GGT TAT CGG GGT AGA GAC CAT GAG TAC ACA GAC CAT CCA TCA AAA GGG TCA TAT AGG GAG CCG CTG AAA TCC TAC GGT GGG CCT TGT GGG GCG GCG CCT GTA TGG GGC ACT CCT CCA TCC TAT GGC GGT GGG TGT CGC TAC GAG GAA TAT CAA GGG AAT TCC CCA GAT GCA TGC AGT GAG GGG AGG AGT TCC GAA GCT CTC CCA GTA GTT CTG CCT GAT GCC TAT TCA AGA GAC CAC TCC CCT AAG GCA TAT TCA GGC GGC AGA TCC AGT TCT TCC AAT GGA TAT TCA AGG TCA GAC CGC TAC GGG GAG GAG GGT TGT TAT GAA GAA TAT CGC GGA CGA TCT CCG GAC GCA CAC TCT GGA GGC AGG AAC AGC AGC TCA AAT TCA TAC GGT CAG TCT CAC CAT TAC GGT GGC GAA GGT AGG TAT GAG GAG TAC AGA GGC CGG AGC CAC GAA GCC AGA AGC GGG GGA CGA TCT ACC GAC GCC CAT AGT CGG GGT CGG TCC GAC GAC GCC TAC AGC GGT GGC CAC GAT TCA TCA AGT TGG AGT GAC TGT TGC GGG GGT GGA GGA AGG TAC GAG GAG TAT CAA GGG CGG TCA CTG GAT GCG AAT TCA GGC GGT TGC TCC CCC GAA GCA TAT AGT GGC GGG CAC GAC AAC 
AGC TCA TGG AGC GAC CGA TAT GGT GTA GGG GGA CAC TAT GAA GAG AAT AGA GGT CAC TCC CTG GAT GCG AAC TCA GGA GGT CGG TCT CCC GAT ACT CAT TCA GGA GGT CAC TCA TCA AGT TCA AAC AGT TAT GGC CAA TCT CAT CGC TAC GGT GGG GAG GGT AGA TAC GAA TAC AGG GGT AGA TCT CAT GAT GCG CAT AGT GGT GGA TGC TCC GCA GAC GCC TAC TCA GGT GGC CAC GAC AGT AGT TCA CAG AGC AAC CGG TAT GGG GGC GGT GGT TGC TAT GAA GAG TAT AGA GGT CGG AGT TTG GAC GCA AAT TCT GGG GGT CGG TCC CCA AAT GCC TAC AGT GGC GGT CAC GAT AGT TCC TCC TGG TCT CAC AGG TAC GGC GGA GGC GGC CGC TAC GAA GAG TAC CGG GGC AGG TCA TTG GAC GCC AAC TCT GGC GGT AGG AGC CCA GAC GCT TAC AGT GGC GGG CAC GAT TCC TCT GGA CAG TCA AAC TGC TAT GGA GGA GGT GGG CGC TAT GAG GAG TAC CGA GGC CGC CTT CTG GAT GCG AAC TCA GGT GGT CGG TCT CCA GAT GCG TAT AGT GGT GGA CAC GAT AGT TCC TCA CAA TCC AAC CGG TAT GGG GGA GGA GGG AGG TAT GAA GAG TAC CGA GGA CAC AGT CTC GAC GCT AAC AGT GGA GGG AGA AGT CCC GAT ACC TAC AGC AGG GGC CAT GAC TCA TCT AGT CAG TCT GAC CAT TAC GGC GGC GGA GGG CGA AGC CTT GAT GCA AAC TCT TCA GGA CGA CTT CCA GAT GCC TAT TCT GGA GGA CAC GAT TCT TCA TCA CGC AGC CAT CGC TAT GGC GGG GGC GGA CGC TAC GAG GAG TAC AGG GGA AGG TCA CTT GAT GCG AAT AGC GGG GGA CGG TCC CCG AAT GCG TAT AGC GGA GGA CAC AAT TCA TCT TCT CGG AAT GAT CCT TGC AGG GGT GGC GGA AGA TAT GAG GAG AAT CGC GGT CAC AGT CTG GAC GCT AAT TCA GGG GGG CAC AGC CCG AAC GCG TAC TCA GGG GGA AGG GAC AGC AGC AGC AAC TCA TAC GAT CGA TCA CAT CGG TAC GGG GGG GGT GGT CAC TAC GAG GAA TAC AGA GGT CGA TCC CAT GAC ACT CAT AGC AGG GGG AGG TCA CCA GAC GCG CAT TCT GGT GAT CAT TAT ACT GAA GCA TAC TCT CGA GGG AGG GAC TCC TTT AGC AAC AGC TAC GGC CGC TCC GAC CAC TAT GGA CGA GGC GGT TGT TAT GAA GAA TAT CAG GGT AGA AGC CCT AAC GCA TAT GGG GGG GGC CGC GGC CTC AAT TCT TCA AAC AAT TCA CAT GGG CGG TCT CAC AGG TAT GGG GGA GGG GGC CGG TAT GAA GAG TAT CGA GGT CCG TCA CCA GAC GCA CAT AGC GGG GGA AGG GAC TCC AGC ATA AAG AGT TAC GGA CTG TCT GAT AGA TAT GGT GGC GGC GGA CAT TAT GAG GAG TAT CAA GGT TCC CTG CCC GAT GCC TAT TCT GGG GAT 
CAC GAC AGA TCC TCT AAC AGT TAT GGT CGG TCC GAC CGC TAT TCA AGA GGA CGG GAT CGC GTA GGC AGG CCA GAC AGG GGC TTG CCT CTG CCT ATG GAG ACA GGA TCT CCG CCG TTG CAC GAC AGT TAC TCT AGG AGT GGG TGC CGG GTC CCG AGG GGT GGT GGT AGG CAG GGA GGC AGA TTC GAG AGA GGG GAG GGC CGA TCC CGG TAC """
# Remove ALL whitespace from the literal. str.split() with no argument splits
# on any run of whitespace; split(' ') left the literal's line breaks embedded
# in the sequence, which shifts the reading frame downstream.
cds = ''.join(cds.split())

# Codon -> one-letter residue for the 61 codons listed here (no stop codons).
# Insertion order is unchanged, so the per-residue codon lists built below keep
# the same order as before.
table = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "TAT": "Y", "TAC": "Y",
    "TGT": "C", "TGC": "C", "TGG": "W",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
}

# Residue -> list of codons encoding it (reverse of `table`).
rev_translate = collections.defaultdict(list)
for codon, aa in table.items():
    rev_translate[aa].append(codon)
import random
def get_random_codon(aa):
    """Return a randomly chosen codon that encodes the residue `aa`.

    Args:
        aa: One-letter residue code. It is converted with str() first, so
            string-like objects (the caller passes the result of
            Seq.translate()) work without relying on how they hash.

    Returns:
        One codon string picked with random.choice from rev_translate.

    Raises:
        IndexError: if `table` holds no codon for the residue (e.g. '*').
            The exception type is the one random.choice raised before on
            the empty list.
    """
    residue = str(aa)
    # .get() avoids the defaultdict inserting an empty list for unknown keys.
    codons = rev_translate.get(residue)
    if not codons:
        raise IndexError(f"No codon in table for residue {residue!r}")
    return random.choice(codons)
    
# Wrap the cleaned coding sequence and print it with its translation.
seq = Bio.Seq.Seq(cds)
print(seq)
print(seq.translate())
# NOTE(review): `ct` is not used anywhere in the visible code.
ct = Bio.Data.CodonTable.CodonTable

from Bio.Data import CodonTable
standard_table = CodonTable.unambiguous_dna_by_id[1]
print(standard_table)
# Bare expression in the middle of a cell: the value is computed and discarded.
standard_table.back_table['H']

# Re-encode the sequence codon by codon, picking a random codon from
# rev_translate for each translated residue.
# NOTE(review): `random` is not seeded in the visible code, so the output
# differs between runs.
randomized_codons = []
# NOTE(review): the range stops at len(seq) - 3, so the final codon is never
# visited when len(seq) is a multiple of 3 — confirm whether that is intended.
for n_codon in range(0, len(seq) - 3, 3):
    #print(n_codon, seq[n_codon:n_codon+3])
    codon = seq[n_codon:n_codon+3]
    # presumably a Seq object rather than a plain str — TODO confirm.
    aa = codon.translate()
    #print(aa)
    #print(standard_table.back_table[aa])
    #print(get_random_codon(aa))
    #if random.randint(0, 100) == 1:
    #    break
    randomized_codons.append(get_random_codon(aa))
    
    # Stop once 550 codons have been produced (hard-coded limit).
    if len(randomized_codons) >= 550:
        break
randomized_codons = ''.join(randomized_codons)
print(randomized_codons)