In [None]:
import re, os, glob, pandas, importlib, collections, pickle, matplotlib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import proteinLengths
import tcgaParser
import tcgaAnnotater
# Re-import the project helper modules so on-disk edits are picked up without
# restarting the kernel.
for _module in (tcgaAnnotater, proteinLengths, tcgaParser):
    importlib.reload(_module)

# Font type 42 for the PDF/PS backends, and Arial as the sans-serif face.
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})
matplotlib.rc('font', family='sans-serif', **{'sans-serif': ['Arial']})

In [None]:
def insert_newlines_into_svg_files(
        input_pattern: str = 'figs/unedited_lolipop/*.svg',
        output_folder: str = 'figs/edited_lolipop') -> None:
    """Write reformatted copies of SVG files with newlines before selected tags.

    Each file matching ``input_pattern`` is copied into ``output_folder`` with
    a newline inserted in front of every occurrence of the tags listed in
    ``newline_before``. A doubled ``.svg.svg`` suffix on the file name is
    collapsed to a single ``.svg``.

    Args:
        input_pattern: Glob pattern selecting the SVG files to reformat.
        output_folder: Folder that receives the reformatted files; it is
            created when missing.
    """
    newline_before = ['<g>', '</g>', '<text', '<circle', '<rect', '<line']
    os.makedirs(output_folder, exist_ok=True)
    for fname in glob.glob(input_pattern):
        outlines = []
        with open(fname) as fh:
            print(fname)
            for line in fh:
                # The tags are literal text, so plain replacement is enough.
                for tag in newline_before:
                    line = line.replace(tag, '\n' + tag)
                outlines.append(line)
        # Escaped and anchored: only a literal trailing '.svg.svg' is collapsed.
        basename = re.sub(r'\.svg\.svg$', '.svg', os.path.basename(fname))
        print(basename)
        with open(os.path.join(output_folder, basename), 'w') as fh:
            fh.write(''.join(outlines))
            
# Reformat every SVG under figs/unedited_lolipop/ into figs/edited_lolipop/.
insert_newlines_into_svg_files()

In [None]:
import get_protein_lengths_from_tcga_lolipop_svg_files as get_prot_lens
protein_lengths = get_prot_lens.get_protein_lengths_from_tcga_lolipop_svg_files()
# JUN and CDKN2A are removed from the table; an absent key is ignored.
for _excluded in ['JUN', 'CDKN2A']:
    protein_lengths.pop(_excluded, '')
# Rough mass per protein: 110 per amino acid, shown divided by 1000 as kDa.
for _name, _aa in protein_lengths.items():
    _kda = 110 * _aa
    print(f"{_name}\t~{_kda/1000} kDa")



In [None]:
"""
Make tcga csv files for mutation data by subsetting to the RBPs of interest.
This is as long as it is in order to use multiple input tcga csv files for a study that add genes.
"""
def read_into_dict(study_fname: str, our_rbps=()):
    """Load one study table and return its rows for the requested genes.

    Args:
        study_fname: Path to a tab-separated file. Lines starting with '#'
            are skipped and every column is read as a string.
        our_rbps: Gene symbols to keep, matched against the 'COMMON' column.

    Returns:
        Dict mapping gene symbol -> {column name: value}, built from the first
        row seen for each gene. An empty dict is returned (after printing a
        notice and the head of the table) when the file has no 'COMMON' column.
    """
    df = pandas.read_csv(study_fname, sep='\t', comment='#', dtype='str')
    # A missing 'COMMON' column is the expected failure; checking for it
    # explicitly avoids a bare except that would also hide unrelated errors.
    if 'COMMON' not in df.columns:
        print(study_fname, " could not be parsed.")
        print(df.head())
        return {}

    wanted = set(our_rbps)
    kept = df.loc[df['COMMON'].isin(wanted), :].drop_duplicates(subset='COMMON')
    kept.index = list(kept['COMMON'])
    return kept.to_dict(orient='index')

def make_tcga_api_output_files_with_only_our_rbps(
    input_folder: str, output_folder: str, our_rbps=[]):
    """Pickle one filtered gene dict per study file into output_folder.

    Every ``*.txt`` file directly under ``input_folder`` is passed to
    read_into_dict; studies that yield nothing are skipped. Each output file
    keeps the input's basename with any 'all_rbps_' substring removed.
    """
    os.makedirs(output_folder, exist_ok=True)

    study_fnames = list(glob.glob(f"{input_folder}/*.txt"))
    total = len(study_fnames)

    for position, study_fname in enumerate(study_fnames, start=1):
        print(f"Study {position}/{total}", end=', ')

        genes = read_into_dict(study_fname, our_rbps=our_rbps)
        if not genes:
            continue

        basename = os.path.basename(study_fname).replace('all_rbps_', '')
        with open(f"{output_folder}/{basename}", 'wb') as fh:
            pickle.dump(genes, fh)

def add_folder(
    input_folder: str, output_folder: str, our_rbps=[]):
    """Merge genes from a second folder of study files into existing pickles.

    Every ``*txt`` pickle already in ``output_folder`` is loaded. Each
    ``*.txt`` study file in ``input_folder`` is read with read_into_dict and,
    when its basename (with 'census_' removed) matches an existing pickle, the
    genes not yet present in that pickle are added. Genes already present are
    left untouched. All existing pickles are then written back.

    Args:
        input_folder: Folder holding the additional study files.
        output_folder: Folder holding the pickled per-study dicts to extend.
        our_rbps: Gene symbols forwarded to read_into_dict.
    """
    existing_fnames = glob.glob(f'{output_folder}/*txt')
    data = {}
    for fname in existing_fnames:
        # NOTE(review): pickle.load can execute code embedded in the file;
        # only point this at pickles written by this notebook.
        with open(fname, 'rb') as fh:
            data[os.path.basename(fname)] = pickle.load(fh)

    study_fnames = glob.glob(f"{input_folder}/*.txt")

    for n, study_fname in enumerate(study_fnames):
        print(f"Study {n+1}/{len(study_fnames)}", end=', ')

        _d = read_into_dict(study_fname, our_rbps=our_rbps)

        basename = re.sub('census_', '', os.path.basename(study_fname))

        if basename in data:
            # setdefault keeps the existing row when the gene is already known.
            for gene, row in _d.items():
                data[basename].setdefault(gene, row)

    for fname in existing_fnames:
        with open(fname, 'wb') as fh:
            pickle.dump(data[os.path.basename(fname)], fh)
                
# Build the per-study pickles from the Dec2019 outputs, then extend them with
# genes that only appear in the Jan2020 census outputs.
make_tcga_api_output_files_with_only_our_rbps(
    'all_TCGA_data/Dec2019_api_outputs/',
    output_folder='all_TCGA_data/Dec2019_api_outputs/our_rbps_dicts/',
    our_rbps=list(protein_lengths.keys()))

add_folder(
    'all_TCGA_data/Jan2020_census_api_outputs/',
    'all_TCGA_data/Dec2019_api_outputs/our_rbps_dicts/',
    our_rbps=list(protein_lengths.keys()))

# Reload every merged pickle, closing each file handle as soon as it is read.
fnames = glob.glob('all_TCGA_data/Dec2019_api_outputs/our_rbps_dicts/*txt')
d = {}
for fname in fnames:
    with open(fname, 'rb') as fh:
        d[fname] = pickle.load(fh)
# NOTE(review): this shells out to `cp` and ignores its exit status, so a
# failed backup goes unnoticed.
os.system('cp -r all_TCGA_data/Dec2019_api_outputs/our_rbps_dicts/ all_TCGA_data/our_rbps_dict_object_backups/')

# Write each study dict back out as a tab-separated table, one row per gene.
output_folder = 'all_TCGA_data/Dec2019_api_outputs/our_rbps/'
os.makedirs(output_folder, exist_ok=True)  # open() below fails if it is missing
for fname, dict_obj in d.items():
    with open(f"{output_folder}/{os.path.basename(fname)}", 'w') as fh:
        a = pandas.DataFrame.from_dict(dict_obj, orient='index')
        a.to_csv(fh, sep='\t', index=False)


In [None]:
import get_domain_locations_as_fractions_from_tcga_lolipop_files as get_domains

# Reformatted SVGs to scan; any path containing 'DIS3' or 'IGF2BP1' is skipped.
input_fnames = glob.glob('figs/edited_lolipop/*svg')
input_fnames = [_ for _ in input_fnames if 'DIS3' not in _ and 'IGF2BP1' not in _]

# Collect the output lines that get_rect produces for each file.
all_outlines = []
for fname in input_fnames:
    _, outlines = get_domains.get_rect(fname, verbose=False)
    all_outlines.extend(outlines)

# Write the table once, after all files are processed, instead of rewriting
# the whole file on every loop iteration.
os.makedirs('outputs', exist_ok=True)
with open('outputs/domain_positions_as_fractions.txt', 'w') as f:
    f.write("Gene\tDomain\tStart\tEnd\n")
    f.write(''.join(all_outlines))


In [None]:
# Map each cancer_study_id to its study name.
_study_table = pandas.read_csv('./all_TCGA_data/getCancerStudies_result.txt', sep='\t')
study_info = dict(zip(_study_table['cancer_study_id'], _study_table['name']))

def get_study_info(study_fname, study_info):
    """Look up the study name for a study file.

    The study id is the file's basename up to its first '.', with any
    'our_rbps_' substring removed. Returns ``study_info[id]`` when the id is
    known; otherwise prints a notice and returns 'Unknown'.
    """
    leaf = os.path.basename(study_fname) if '/' in study_fname else study_fname
    stem = leaf.split('.')[0]
    study_id = stem.replace('our_rbps_', '')

    if study_id not in study_info:
        print("Couldn't find study id for", stem)
        return 'Unknown'
    return study_info[study_id]
        
    

# Run the lookup for every study table; get_study_info prints a notice for
# each file whose id is not in study_info.
for fname in glob.glob('all_TCGA_data/Dec2019_api_outputs//our_rbps/*txt'):
    get_study_info(fname, study_info)


In [None]:
# Folder of per-study tables restricted to our RBPs.
tcga_dir = './all_TCGA_data/Dec2019_api_outputs/our_rbps/'

# Pick up any on-disk edits to tcgaParser before it is used below.
importlib.reload(tcgaParser)

# NOTE(review): this rebinds study_info, which an earlier cell set to an
# id -> name dict. Presumably the result describes the curated non-redundant
# studies; confirm the return type against tcgaParser.
study_info = tcgaParser.currated_set_of_nonredundant_studies(
        fname='./all_TCGA_data/currated_set_of_nonredundant_studies_list.txt',
        study_desc_fname='cancerLists/tcga_study_ids_and_descriptions.do')

# Loader built with missense_only=True; its by_mutation attribute is consumed
# by muts_as_array below.
dl = tcgaParser.dataLoader(study_info=study_info, missense_only=True)

def muts_as_array(by_mutation, lengths=None):
    """Convert per-mutation patient collections into per-residue count arrays.

    Args:
        by_mutation: Mapping of (gene, mutation string) -> sized collection of
            patients carrying that mutation.
        lengths: Mapping of gene -> protein length in residues. Defaults to
            the notebook-level ``protein_lengths``.

    Returns:
        Dict of gene -> 1-D float array in which index ``i`` holds the number
        of patients with a mutation at residue ``i + 1``. Arrays start at the
        protein's listed length and grow when a mutation lies beyond it.
        Genes missing from ``lengths`` and mutation strings without a
        parsable position are skipped. When 'RQCD1' is present, 'CNOT9' is
        added as an alias of the same array.
    """
    if lengths is None:
        lengths = protein_lengths

    # Residue number flanked by non-digit word characters. Excluding digits
    # from the flanks stops the regex engine from handing the last digit of
    # the position to the trailing class (e.g. 'R12*' being read as residue 1).
    position_re = re.compile(r'[^\W\d](\d+)[^\W\d]')

    gene_array = {}
    for (gene, mut), patients in by_mutation.items():
        if gene not in lengths:
            continue

        match = position_re.search(mut)
        if match is None:
            continue

        pos = int(match.group(1)) - 1  # 1-based residue -> 0-based index

        if gene not in gene_array:
            gene_array[gene] = np.zeros(lengths[gene])

        if pos < 0:
            # Residue 0 is not a valid position; report it and skip rather
            # than letting index -1 silently hit the last residue.
            print(gene, mut, pos)
            continue

        counts = gene_array[gene]
        if pos >= len(counts):
            counts = np.append(counts, np.zeros(1 + pos - len(counts)))
            gene_array[gene] = counts
        counts[pos] += len(patients)

    if 'RQCD1' in gene_array:
        gene_array['CNOT9'] = gene_array['RQCD1']
    return gene_array

def get_tcga_data_from_file(fname):
    """Print the shape and the first rows of one tab-separated study file.

    Lines starting with '#' are treated as comments. Nothing is returned.
    """
    table = pandas.read_csv(fname, sep='\t', comment='#')
    for summary in (table.shape, table.head()):
        print(summary)
    
def get_tcga_data(tcga_dir, dl):
    """Hand every '*txt' file under tcga_dir to dl, then print its PABPC4L keys.

    The file paths are passed as one set to ``dl.do_a_set_of_studies``.
    Afterwards the keys of ``dl.by_mutation`` whose first element is
    'PABPC4L' are printed.
    """
    study_paths = set(glob.glob(tcga_dir + '/*txt'))
    dl.do_a_set_of_studies(study_paths)
    pabpc4l_keys = [key for key in dl.by_mutation.keys() if key[0] == 'PABPC4L']
    print(pabpc4l_keys)
    
# Load every study into dl, list the attributes it now carries, then tally
# patients per residue for each gene.
get_tcga_data(tcga_dir, dl)
print(vars(dl).keys())
mutation_counts_as_arrays = muts_as_array(dl.by_mutation)

In [None]:
import pprint as pp
# Show the dl.mutation_to_studies entry recorded for the RARS2 R6C mutation.
pp.pprint(dl.mutation_to_studies[('RARS2', 'R6C')])

def classify_study(study_name):
    """Return the cancer-type keyword found in a study name, if any.

    The first parenthesised part of the name (with its leading spaces) through
    the last ')' is removed before matching. The raw and the stripped name are
    both printed.

    Returns:
        'Melanoma' or 'Breast Cancer' (checked in that order) when the
        stripped name contains it, otherwise None.
    """
    print(study_name)
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    study_name = re.sub(r' +\(.+\)', '', study_name)
    print(f"-{study_name}-")
    for keyword in ['Melanoma', 'Breast Cancer']:
        if keyword in study_name:
            return keyword
    return None
# Run classify_study on each study listed for RARS2 R6C. The return value is
# discarded, so only the function's own prints are shown.
for study in dl.mutation_to_studies[('RARS2', 'R6C')].keys():
    classify_study(study)

In [None]:
# Show dl's n_by_gene entry for SRSF2, then load our own mutation table from
# the 'All cancer' sheet.
print(dl.n_by_gene['SRSF2'])
our_snps = pandas.read_excel('RBP_high_freq_mutations.xlsx', sheet_name='All cancer')
def get_pos(_str):
    """Return the residue number embedded in a mutation string.

    The number must be preceded by at least one capital letter and followed
    by a capital letter, as in 'R6C'.

    Returns:
        The position as an int, or '' (after printing a notice) when the
        string contains no such pattern.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    m = re.search(r'[A-Z]+(\d+)[A-Z]', _str)
    if m is None:
        print("No SNP?", _str)
        return ''
    return int(m.group(1))

# Residue position per row; rows whose mutation string has no parsable
# position (get_pos returned '') are dropped.
our_snps['Pos'] = [get_pos(str(_)) for _ in our_snps['Mutation']]
our_snps = our_snps.loc[our_snps['Pos']!='', :]

# Per gene: the set of highlighted positions and the set of mutation strings.
gene_to_our_muts = collections.defaultdict(set)
gene_to_our_mut_string = collections.defaultdict(set)
for row in our_snps.to_dict('records'):
    gene_to_our_muts[row['Gene']].add(row['Pos'])
    gene_to_our_mut_string[row['Gene']].add(row['Mutation'])
# CNOT9 shares RQCD1's entries (same set objects).
if 'RQCD1' in gene_to_our_muts:
    gene_to_our_muts['CNOT9'] = gene_to_our_muts['RQCD1']
    gene_to_our_mut_string['CNOT9'] = gene_to_our_mut_string['RQCD1']
# Hand-set positions replace whatever the sheet provided for these genes.
gene_to_our_muts['FUBP1'] = set([430])
gene_to_our_muts['RBFOX1'] = set([49])
gene_to_our_muts['BCLAF1'] = set([163])
print(gene_to_our_muts['YTHDC2'])

# Position -> mutation string for YTHDC2. Initialised up front so the print
# below cannot hit an undefined name when there are fewer than two strings.
mut_strings = list(gene_to_our_mut_string['YTHDC2'])
pos_to_string = {}
if len(mut_strings) > 1:
    pos_to_string = dict(zip(
        [int(re.search(r'[A-Z](\d+)[A-Z]', x).group(1)) for x in mut_strings],
        mut_strings
        ))
print('>>>', pos_to_string)

print(len(gene_to_our_muts))

# Highest per-residue patient count for each gene, and the genes ordered by
# that count, largest first.
to_max = {name:np.max(arr) for name, arr in mutation_counts_as_arrays.items()}
by_max = sorted(to_max, key=to_max.get, reverse=True)
print(to_max)
print(by_max)
print(len(mutation_counts_as_arrays))
print(mutation_counts_as_arrays.keys())


In [None]:
# Load the domain table written earlier and group its rows by gene.
protein_domains = pandas.read_csv('outputs/domain_positions_as_fractions.txt', sep='\t')
print(protein_domains)
name_to_domain_list = collections.defaultdict(list)
for record in protein_domains.to_dict('records'):
    name_to_domain_list[record['Gene']].append(record)
# Looking up a gene in the defaultdict creates an empty entry when it is absent.
print(name_to_domain_list['PABPC4L'])


def domain_name_to_color(name, get_legend=False):
    """Return the plotting colour for a domain name, or build the legend.

    With ``get_legend=True`` the name is ignored and a legend with one swatch
    per category (KH, RRM, Zinc finger, Helicase, Other) is created through
    ``plt.legend`` and returned. Otherwise the first matching category's
    'Set2' palette colour is returned, or 'k' when nothing matches.
    """
    palette = sns.color_palette(palette='Set2', n_colors=4)
    if get_legend:
        swatches = tuple(
            plt.Rectangle((0,0), 0, 0, color=shade)
            for shade in list(palette) + ['k'])
        return plt.legend(swatches, ('KH', 'RRM', 'Zinc finger', 'Helicase', 'Other'))

    # Ordered category tests; the index doubles as the palette slot.
    category_tests = (
        lambda text: 'KH_' in text,
        lambda text: 'RRM' in text,
        lambda text: re.search('Zf', text, re.IGNORECASE) is not None,
        lambda text: 'DEAD' in text or 'Heli' in text,
    )
    for slot, matches in enumerate(category_tests):
        if matches(name):
            return palette[slot]
    return 'k'
    
    
def plot_gene(mut_arr, our_muts, ax, gene):
    """Draw one gene's lollipop panel: per-residue patient counts plus domains.

    Args:
        mut_arr: 1-D sequence of patient counts, one entry per residue.
        our_muts: Collection of 1-based residue numbers to draw in red and
            label with their mutation string.
        ax: Matplotlib Axes to draw on.
        gene: Gene symbol, used to look up labels in the notebook-level
            ``gene_to_our_mut_string`` and domains in ``name_to_domain_list``.

    A highlighted residue with no matching mutation string is drawn in red
    without a label.
    """
    y_rel = 0
    line_len = len(mut_arr)
    peak = max(mut_arr)
    # The gene body sits below the baseline, sized relative to the tallest bar.
    gene_body_y = peak * -0.07
    gene_body_y_height = peak * 0.05
    print(f"{gene_body_y}, {gene_body_y_height}")

    # Baseline at y = 0 and a thin backbone line through the gene body.
    ax.plot([0, line_len], [0, 0], 'k-', linewidth=0.5, zorder=1)
    backbone_y = gene_body_y - (gene_body_y_height * 0.2)
    ax.plot([0, line_len], [backbone_y, backbone_y], 'k-', linewidth=0.5)

    # Build the residue -> label lookup once, not once per highlighted residue.
    # .get avoids inserting an empty entry into the shared defaultdict.
    mut_strings = list(gene_to_our_mut_string.get(gene, ()))
    pos_to_string = {}
    if len(mut_strings) > 1:
        for mut_string in mut_strings:
            m = re.search(r'[A-Z](\d+)[A-Z]', mut_string)
            if m is not None:
                pos_to_string[int(m.group(1))] = mut_string

    for n, freq in enumerate(mut_arr):
        if n+1 in our_muts:
            ax.plot([n, n], [y_rel, freq+y_rel], 'r-', linewidth=0.4)
            if len(mut_strings) == 1:
                # A single known string labels whichever residue is highlighted.
                mut_label = mut_strings[0]
            else:
                mut_label = pos_to_string.get(n+1)
            if mut_label is not None:
                ax.text(n, freq+y_rel, s=mut_label, fontdict={'fontsize': 7})
        else:
            ax.plot([n, n], [y_rel, freq+y_rel], 'k-', linewidth=0.6)

    if gene in name_to_domain_list:
        # Domain Start/End are fractions of the protein length.
        for domain in name_to_domain_list[gene]:
            box_x = domain['Start'] * line_len
            box_width = (domain['End']-domain['Start']) * line_len
            ax.add_patch(
                plt.Rectangle(
                    (box_x, gene_body_y-gene_body_y_height), box_width, gene_body_y_height,
                    color=domain_name_to_color(domain['Domain']), zorder=3) )
    else:
        print(f"{gene} not in protein domain list {name_to_domain_list.keys()}")

    # One tick per axis: the tallest bar on y, the protein length on x.
    ax.set_yticks([peak])
    ax.set_xticks([len(mut_arr)])
    # tick_params replaces per-tick `tick.label` access, which is no longer
    # available in current Matplotlib.
    ax.tick_params(axis='both', which='major', labelsize=7)
    ax.set_ylim(gene_body_y-gene_body_y_height, ax.get_ylim()[1])

    
# Panel grid size; on_row is advanced by the drawing loop further down.
ax_rows = 6
ax_cols = 6
on_row = -1

# Save the domain colour legend as its own figure.
fig = plt.figure()
domain_name_to_color('', get_legend=True)
fig.savefig('./figs/legend_for_lolipop.pdf')
plt.show()
plt.clf()
plt.close()

fig, axes = plt.subplots(ax_rows, ax_cols)

# KPNB1 and DICER1 have stability data.
# A file is skipped when its path contains any of these names as a substring.
_excluded_genes = ['DIS3', 'IGF2BP1', 'UPF2', 'CDKN2A', 'JUN', 'BRCA1', 'BCLAF1', 'RPL10',]
input_fnames = [
    path for path in glob.glob('figs/edited_lolipop/*svg')
    if not any(name in path for name in _excluded_genes)]

def to_max(fname):
    """Return the highest per-residue patient count for the gene named in fname.

    The gene is the text between 'lolipop/' and '_lollipop.svg' in the path,
    looked up in the notebook-level ``mutation_counts_as_arrays``.
    """
    gene_name = re.search('lolipop/(.+)_lollipop.svg', fname)[1]
    gene_counts = mutation_counts_as_arrays[gene_name]
    return np.max(gene_counts)

# Genes with the tallest single-residue peak come first.
input_fnames = sorted(input_fnames, key=to_max, reverse=True)

for n, fname in enumerate(input_fnames):
    gene = re.search('lolipop/(.+)_lollipop.svg', fname).group(1)
    # Fill the grid row by row.
    on_row, on_col = divmod(n, ax_cols)
    print(fname)

    ax = axes[on_row][on_col]
    gene_counts = mutation_counts_as_arrays[gene]
    plot_gene(gene_counts, gene_to_our_muts[gene], ax, gene)

    # The gene name goes below the panel; the panel background is hidden.
    ax.set_title(gene, fontdict={'fontsize': 7}, y=-0.2)
    ax.patch.set_visible(False)

    ax.set_xlim(-1, len(gene_counts))
    # Only the first column carries a y-axis label.
    if on_col == 0:
        ax.set_ylabel('# Patients', fontdict={'fontsize': 7})

sns.despine(bottom=True)
fig.set_figheight(9)
fig.set_figwidth(9)
plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0.25, hspace=0.25)
fig.savefig('figs/combined_pops.pdf')
plt.show()
plt.clf()
plt.close()



In [None]:
# Find BCLAF1's most frequently mutated residue and show it next to our chosen
# position. The printed index is 0-based, while gene_to_our_muts holds the
# residue numbers as written in the mutation strings.
_bclaf1_counts = mutation_counts_as_arrays['BCLAF1']
max_val = max(_bclaf1_counts)
index_of_max_val = np.argmax(_bclaf1_counts)
print(f"{max_val} at {index_of_max_val}, our mut at {gene_to_our_muts['BCLAF1']}")

# Patient count for every BCLAF1 mutation key, as a table.
muts = [key for key in dl.by_mutation.keys() if key[0]=='BCLAF1']
counts = [len(dl.by_mutation[key]) for key in muts]
_d = dict(zip(muts, counts))
df = pandas.DataFrame.from_dict(data=_d, orient='index')


In [None]:
hess = pandas.read_excel('cancerLists/HessJM_2019.xlsx')
prots = pandas.read_excel('Sequencing_schemes_and_results.xlsx', sheet_name='midiprep amounts')
# Protein names are the first space-delimited token of each 'Protein' entry,
# with PCBP1 added by hand.
proteins = [entry.split(' ')[0] for entry in prots['Protein']]
proteins.append('PCBP1')
# Keep the Hess rows whose gene is one of our proteins.
df = hess.loc[hess.gene.isin(proteins), :]
print(df)