In [None]:
import matplotlib.pyplot as plt
import pandas, collections, re
import matplotlib_venn
from venn import venn
from typing import List, Mapping
import seaborn as sns


def lts(x):
    """Convert an iterable of gene names into a set of usable names ("list to set").

    Empty strings are dropped, as before. Missing values are dropped too:
    pandas represents an empty cell as NaN rather than '', so without this
    filter a NaN would be carried into every gene set built from a column
    with blank cells.

    Args:
        x: Iterable of hashable values, typically ``Series.tolist()``.

    Returns:
        A set containing every value of ``x`` that is not '', None or NaN.
    """
    # ``name == name`` is False only for NaN, which never equals itself.
    return {name for name in x if name is not None and name != '' and name == name}

# --- RNA-binding protein (RBP) gene lists -------------------------------------
amigo = lts(
    pandas.read_csv(
        'listsOfRbps/rna_binding_genes_from_amigo.txt', sep='\t', comment='#'
    )['Gene name'].tolist()
)
census = lts(
    pandas.read_excel(
        'listsOfRbps/rbp_table_from_rbp_census_nrg3818_s3.xls'
    )['gene name'].tolist()
)

# --- Cancer gene lists --------------------------------------------------------
bailey = lts(
    pandas.read_excel(
        'cancerLists/bailey_mh_cell_cancerDrivers.xlsx', sheet_name='Table S1'
    )['Gene'].tolist()
)
oncokb = lts(
    pandas.read_csv('cancerLists/cancerGeneList.txt', sep='\t')['Hugo Symbol'].tolist()
)
cosmic = lts(
    pandas.read_csv(
        'cancerLists/Census_allThu May 30 21_44_43 2019.tsv', sep='\t'
    )['Gene Symbol'].tolist()
)

# Unions: a gene counts if it appears on at least one list of the given kind.
any_cancer_list = cosmic | bailey | oncokb
any_rbp_list = census | amigo

In [None]:
def create_file_assigning_protein_domains_to_gene_names(
        uniprot_to_symbol_path='domains/uniprot_gene_id_to_symbol.txt',
        ensembl_to_symbol_path='domains/ensg_name_uniprot.txt',
        pfam_regions_path='domains/9606.tsv',
        output_path='domains/gene_name_to_domains.txt'):
    """Map each gene name to the set of protein domains found in its protein(s).

    The result is also written to ``output_path`` as a tab-separated file with
    the columns ``Gene`` and ``Domains`` (domains comma-separated and sorted,
    so the file is identical from run to run).

    Args:
        uniprot_to_symbol_path: TSV with columns ``From`` (sequence id) and
            ``To`` (gene name), from https://www.uniprot.org/mapping/.
        ensembl_to_symbol_path: TSV with columns ``UniProtKB Gene Name ID`` and
            ``Gene name``, from http://uswest.ensembl.org/biomart/. Entries here
            override those of ``uniprot_to_symbol_path`` for the same id.
        pfam_regions_path: TSV with columns ``<seq id>`` and ``<hmm name>``;
            lines starting with ``#`` are ignored. Source:
            ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/proteomes/9606.tsv.gz
        output_path: Where the gene -> domains table is written.

    Returns:
        ``collections.defaultdict(set)`` mapping gene name -> set of domain
        names. Sequence ids with no translation, or whose translated gene name
        is missing (NaN), are skipped.
    """
    ##################
    # Sequence id -> gene name translations, combined from the two files.
    transl = pandas.read_csv(uniprot_to_symbol_path, sep='\t')
    transl_d = dict(zip(transl['From'], transl['To']))

    transl2 = pandas.read_csv(ensembl_to_symbol_path, sep='\t')
    transl_d.update(zip(transl2['UniProtKB Gene Name ID'], transl2['Gene name']))

    ##################
    # Domains contained in each protein.
    df = pandas.read_csv(pfam_regions_path, sep='\t', comment='#')

    # Connect gene names to domains, translating each row's sequence id on the way.
    unmapped = object()  # private sentinel: cannot collide with a real gene name
    gene_to_domain = collections.defaultdict(set)
    for seqid, domain in zip(df['<seq id>'], df['<hmm name>']):
        symbol = transl_d.get(seqid, unmapped)
        if symbol is unmapped or pandas.isna(symbol):
            continue
        gene_to_domain[symbol].add(domain)

    # Write a file with these mappings.
    lines = ['Gene\tDomains\n']
    for symbol, domains in gene_to_domain.items():
        # Sorting makes the output independent of set iteration order.
        lines.append(f"{symbol}\t{','.join(sorted(domains))}\n")
    with open(output_path, 'w') as f:
        f.writelines(lines)

    return gene_to_domain

# Build the gene name -> domain-set lookup used by the cells below. The call also
# (re)writes domains/gene_name_to_domains.txt every time this cell is run.
gene_to_domain = create_file_assigning_protein_domains_to_gene_names()

In [None]:
# Genes on the `cosmic` list that also appear on at least one RBP list.
# Computed once and sorted so the printed order is the same on every run
# (set iteration order changes between kernel restarts). `key=str` keeps the
# sort safe even if a non-string value slipped into the gene sets.
cosmic_rbps = sorted(cosmic & any_rbp_list, key=str)
print(cosmic_rbps)
for name in cosmic_rbps:
    # `.get` returns None for genes with no entry in the domain lookup.
    print(name, gene_to_domain.get(name))
    
#bailey_amigo = bailey.loc[[bool(x in amigo) for x in bailey.Gene], :]
#bailey_scc = bailey_amigo.loc[bailey['Cancer']=='PANCAN', :]
#bailey_amigo_uniq = bailey_amigo.drop_duplicates('Gene')

In [None]:
# Domain names that count as RNA-binding domains (RBDs). The list holds both
# raw names (e.g. 'RRM_1', 'KH_1', 'DEAD') and the collapsed names that
# collapse_similar_domains() maps them to ('RRM', 'KH', 'Helicase_C').
# 'ATPase' is currently disabled.
rbds = [
    'dsrm',
    'RRM_1',
    'KH_1',
    'PABP',
    'Clp1',
    'CLIP1',
    'SF3b1',
    'Dicer',
    'Helicase_C',
    'DEAD',
    'RRM_7',
    'CEBP',
    'Telomerase_RBD',
    'Xpo',
    'La',
    'NPM',
    # 'ATPase',
    'RRM',
    'KH',
]



def collapse_similar_domains(str_or_list):
    """Reduce domain names to a set of coarser family names.

    Each name is first looked up in a small alias table (e.g. 'RRM_1' -> 'RRM',
    'DEAD' -> 'Helicase_C'). If the result then contains one of the family
    patterns ('ATPase', 'FERM', 'nuclease', 'HSP', 'zf-'; case-insensitive,
    checked in that order) the pattern itself becomes the name. Anything else
    is kept unchanged.

    Args:
        str_or_list: A single domain name, or an iterable of domain names.

    Returns:
        The set of collapsed names. A single string yields a one-element set.

    Raises:
        AssertionError: If any individual name is not a ``str``.
    """
    aliases = {
        'DEAD': 'Helicase_C',
        'KH_1': 'KH',
        'RRM_1': 'RRM',
        'RRM_7': 'RRM',
        'MH1': 'MH1/2',
        'MH2': 'MH1/2',
    }
    family_patterns = ('ATPase', 'FERM', 'nuclease', 'HSP', 'zf-')

    def collapse_name(raw_name):
        assert type(raw_name) is str

        name = aliases.get(raw_name, raw_name)
        # First pattern found anywhere in the (aliased) name wins.
        family = next(
            (pat for pat in family_patterns if re.search(pat, name, re.IGNORECASE)),
            None,
        )
        return name if family is None else family

    # Treat a bare string as a one-item collection rather than iterating its characters.
    items = [str_or_list] if type(str_or_list) is str else str_or_list
    return {collapse_name(item) for item in items}

def split_proteins_with_and_without_rbds(gene_list):
    """Partition genes by whether any of their collapsed domains is listed in `rbds`.

    Domains are looked up in the module-level `gene_to_domain` mapping; a gene
    with no entry there is treated as having the single domain 'No_domain'.
    Names are collapsed with collapse_similar_domains() before being compared
    against `rbds`.

    Args:
        gene_list: Iterable of gene names. It is traversed exactly once, so a
            generator works as well as a list or set.

    Returns:
        Tuple ``(proteins_with_rbds, proteins_without_rbds)`` of two disjoint
        sets that together contain every gene in ``gene_list``.
    """
    rbd_names = set(rbds)  # built once instead of once per gene

    proteins_with_rbds, proteins_without_rbds = set(), set()
    for gene in gene_list:
        gene_domains = collapse_similar_domains(gene_to_domain.get(gene, 'No_domain'))
        if rbd_names & gene_domains:
            proteins_with_rbds.add(gene)
        else:
            proteins_without_rbds.add(gene)

    return proteins_with_rbds, proteins_without_rbds

def count_domains(names: set, min_proteins: int = 3) -> Mapping[str, int]:
    """Count, for a set of gene names, how many proteins carry each domain.

    Domain names are collapsed with collapse_similar_domains(); genes missing
    from `gene_to_domain` get the single domain 'No_domain'.

    Side effects:
        * prints the gene -> domains mapping and two summary lines;
        * saves and shows a pie chart of RBD / zinc-finger-only / neither
          (./figs/pie_chart_RBDs.svg);
        * prints the stacked-bar table, then saves and shows the stacked bar
          chart (./figs/stacked_bar_chart_domain_counts.pdf).

    Args:
        names: Gene names to analyse.
        min_proteins: Threshold for the stacked bar chart. Within each group
            (has RBD / no RBD) a domain needs at least this many proteins to
            be shown by name; proteins with no such domain are pooled under
            '< {min_proteins}'. Columns whose total over both groups is below
            the threshold are dropped.

    Returns:
        Mapping of collapsed domain name -> number of proteins in ``names``
        that have it. Only domains that actually occur are included.
    """
    names = set(names)  # tolerate any iterable; it is traversed more than once below

    prot_to_doms = {
        gene_name: collapse_similar_domains(gene_to_domain.get(gene_name, 'No_domain'))
        for gene_name in names}

    print(prot_to_doms)

    # Invert the mapping: domain -> proteins that contain it.
    dom_to_prots = collections.defaultdict(set)
    for prot, doms in prot_to_doms.items():
        for dom in doms:
            dom_to_prots[dom].add(prot)

    proteins_with_rbd, proteins_without_rbd = split_proteins_with_and_without_rbds(names)

    print(f"""Genes with rbd: {len(proteins_with_rbd)}. \
        Genes without rbd: {len(proteins_without_rbd)}""")

    # Proteins whose only nucleic-acid-binding candidate is a zinc finger.
    zf_only = {prot for prot, doms in prot_to_doms.items() if 'zf-' in doms} - set(
        proteins_with_rbd)
    neither = proteins_without_rbd - zf_only

    print(f"""Of the {len(set(names))} cancer-associated RBPs, only {len(proteins_with_rbd)} \
        had an clear RBD, \
        {len(zf_only)} had only a zinc finger, \
        and {len(neither)} had neither.\n""")

    ###########
    # Pie chart of how many proteins have RBDs.
    fig, ax = plt.subplots()
    ax.pie([len(proteins_with_rbd), len(zf_only), len(neither)],
           labels=[f'RBD {len(proteins_with_rbd)}',
                   f'Zinc finger only {len(zf_only)}',
                   f'No RBD {len(neither)}'], autopct='%1.f%%',
           startangle=90, wedgeprops={'linewidth': 0})
    ax.axis('equal')
    fig.savefig('./figs/pie_chart_RBDs.svg')
    plt.show()
    plt.close(fig)  # release the figure instead of leaving it registered with pyplot

    ###########
    # Dataframe for stacked bar chart.
    def domains_over_threshold(gene_set: set) -> set:
        """Domains present in at least `min_proteins` proteins of `gene_set`."""
        return {dom for dom, prots in dom_to_prots.items()
                if len(prots & gene_set) >= min_proteins}

    def tally_domain_combinations(gene_set: set, label: str) -> dict:
        """Count proteins of `gene_set` per combination of frequent domains."""
        row = {'Has RBD?': label}
        frequent = domains_over_threshold(gene_set)  # same for every gene: compute once
        for gene in gene_set:
            incl_doms = prot_to_doms[gene] & frequent
            column = '/'.join(sorted(incl_doms)) if incl_doms else f'< {min_proteins}'
            row[column] = row.get(column, 0) + 1
        return row

    combined = [
        tally_domain_combinations(proteins_with_rbd, 'Has RBD'),
        tally_domain_combinations(proteins_without_rbd, 'No RBD'),
    ]
    domain_table = pandas.DataFrame(combined).set_index('Has RBD?').fillna(0)

    # Keep only the columns whose total over both groups reaches the threshold.
    domain_table = domain_table.loc[:, domain_table.sum() >= min_proteins]
    print(domain_table)

    ###########
    # Stacked bar graph of the number of proteins with each given domain.
    sns.set_style('ticks')
    fig, ax = plt.subplots(figsize=(1, 1.5))
    # Drawing on an explicit Axes stops pandas from opening a second figure.
    domain_table.plot(kind='bar', stacked=True, linewidth=0, fontsize=4, ax=ax)

    sns.despine(ax=ax)
    ax.set_ylabel('# Proteins with domain')
    fig.savefig('./figs/stacked_bar_chart_domain_counts.pdf')
    plt.show()
    plt.close(fig)

    return {dom: len(prots) for dom, prots in dom_to_prots.items()}


# Domain -> protein count for the `cosmic` genes that are on any RBP list.
domain_counts = count_domains(cosmic & any_rbp_list)

# One row per domain, a single column with the number of proteins.
_df = pandas.DataFrame.from_dict(domain_counts, orient='index')
_df.columns = ['N_domains']
_df.index.name = 'Domains'

# Plot only the domains found in more than one protein.
s = _df.loc[_df['N_domains'] > 1, :]
sns.set()
s.T.plot(kind='bar', stacked=True)
plt.show()
plt.clf()
plt.close()


In [None]:
from pprint import pprint

# Genes on both the `cosmic` and `census` lists, split by whether they have a listed RBD.
proteins_with_rbd, proteins_without_rbd = split_proteins_with_and_without_rbds(cosmic & census)

# Collapsed domains of the genes without an RBD ('No_domain' when the gene has no entry).
prot_to_doms = dict(
    (gene_name, collapse_similar_domains(gene_to_domain.get(gene_name, 'No_domain')))
    for gene_name in proteins_without_rbd
)
pprint(prot_to_doms)