In [None]:
import os, re, matplotlib, pandas, collections, importlib
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
import pandas, re
import seaborn as sns
import scipy as sp
from typing import List, Union, Mapping, Tuple
from pprint import pprint

import heatmaper as hm
import xlLoader
import stabilityLoader
importlib.reload(stabilityLoader)
# Root directory that the workbook and figure paths in this notebook are built from.
# NOTE(review): absolute, user-specific path. It already ends in '/', so the
# f"{pma_dir}/..." strings used below contain a doubled slash.
pma_dir = '/Users/dp/pma/'

# Font type 42 for the PDF and PostScript backends, and Arial as the sans-serif face.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Arial']})

# Protein names used below to subset the literature table and the
# driver-statistics heatmap.
lolipop_proteins = [
    'SF3B1', 'SMAD4', 'U2AF1', 'SRSF2', 'PCBP1', 'CNOT9',
    'RARS2', 'A1CF', 'EEF1B2', 'EIF1AX', 'KHDRBS2', 'FUBP1',
    'RBM11', 'HNRNPCL1', 'DDX3X', 'DICER1', 'PABPC4L', 'RPL5',
    'CRNKL1', 'RBM39', 'DCP1B', 'RBFOX1', 'TDRKH', 'NOVA1',
    'DDX50', 'YTHDC2', 'KPNB1', 'NUFIP1', 'CNOT1']        


In [None]:
################################################
# Plot general data from literature
# Only takes the General_data_from_literature.xlsx as input.
################################################

def load_general_data_from_literature(
    excel_fname=f"{pma_dir}/RBP missense mutations/General_data_from_literature.xlsx"
    ) -> Tuple[pandas.DataFrame, List[Mapping]]:
    """Load general protein information from an Excel file and encode it for plotting.

    Rows whose 'Skip' cell is filled in are dropped, the table is indexed by
    'Protein', and only proteins listed in the module-level ``lolipop_proteins``
    are kept. The categorical columns 'Has RBD?', 'Direct RBP?', 'Known RBP?',
    'RBP by our method?' (stored as 'RBP by easyCLIP?') and 'Function' are
    replaced by the output of ``hm.make_categorical_column_numeric``.

    Args:
        excel_fname: Path of the workbook to read. The default is built from
            ``pma_dir`` when the function is defined.

    Returns:
        A pair ``(info, mappings)``. ``info`` is the edited DataFrame.
        ``mappings`` is a list of the five reverse mappings returned by
        ``hm.make_categorical_column_numeric`` in the order: has-RBD, direct,
        known-RBP, RBP-by-our-method, function.
        NOTE(review): the mappings are used elsewhere as ``mapping[int(code)]``,
        so they are presumably int -> text; confirm against ``heatmaper``.
    """

    # Pick up edits to the helper module without restarting the kernel.
    importlib.reload(hm)
    info = pandas.read_excel(excel_fname)

    # Keep rows with an empty 'Skip' cell. Copy so that the column assignments
    # below write to an independent frame rather than to a slice.
    info = info.loc[info['Skip'].isna().to_numpy(), :].copy()
    info.index = info.loc[:, 'Protein']

    # !!!
    # Remove proteins not in the lolipop:
    info = info.loc[info['Protein'].isin(lolipop_proteins).to_numpy(), :].copy()
    del info['Protein']

    # Keep the original text of 'Has RBD?' before it is replaced by codes.
    info.loc[:, 'Has RBD? (Text)'] = info.loc[:, 'Has RBD?']
    info.loc[:, 'Has RBD?'] = hm.nan_to_no(info.loc[:, 'Has RBD?'])

    info.loc[:, 'Has RBD?'], has_rbd_int_to_text_dict = hm.make_categorical_column_numeric(
        info.loc[:, 'Has RBD?'], return_reverse_mapping=True)

    info.loc[:, 'Direct RBP?'], direct_int_to_text_dict = hm.make_categorical_column_numeric(
        info.loc[:, 'Direct RBP?'], return_reverse_mapping=True)

    # The sort runs while 'Known RBP?' still holds its original values, i.e.
    # before hm.nan_to_no and the numeric encoding are applied to it.
    info = info.sort_values(by=['Has RBD?', 'Known RBP?'])

    info.loc[:, 'Known RBP?'] = hm.nan_to_no(info.loc[:, 'Known RBP?'])
    info.loc[:, 'Known RBP?'], known_rbp_int_to_text_dict = hm.make_categorical_column_numeric(
        info.loc[:, 'Known RBP?'], return_reverse_mapping=True)
    # The encoded 'RBP by our method?' column is stored under a new name.
    info.loc[:, 'RBP by easyCLIP?'], rbp_by_our_method_int_to_text_dict = hm.make_categorical_column_numeric(
        info.loc[:, 'RBP by our method?'], return_reverse_mapping=True)
    info.loc[:, 'Function'], description_int_to_text_dict = hm.make_categorical_column_numeric(
        info.loc[:, 'Function'], return_reverse_mapping=True)
    return info, [has_rbd_int_to_text_dict, direct_int_to_text_dict, known_rbp_int_to_text_dict,
                 rbp_by_our_method_int_to_text_dict, description_int_to_text_dict]

# Load the literature table with the default path and unpack the five reverse
# mappings in the order the loader returns them.
info, [has_rbd_int_to_text_dict, direct_int_to_text_dict, known_rbp_int_to_text_dict, 
       rbp_by_our_method_int_to_text_dict, description_int_to_text_dict,
      ] = load_general_data_from_literature()

# The row order of `info` is reused further down as the protein order for the
# driver-statistics heatmap.
order_of_plotting_for_category = list(info.index)
print(f"Plotting {len(order_of_plotting_for_category)} proteins: {order_of_plotting_for_category}")

In [None]:
# Proteins to look up in the driver-statistics tables: every non-empty string
# in the 'Protein' column of the 'All' sheet whose 'Skip' cell is empty.
# Only this list is built here; an earlier read of a second workbook was
# overwritten before use and has been dropped.
literature_all = pandas.read_excel(
    '/Users/dp/pma/RBP missense mutations/General_data_from_literature.xlsx', sheet_name='All')
studied_rbps = literature_all.loc[literature_all['Skip'].isna().to_numpy(), 'Protein'].tolist()
studied_rbps = [x for x in studied_rbps if (isinstance(x, str) and x != '')]

def load_tok_statistics() -> Mapping[str, pandas.DataFrame]:
    """Read every sheet of 'cancerLists/pnas.1616440113.sd04.xlsx' into a dict.

    The path is relative to the current working directory. The sheet names
    are printed as a side effect.

    Returns:
        A dict mapping each sheet name to the DataFrame parsed from that sheet.
    """
    # The context manager closes the workbook handle once all sheets are parsed.
    with pandas.ExcelFile('cancerLists/pnas.1616440113.sd04.xlsx') as tok_f:
        print(tok_f.sheet_names)
        return {sheet_name: tok_f.parse(sheet_name) for sheet_name in tok_f.sheet_names}

#tok = load_tok_statistics()

# (sheet name, column name) pairs. load_stats_into_df() reads column `col` of
# sheet `method` for each pair and labels the resulting row "method, col".
paired_col_names = [
    ('2020+', 'driver q-value'), 
    ('2020+', 'oncogene q-value'),
    ('TUSON', 'TUSON.combined.qvalue.TSG'), ('MutsigCV', 'q'), ('OncodriveFM', 'QVALUE'),
    ('OncodriveCLUST', 'QVALUE'), ('OncodriveFML', 'qvalue'), ('ActiveDriver', 'fdr'), ('MuSIC', 'FDR FCPT')
]

def name_cleaning(col):
    """Return a shorter display label for a "method, column" statistics name.

    Three rewrites are applied in order:
      * the literal 'TUSON.combined.qvalue.TSG' becomes 'TSG q-value';
      * 'QVALUE' becomes 'q-value';
      * a label ending in 'MutsigCV, q' becomes 'MutsigCV, q-value'.

    Args:
        col: Label to clean.

    Returns:
        The cleaned label. Labels matching none of the patterns are returned
        unchanged, and cleaning an already-cleaned label is a no-op.
    """
    # Plain string replacement: the dots in the TUSON column name are literal
    # characters, not regex wildcards.
    col = col.replace('TUSON.combined.qvalue.TSG', 'TSG q-value')
    col = col.replace('QVALUE', 'q-value')
    # Anchored at the end so 'MutsigCV, q-value' is not rewritten a second time.
    col = re.sub(r'MutsigCV, q$', 'MutsigCV, q-value', col)
    return col

def load_stats_into_df(tok, paired_col_names, genes=None):
    """Collect selected statistics columns for the studied genes into one table.

    Args:
        tok: Mapping of sheet name -> DataFrame. Every frame must have a
            'Gene' column.
        paired_col_names: Iterable of ``(sheet name, column name)`` pairs to
            extract.
        genes: Gene names to keep. Defaults to the module-level
            ``studied_rbps`` list.

    Returns:
        A DataFrame with one row per pair, labelled ``"sheet, column"``, and
        one column per gene. Missing cells in a sheet are filled with 1.5.
        Genes absent from a sheet get the value 1.0, unless their name
        contains a space, in which case they are left out for that pair.
    """
    if genes is None:
        genes = studied_rbps
    genes = list(genes)
    wanted = set(genes)

    stok = {}
    for n in tok:
        print(n)
        sheet = tok[n]
        # Copy the subset so indexing and filling never write to a slice of the input.
        subset = sheet.loc[sheet['Gene'].isin(wanted).to_numpy(), :].copy()
        subset.index = subset['Gene']
        stok[n] = subset.fillna(1.5)

    _dict = {}
    for (method, col) in paired_col_names:
        print(method, col)
        label = ', '.join([method, col])
        present = stok[method]
        _dict[label] = dict(zip(present.index, present[col]))
        # Genes that are absent from this sheet get the placeholder 1.0.
        for protein in genes:
            if (protein not in present.index) and (' ' not in protein):
                _dict[label][protein] = 1.

    m = pandas.DataFrame(_dict)
    return m.T

print(studied_rbps)

# Load the driver-statistics workbook unless an earlier run already left it in
# the kernel, so the cell also works after Restart & Run All.
if 'tok' not in globals():
    tok = load_tok_statistics()

m = load_stats_into_df(tok, paired_col_names)
new_col = {col: name_cleaning(col) for col in m.index}
print(m, new_col)
m.rename(index=new_col, inplace=True)

# Keep proteins in the literature-table order, then only those in the lolipop list.
m = m.loc[:, [_ for _ in order_of_plotting_for_category if _ in m.columns]]
m = m.loc[:, [_ in lolipop_proteins for _ in m.columns]]
# Proteins become rows, statistics become columns.
m = m.T

g = sns.clustermap(m, cmap='binary_r', xticklabels=True, vmax=1, row_cluster=True, )
g.ax_row_dendrogram.set_visible(False)
g.ax_col_dendrogram.set_visible(False)

# Row order chosen by the clustering, read back from the heatmap tick labels.
p_value_protein_order = [t.get_text() for t in g.ax_heatmap.get_yticklabels()]
print(">>>", p_value_protein_order, len(p_value_protein_order))

# sns.clustermap creates its own figure; size and save that one rather than a
# separately created empty figure.
fig = g.ax_heatmap.figure
fig.set_figwidth(3)
fig.set_figheight(10)
fig.savefig(f'{pma_dir}/dataAndScripts/clip/figs/heatmap_stats_of_missense_RBPs.pdf')
plt.show()
plt.clf(); plt.close()


In [None]:
def domain_name_to_color(name):
    """Return the colour used for a protein domain name.

    The first matching rule wins:
      1. name contains 'KH'
      2. name contains 'RRM'
      3. name contains 'Zf' (case-insensitive)
      4. name contains 'DEAD' or 'Heli'
    Each rule maps to one of the four seaborn 'Set2' colours, in that order.
    Any other name yields 'k'.
    """
    kh_color, rrm_color, zf_color, helicase_color = sns.color_palette(palette='Set2', n_colors=4)

    if 'KH' in name:
        return kh_color
    if 'RRM' in name:
        return rrm_color
    if re.search('Zf', name, re.IGNORECASE) is not None:
        return zf_color
    is_helicase = ('DEAD' in name) or ('Heli' in name)
    return helicase_color if is_helicase else 'k'

# Four-colour seaborn 'Set2' palette (the same call domain_name_to_color makes).
pal = sns.color_palette(palette='Set2', n_colors=4)
# Domain label -> colour; 'None' and 'Other' both map to 'k'.
domain_color = {
    'KH': pal[0], 'RRM': pal[1], 'Zf': pal[2], 'Helicase': pal[3], 'None': 'k', 'Other': 'k',
}
#print(info.loc[:, 'Function'])
#info.loc[:, 'RBP by our method?'] = hm.nan_to_no(info.loc[:, 'RBP by our method?'])

def make_annot_text(g, mapping, truncate_to=8, left_and_black=False):
    """Replace numeric heatmap annotations with their text labels, in place.

    Args:
        g: Object with a ``texts`` sequence of annotation objects. In this
            notebook it is the Axes returned by ``sns.heatmap(..., annot=True)``.
        mapping: Lookup from integer code to label text.
        truncate_to: Labels longer than this many characters are cut to that
            length and get a trailing '.'.
        left_and_black: If true, each annotation is also left-aligned and
            coloured black.

    Raises:
        KeyError: If an annotation's code is missing from ``mapping``.
        ValueError: If an annotation's text is not a number.
    """
    for text in g.texts:
        # Go through float() first so exponent-formatted annotations such as
        # '1e+02' are accepted as well as plain integers.
        code = int(float(text.get_text()))
        text_for_fig = mapping[code]
        if len(text_for_fig) > truncate_to:
            text_for_fig = text_for_fig[:truncate_to] + '.'
        text.set_text(text_for_fig)
        if left_and_black:
            text.set_horizontalalignment('left')
            text.set_color('k')



# One single-column heatmap per attribute, side by side.
fig, ax = plt.subplots(1, 5)
muted = sns.color_palette("muted")
cmap = sns.color_palette(palette=muted, desat=0.3)

# (column, code -> text mapping, show protein names on the y axis?) for the
# four categorical panels. Only the leftmost panel carries the protein names.
categorical_panels = [
    ('Has RBD?', has_rbd_int_to_text_dict, True),
    ('Direct RBP?', direct_int_to_text_dict, False),
    ('Known RBP?', known_rbp_int_to_text_dict, False),
    ('RBP by easyCLIP?', rbp_by_our_method_int_to_text_dict, False),
]
for panel_ax, (column, int_to_text, show_protein_names) in zip(ax, categorical_panels):
    g = sns.heatmap(
        info.loc[:, [column]], ax=panel_ax, xticklabels=True, cmap=cmap,
        annot=True, cbar=None, yticklabels=show_protein_names)
    panel_ax.set_ylabel("")
    make_annot_text(g, int_to_text)

# The 'Function' panel uses a one-colour grey palette and left-aligned black
# text that is effectively not truncated (limit of 100 characters).
function_cmap = sns.color_palette(palette=sns.color_palette('Greys', 1))
g = sns.heatmap(
    info.loc[:, ['Function']], ax=ax[4], xticklabels=True, cmap=function_cmap,
    annot=True, cbar=None, yticklabels=False)
ax[4].set_ylabel("")
make_annot_text(g, description_int_to_text_dict, truncate_to=100, left_and_black=True)

# Same tick styling on every panel: horizontal protein names, vertical column
# names placed on top, and no tick marks.
for ax_ in ax:
    ax_.set_yticklabels(ax_.get_yticklabels(), rotation='horizontal')
    ax_.set_xticklabels(ax_.get_xticklabels(), rotation='vertical')
    ax_.xaxis.tick_top()
    ax_.tick_params(left=False, bottom=False, top=False)

fig.set_figheight(12)
fig.set_figwidth(5)

plt.savefig(f'{pma_dir}/dataAndScripts/clip/figs/statistics_on_missense_heatmap.pdf')
plt.show()

plt.clf(); plt.close()



In [None]:
import xlSignificance, xlLoader, importlib, pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

importlib.reload(xlSignificance)
importlib.reload(xlLoader)

pma_dir = '/Users/dp/pma/'

def _loady() -> pandas.DataFrame:
    """Return the '% XL (minimal region)' rows for proteins with enough RNA.

    Reads ``{pma_dir}/percentCrosslinked.xlsx`` through ``xlLoader`` and keeps
    the '% XL (minimal region)' rows of the ``recurrent`` table whose WT
    protein (the text before the first space in 'Protein') has a mean
    'fmol RNA (minimal region)' value above 3. The loader's
    ``as_fraction_of_wt`` and ``add_pvalues_using_ttest`` are then called on
    the result.
    NOTE(review): those two calls presumably add the 'Value vs WT',
    'log2 Value vs WT' and 'P value (XL)' columns read later in this
    notebook; confirm in ``xlLoader``.
    """
    loader = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
    _all_rows, recurrent = loader.load()

    # Protein -> mean fmol RNA in the minimal region, used to drop low-RNA samples.
    fmol_rows = recurrent.loc[recurrent['Label'] == 'fmol RNA (minimal region)', :].copy()
    mean_fmol = dict(fmol_rows.groupby('Protein')['Value'].mean())

    # Minimal-region % XL rows only.
    xl = recurrent.loc[recurrent['Label'] == '% XL (minimal region)', :].copy()

    # Keep a row when its WT protein averages more than 3 fmol. A WT name
    # missing from the fmol table raises KeyError.
    has_enough_rna = [mean_fmol[name.split(' ')[0]] > 3 for name in xl.Protein]
    xl = xl.loc[has_enough_rna, :]

    # Both calls are made for their effect on `xl`; the returned dict is not used.
    loader.as_fraction_of_wt(xl)
    loader.add_pvalues_using_ttest(xl, return_dict=True)

    return xl

xl = _loady()

# Keep rows with Value > 0.01, then only mutants (names containing a space).
mut_only = xl.loc[[x>0.01 for x in xl['Value']], :]
mut_only = mut_only.loc[[' ' in x for x in mut_only.Protein], :]

# Per-mutant array of the 'Value vs WT' entries (not used further in this cell).
value_vs_wt_by_prot_mut_only = mut_only.groupby('Protein')['Value vs WT'].apply(np.array)

# Mean log2 fold change per mutant, averaged over its rows.
log2vsWT = mut_only.groupby('Protein')['log2 Value vs WT'].apply(np.mean)

# Convert to a dataframe and name some columns.
log2vsWT = pandas.DataFrame(log2vsWT)
log2vsWT.columns = ['log2vsWT']
log2vsWT['Protein'] = log2vsWT.index

# Mean P value per mutant, as a dict.
pvals = mut_only.groupby('Protein')['P value (XL)'].apply(np.mean)
pvals = dict(pvals)

# Add the P values.
log2vsWT['-log10(P value)'] = [-np.log10(pvals.get(_)) for _ in log2vsWT.index]

print(log2vsWT)

# Make a volcano plot of significance vs fold change MUT/WT.
fig = plt.figure()

# Remove a comparison that isn't with a mutant.
log2vsWT = log2vsWT.loc[[x!='PCBP1 (uORF)' for x in log2vsWT.Protein], :]

# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
p1 = sns.scatterplot(
    x=log2vsWT['log2vsWT'], y=log2vsWT['-log10(P value)'], color='k', alpha=0.5, s=50,
    linewidth=0)

# Label the points with P <= 0.01. Rows are walked directly because the frame
# is indexed by protein name, so integer [] lookups would rely on a deprecated
# positional fallback.
for fold_change, neg_log_p, protein_name in zip(
        log2vsWT['log2vsWT'], log2vsWT['-log10(P value)'], log2vsWT['Protein']):
    if neg_log_p < 2:
        continue  # Don't label P>0.01 dots.

    p1.text(fold_change+0.01, neg_log_p,
         protein_name, horizontalalignment='left',
         size='small', color='black')

fig.set_figheight(5)
fig.set_figwidth(5)
fig.savefig('/Users/dp/pma/dataAndScripts/clip/figs/volcano_xl_effect_of_mutations.pdf')
plt.show()
plt.clf()
plt.close()

In [None]:
################################################
# Get p values.
################################################
importlib.reload(xlLoader)
xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
(df, recurrent) = xlLoad.load()
# Unlike _loady(), no fmol filter is applied here.
xl = recurrent.loc[recurrent['Label']=='% XL (minimal region)',:].copy()
xlLoad.as_fraction_of_wt(xl)

# Sets 'P value (XL)' column
mut_to_pval = xlLoad.add_pvalues_using_ttest(xl, return_dict=True)
print(mut_to_pval)

#n_muts = len([x for x in mut_to_pval if (' ' in x)])
#bonferroni_mut_to_pval = {k:v*n_muts for k,v in mut_to_pval.items()}

# Reformat into a single-row frame with one column per key of mut_to_pval.
xl_pval_df = pandas.DataFrame.from_dict(mut_to_pval, 'index').T
xl_pval_df.index = ['P value (XL)']
print(xl_pval_df)

# Same single-row layout for the effect sizes returned by the loader.
effect_sizes = xlLoad.effect_sizes(xl)
xl_effect_df = pandas.DataFrame.from_dict(effect_sizes, 'index').T
xl_effect_df.index = ['Effect size (XL)']

In [None]:
importlib.reload(xlLoader)
from statistics import NormalDist
from scipy.stats import norm

def confidence_interval(data, confidence=0.95):
    """Return a normal-approximation confidence interval for the mean of ``data``.

    Adapted from
    https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data

    Args:
        data: Sample values; must support ``len()``.
        confidence: Two-sided confidence level.

    Returns:
        ``(lower, upper)`` around the sample mean, or the scalar ``np.nan``
        when fewer than two values are given.
        NOTE(review): the half-width divides the sample standard deviation by
        sqrt(n - 1), not sqrt(n); confirm this is intended.
    """
    n_obs = len(data)
    if n_obs < 2:
        return np.nan
    fitted = NormalDist.from_samples(data)
    z_score = NormalDist().inv_cdf((1 + confidence) / 2.)
    half_width = fitted.stdev * z_score / ((n_obs - 1) ** .5)
    center = fitted.mean
    return center - half_width, center + half_width


def get_xl_pval_df_from_ttest_vs_wt(xl):
    """Build a one-row table of per-protein t-test results.

    For every mutant (a 'Protein' name containing a space) a one-sample t-test
    of its 'Value vs WT' values against 0 is run, and log10 of the p-value is
    stored. Every name without a space is assigned 1.0.

    Args:
        xl: DataFrame with 'Protein' and 'Value vs WT' columns.

    Returns:
        A DataFrame with the single row 'P value' and one column per protein:
        mutants first, in set iteration order, then the remaining names.
        NOTE(review): mutant entries are log10(p) while the others are the
        plain value 1.0, and the row is labelled 'P value'; confirm the
        intended scale and the null value of 0.
    """
    log_pvalue_by_protein = {}

    mutant_names = set(name for name in xl.Protein if ' ' in name)
    for mutant in mutant_names:
        is_mutant_row = [x == mutant for x in xl.Protein]
        ratios = xl.loc[is_mutant_row, 'Value vs WT'].to_numpy()
        ttest = scipy.stats.ttest_1samp(ratios, 0.)
        log_pvalue_by_protein[mutant] = np.log10(ttest.pvalue)

    for name in xl.Protein:
        if ' ' not in name:
            log_pvalue_by_protein[name] = 1.

    xl_pval_df = pandas.DataFrame.from_dict(log_pvalue_by_protein, 'index').T
    xl_pval_df.index = ['P value']
    return xl_pval_df


def sorted_gids(gid, xl):
    """Return the group IDs of one protein, ordered by experiment then replicate.

    Args:
        gid: Group ID tuple; only ``gid[1]`` (the protein name) is used.
        xl: DataFrame with 'Protein' and 'Group_ID' columns, where each
            'Group_ID' is an ``(exp, protein, rep)`` tuple.

    Returns:
        The distinct 'Group_ID' tuples of rows whose 'Protein' equals
        ``gid[1]``. They are sorted by the number after 'xp' in ``exp`` and
        then by the number after the last space in ``rep``.
    """
    _df = xl.loc[[bool(x==gid[1]) for x in xl.Protein], :]
    gids = list(set(_df.loc[:, 'Group_ID']))

    def _order(group_id):
        exp, _protein, rep = group_id
        exp_number = float(str(exp).split('xp')[-1])
        rep_suffix = str(rep).split(' ')[-1]
        rep_number = float(rep_suffix) if rep_suffix else 0.0
        # Compare the two numbers separately. Folding them into one
        # "exp.rep" decimal would put replicate 10 before replicate 2.
        return (exp_number, rep_number)

    return sorted(gids, key=_order)

def sorted_all_gids_with_the_same_wt_protein(protein, xl):
    """Return the group IDs of a WT protein and all its mutants, ordered by experiment.

    Args:
        protein: WT or mutant name; only the text before the first space is used.
        xl: DataFrame with 'Protein' and 'Group_ID' columns, where each
            'Group_ID' is an ``(exp, protein, rep)`` tuple.

    Returns:
        The distinct 'Group_ID' tuples of every row whose 'Protein' starts
        with the same WT name, sorted by the number after 'xp' in ``exp``.
        Ties within one experiment are broken by the text of ``rep`` and then
        of the protein name, so the order is the same on every run.
    """
    wt_name = protein.split(' ')[0]
    _df = xl.loc[[bool(x.split(' ')[0] == wt_name) for x in xl.Protein], :]
    gids = list(set(_df.loc[:, 'Group_ID']))

    def _order(group_id):
        exp, member, rep = group_id
        # The experiment number alone leaves same-experiment entries in set
        # iteration order, which varies between interpreter runs.
        return (float(exp.split('xp')[-1]), str(rep), str(member))

    return sorted(gids, key=_order)


def define_all_batches(xl):
    """Add a 'Batch' column to ``xl`` in place.

    For every distinct 'Protein', ``batches_for_a_protein`` supplies a mapping
    from (Exp, Rep) to a batch number. Each row is then assigned the number
    found under its own protein and its own (Exp, Rep) pair. Nothing is
    returned.
    """
    batch_lookup = {
        name: batches_for_a_protein(name, xl) for name in set(xl['Protein'])}

    batch_numbers = []
    for name, exp, rep in zip(xl.Protein, xl.Exp, xl.Rep):
        batch_numbers.append(batch_lookup[name][exp, rep])
    xl['Batch'] = batch_numbers
    
def batches_for_a_protein(protein, xl):
    """Number the (exp, rep) pairs of a protein family, starting at 1.

    The pairs are taken from ``sorted_all_gids_with_the_same_wt_protein`` in
    the order it returns them. Repeated pairs keep their first position.

    Returns:
        A dict mapping each ``(exp, rep)`` pair to its batch number.
    """
    ordered_gids = sorted_all_gids_with_the_same_wt_protein(protein, xl)
    # dict.fromkeys drops repeats while keeping first-seen order.
    unique_pairs = dict.fromkeys((_exp, _rep) for _exp, _prot, _rep in ordered_gids)
    return {pair: number for number, pair in enumerate(unique_pairs, start=1)}

def batch1(gid, xl):
    """Return the rows of ``xl`` for protein ``gid[1]`` whose 'Batch' equals 1."""
    keep = []
    for prot, batch in zip(xl.Protein, xl.Batch):
        keep.append(bool(prot == gid[1] and batch == 1))
    return xl.loc[keep, :]
    
def subtract_pair_from_batch1(gid, xl):
    """Return this group's 'vs_protein_mean' minus the same protein's batch-1 value.

    The first row matching ``gid`` in 'Group_ID' supplies the value, and the
    first row returned by ``batch1(gid, xl)`` supplies the reference. When no
    batch-1 row exists, a message is printed and ``np.nan`` is returned.
    An IndexError is raised if no row has this ``gid``.
    """
    own_rows = [groupid == gid for groupid in xl.Group_ID]
    own_value = xl.loc[own_rows, 'vs_protein_mean'].to_numpy()[0]

    reference_rows = batch1(gid, xl)
    if not len(reference_rows.index):
        print(f"No batch1 found for {gid}")
        return np.nan
    return own_value - reference_rows['vs_protein_mean'].to_numpy()[0]

def pair_batches(xl):
    """Pair WT and mutant '-batch1' values from the rows outside batch 1.

    For each mutant (a 'Protein' name containing a space) among the rows with
    'Batch' != 1, its '-batch1' values are matched by position with those of
    its WT protein (the text before the first space). Pairs containing NaN
    are skipped, and mutant values beyond the number of WT values are dropped.
    NOTE(review): pairing is by row position, not by batch label; confirm the
    WT and mutant rows are in the same batch order.

    Returns:
        A list of ``[wt_value, mutant_value]`` lists.
    """
    later_batches = xl.loc[[batch != 1 for batch in xl.Batch], :]
    mutant_names = [name for name in set(later_batches['Protein']) if ' ' in name]

    pairs = []
    for mutant in mutant_names:
        wt_name = mutant.split(' ')[0]
        mutant_vals = later_batches.loc[
            [x == mutant for x in later_batches.Protein], '-batch1'].to_numpy()
        wt_vals = later_batches.loc[
            [x == wt_name for x in later_batches.Protein], '-batch1'].to_numpy()
        # zip stops at the shorter array, so surplus mutant values are ignored.
        for wt_val, mutant_val in zip(wt_vals, mutant_vals):
            if np.isnan(wt_val) or np.isnan(mutant_val):
                continue
            pairs.append([wt_val, mutant_val])
    return pairs

xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
(df, recurrent) = xlLoad.load()
xl = recurrent.loc[recurrent['Label']=='% XL (minimal region)',:].copy()
xlLoad.as_fraction_of_wt(xl)

# To investigate batch effects, define batches.
define_all_batches(xl)

# Protein -> mean 'Value'. It is computed here, before its first use, so the
# cell does not rely on a variable left behind by a previous run.
mean_value_by_protein = xl.groupby(by=['Protein'])['Value'].mean().to_dict()

# Each value divided by the mean of its WT protein. The denominator is floored
# at 0.001 to avoid dividing by zero.
xl['vs_protein_mean'] = [
    val / max([0.001, mean_value_by_protein[p.split(' ')[0]]])
    for val, p in zip(xl['Value'], xl['Protein'])]

# Use batch definitions to get the change in signal from batch 1 for each protein.
xl['-batch1'] = [subtract_pair_from_batch1(gid, xl) for gid in xl.Group_ID]

# Pair (WT, MUT) for each batch for each protein (of -batch1 values, batch1 itself not included).
arr = pair_batches(xl)

# Create df for sns.lmplot. The column names are given to the constructor so
# an empty pair list still yields the expected columns.
arrdf = pandas.DataFrame(arr, columns=['WT', 'MUT'])

# Get the parameters for a linear regression.
x=[pair[0] for pair in arr]
y=[pair[1] for pair in arr]
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
print(f"slope={slope}, intercept={intercept}, R={r_value}, p={p_value}, std_err={std_err}, R^2={r_value**2}")

# sns.lmplot the -batch1 values.
sns.lmplot(x='WT', y='MUT', data=arrdf)
plt.show()
plt.clf()
plt.close()

print("---*-*--- "*10)


print(xl.head())

# Get a Protein->average value dict.
# NOTE(review): `_` is a module-level name here, and in IPython `_` also
# refers to the last cell output.
_ = xl.groupby(by=['Protein'])['Value'].mean().to_dict()

# Subset to proteins with a sufficiently high WT protein mean value.
xl = xl.loc[[_[p]>0.1 for p in xl['WT protein']], :]

# Get val/(WT mean) for each protein; the denominator is floored at 0.001.
xl['vs_protein_mean'] = [(val)/max([0.001, _[p.split(' ')[0]]]) for val, p, exp in zip(xl['Value'], xl['Protein'], xl.Exp)]

#xl['-batch1'] = np.log2(xl['-batch1'])
# From here on 'vs_protein_mean' holds log2 ratios; rows with non-finite values are dropped.
xl['vs_protein_mean'] = np.log2(xl['vs_protein_mean'])
#xl = xl.loc[[np.isfinite(x) for x in xl['-batch1']], :]
xl = xl.loc[[np.isfinite(x) for x in xl['vs_protein_mean']], :]

# Define batch_ID column: the Group_IDs of the row's protein followed by those of its WT protein.
q = xl.groupby(by=['Protein'])['Group_ID']
print(q.get_group('A1CF').to_list())
xl['batch_ID'] = [q.get_group(x).to_list() + q.get_group(x.split(' ')[0]).to_list() for x in xl.Protein]
print(xl.head())

# Keep log2 ratios strictly between -2 and 2 to remove outliers, get the mean and
# std dev, and draw 1000 samples from a normal distribution with those parameters.
# NOTE(review): no random seed is set, so the samples differ between runs.
vm = xl['vs_protein_mean'].to_numpy()
vm = vm[vm<2]
vm = vm[vm>-2]
mean, stddev = (np.mean(vm), np.var(vm)**0.5)
fit_norm_vals = np.random.normal([np.mean(vm)], [np.var(vm)**0.5], size=1000).T

# Plot the log2 ratios, including outliers.
ax = sns.distplot(xl['vs_protein_mean'], kde=True)

# Print mean/std dev.
print(f"mean={mean}, stddev={stddev}")
print('-~()~- ' * 10)

# How one could calculate a P value.
#a = norm(loc=mean, scale=stddev).cdf([1.3, 1.3])
#print(np.product([1-v for v in a]))

# Plot a normal fit to the sampled values on the same axes (no histogram, no KDE).
sns.distplot(fit_norm_vals, ax=ax, hist=False, fit=norm, kde=False)
plt.show()
plt.clf()
plt.close()


In [None]:

# Reload the local loader modules so edits are picked up without a kernel restart.
importlib.reload(stabilityLoader)
importlib.reload(xlLoader)
input_filename = '/Users/dp/pma/dataAndScripts/clip/experiments/Exp85_missense_and_stability_trial_using_control_vector/Exp85_missense.xlsx'
input_sheet = 'Stability'

# Protein order reused for the columns of every single-row table built below.
proteins_ordered_by_xl_rate = xlLoader.xlLoader().proteins_ordered_by_xl_rate('/Users/dp/pma/percentCrosslinked.xlsx')

df = stabilityLoader.load_data(input_filename, input_sheet)
_stats = stabilityLoader.get_stats(df)

# NOTE(review): the selections below read 'pval' and '# stability replicates'
# as columns, so _stats presumably yields one row per mutation; confirm in
# stabilityLoader.get_stats.
sdf = pandas.DataFrame(_stats)
sdf['Mutation'] = sdf.index
print(sdf)

# Single-row table of log10 p-values, columns in XL-rate order.
# A KeyError is raised if an ordered protein is missing from sdf.
stability_pvals = sdf.loc[:, ['pval']].T
stability_pvals = np.log10(stability_pvals)
stability_pvals.index = ['log10 Stability pval']
stability_pvals = stability_pvals.loc[:, proteins_ordered_by_xl_rate]

# Single-row table of stability replicate counts in the same column order.
stability_n_reps = sdf.loc[:, ['# stability replicates']].T
stability_n_reps.index = ['# stability replicates']
stability_n_reps = stability_n_reps.loc[:, proteins_ordered_by_xl_rate]

# Writes to the current working directory.
stability_n_reps.T.to_excel('temp.xlsx')
print(stability_pvals)

In [None]:
# Relies on input_filename and input_sheet defined in the preceding cell.
importlib.reload(stabilityLoader)
df = stabilityLoader.load_data(input_filename, input_sheet)

# Mean normalized abundance per protein, then its log2.
g = df.groupby(['Protein'])['Abundance/WT normalized by image'].mean()
g = pandas.DataFrame(g)
g['log2 Protein stability'] = np.log2(g.loc[:, 'Abundance/WT normalized by image'])

# Proteins become columns; only the log2 row is kept, in XL-rate order.
stabiliy_signal = g.T.copy()

proteins_ordered_by_xl_rate = xlLoader.xlLoader().proteins_ordered_by_xl_rate('/Users/dp/pma/percentCrosslinked.xlsx')
stabiliy_signal = stabiliy_signal.loc[['log2 Protein stability'], proteins_ordered_by_xl_rate]
#stabiliy_signal.loc['Protein stability', :] = np.nan_to_num(stabiliy_signal.loc['Protein stability', :])

# NOTE(review): plt.clf() acts on the current figure before a new one is
# created on the next line.
plt.clf()
fig = plt.figure()
# Colour scale limited to -1..1 on the log2 values.
q = sns.heatmap(stabiliy_signal, xticklabels=True, cmap='RdBu_r',
                vmin=-1, vmax=1,
                #cbar_kws={"orientation": "horizontal"}
               )
q.set_xticklabels(q.get_xmajorticklabels(), fontsize = 10)
fig.set_figwidth(12)
fig.set_figheight(3)

# The figure is only displayed; this cell does not save it.
plt.show()
plt.clf(); plt.close()

In [None]:
"""
        df.loc[:,'WT protein'] = [x.split(' ')[0] for x in df.loc[:, 'Protein']]
        df.loc[:,'WT rep mean'] = [cls.get_wt_rep_mean(protein, rep, df, values_label) for (protein, rep) in zip(df.loc[:,'WT protein'], df.loc[:,'Exp'])]
        df.loc[:,'Value vs WT'] = [x/max([y, 10**-6]) for x,y in zip(df['Value'], df['WT rep mean'])]
        df.loc[:,'log2 Value vs WT'] = np.log2(df.loc[:, 'Value vs WT'].tolist())
        df.loc[:,'N reps'] = [cls.get_n_replicates(protein, df, values_label) for protein in df.loc[:,'WT protein']]
"""
# This cell reuses xl_pval_df, xl_effect_df, stability_n_reps and
# stabiliy_signal from earlier cells; run those first.
importlib.reload(xlLoader)
importlib.reload(hm)

xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
(df, recurrent) = xlLoad.load()

xl = recurrent.loc[[label=='% XL (minimal region)' for label in recurrent['Label']], :].copy()
xlLoad.as_fraction_of_wt(xl)

proteins_ordered_by_xl_rate = xlLoader.xlLoader().proteins_ordered_by_xl_rate('/Users/dp/pma/percentCrosslinked.xlsx')
# no_dups is a second name for the same list object, so the pop() below
# removes 'PCBP1 (uORF)' from both.
no_dups = proteins_ordered_by_xl_rate
if 'PCBP1 (uORF)' in proteins_ordered_by_xl_rate:
    proteins_ordered_by_xl_rate.pop(proteins_ordered_by_xl_rate.index('PCBP1 (uORF)'))

# One matrix per measurement label, columns ordered by no_dups.
pmol_protein = hm.subset_by_label_and_convert_to_matrix(recurrent, label='pmol protein', order_by=no_dups)
fmol_rna_min_region = hm.subset_by_label_and_convert_to_matrix(recurrent, label='fmol RNA (minimal region)', order_by=no_dups)
perc_xl_whole_lane = hm.subset_by_label_and_convert_to_matrix(recurrent, label='% XL (whole lane)', order_by=no_dups)
per_xl_min = hm.subset_by_label_and_convert_to_matrix(recurrent, label='% XL (minimal region)', order_by=no_dups)

#deltas = subset_by_label_and_convert_to_matrix(xl, label='log2 Value vs WT', xcol='Protein', ycol='Value', order_by=no_dups)
#rep_nums = subset_by_label_and_convert_to_matrix(recurrent, label='N reps', xcol='Protein', ycol='Value', order_by=no_dups)

deltas = hm.convert_to_matrix(xl, ycol='log2 Value vs WT')
rep_nums = hm.convert_to_matrix(xl, ycol='# XL replicates')

# Blank the log2 fold change for WT columns (names without a space).
for col in [_ for _ in deltas.columns if (' ' not in _)]:
    deltas.loc['log2 Value vs WT', col] = np.nan

#print(proteins_ordered_by_xl_rate)

# Put every single-row table into the same column order.
#per_xl_min = per_xl_min[proteins_ordered_by_xl_rate]
deltas = deltas[proteins_ordered_by_xl_rate]
rep_nums = rep_nums[proteins_ordered_by_xl_rate]
xl_pval_df = xl_pval_df[proteins_ordered_by_xl_rate]
xl_effect_df = xl_effect_df[proteins_ordered_by_xl_rate]

# Report any column mismatch between the two tables.
cols1 = set(deltas.columns)
cols2 = set(per_xl_min.columns)
print("In per XL only: ", cols2-cols1)
print("In deltas only: ", cols1-cols2)
print(xl_effect_df)


# Ten heatmap rows; the narrow right-hand column holds each row's colour bar.
grid_kws = {"width_ratios": (.9, .1), "wspace": .05}
fig, ax = plt.subplots(10, 2, gridspec_kw=grid_kws)
#print(fmol_rna_min_region)

def cell_text(g, lower, upper, round_to_int=True, decimals=1):
    """Rewrite heatmap annotations for display and rotate them by 90 degrees.

    Each annotation in ``g.texts`` is parsed as a float and handled by the
    first rule that applies:
      * below ``lower``: text becomes '<{lower}';
      * above ``upper``: text becomes '>{upper}' ('> {upper}' if ``upper`` is
        negative);
      * strictly between 0 and 1: left unchanged;
      * ``round_to_int`` is true and ``decimals`` is non-zero: rounded to
        ``decimals`` places;
      * ``round_to_int`` is true and ``decimals`` is zero: rounded to an integer;
      * otherwise: left unchanged.

    Args:
        g: Object with a ``texts`` sequence of annotation objects.
        lower: Lower display bound.
        upper: Upper display bound.
        round_to_int: Whether in-range values are rounded at all.
        decimals: Number of decimal places; 0 selects integer rounding.
    """
    for annotation in g.texts:
        value = float(annotation.get_text())

        if value < lower:
            replacement = f'<{lower}'
        elif value > upper:
            replacement = f'> {upper}' if upper < 0 else f'>{upper}'
        elif (0 < value < 1) or not round_to_int:
            replacement = None  # keep the existing text
        elif decimals != 0:
            replacement = round(value, decimals)
        else:
            replacement = int(round(value))

        if replacement is not None:
            annotation.set_text(replacement)
        annotation.set_rotation(90)
        
# Row 0: fmol RNA in the minimal region.
g = sns.heatmap(fmol_rna_min_region, ax=ax[0][0], xticklabels=False, cmap='binary', vmin=0,
                annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[0][1])
cell_text(g, 1, 99)

# Row 1: pmol protein.
g = sns.heatmap(pmol_protein, ax=ax[1][0], xticklabels=False, cmap='binary', vmin=0, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[1][1])
cell_text(g, 1, 9)

# Row 2: % XL (whole lane).
g = sns.heatmap(perc_xl_whole_lane, ax=ax[2][0], xticklabels=False, cmap='binary', vmin=0, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[2][1])
cell_text(g, 0.1, 9)

# Row 3: % XL (minimal region).
g = sns.heatmap(per_xl_min, ax=ax[3][0], xticklabels=False, cmap='binary', vmin=0, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[3][1])
cell_text(g, 0.1, 9)

# Row 4: number of XL replicates.
g = sns.heatmap(rep_nums, ax=ax[4][0], xticklabels=False, cmap='binary', vmin=0, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[4][1])
cell_text(g, 0, 9, decimals=0)

# Stability N reps.
g = sns.heatmap(stability_n_reps, ax=ax[5][0], xticklabels=False, cmap='binary', vmin=0, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[5][1])
cell_text(g, 0, 9, decimals=0)

# This is log2 vs WT stability.
g = sns.heatmap(stabiliy_signal, ax=ax[6][0], xticklabels=False, cmap='RdBu_r', vmin=-1, vmax=1, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[6][1])
cell_text(g, -3, 3)

# Effect sizes % XL
# NOTE(review): the log10 copy is computed but the untransformed xl_effect_df
# is what gets plotted, against a -4..-1 colour range; confirm which was intended.
log_xl_effect_df = xl_effect_df.copy()
log_xl_effect_df.loc['Effect size (XL)', :] = np.log10(log_xl_effect_df.loc['Effect size (XL)', :])
g = sns.heatmap(xl_effect_df, ax=ax[7][0], xticklabels=False, cmap='RdBu_r', vmin=-4, vmax=-1, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[7][1])
cell_text(g, -10, 10, round_to_int=True)

# XL p value.
# NOTE(review): as above, the log10 copy is computed but the raw p-values are
# plotted against a -4..-1 colour range; confirm which was intended.
log_xl_pval = xl_pval_df.copy()
log_xl_pval.loc['P value (XL)', :] = np.log10(log_xl_pval.loc['P value (XL)', :])
g = sns.heatmap(xl_pval_df, ax=ax[8][0], xticklabels=False, cmap='binary_r', vmin=-4, vmax=-1, annot=True,
                cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[8][1])
cell_text(g, 0.01, 0.2,round_to_int=False)

# This is log2 vs WT. It is the bottom row and the only one with x tick labels.
g = sns.heatmap(deltas, ax=ax[9][0], xticklabels=True, cmap='RdBu_r', vmin=-2, vmax=2,
                annot=True, #fmt="d",
               cbar_kws={"orientation": "horizontal"}, cbar_ax=ax[9][1])
cell_text(g, -3, 3)

# Clear the x-axis label on every heatmap row except the last.
for _ax in ax[:-1]:
    _ax[0].set_xlabel('')

for _ax in ax:
    _ax[0].set_yticklabels(_ax[0].get_yticklabels(), rotation='horizontal')

# Restyle the protein names on the bottom row, which is the row that has ticks.
# Row 8 was drawn with xticklabels=False and has none.
ax[-1][0].set_xticklabels(g.get_xmajorticklabels(), fontsize = 10)
fig.set_figwidth(12)
fig.set_figheight(5)

plt.savefig(f'{pma_dir}/dataAndScripts/clip/figs/xl_rate_heatmap.pdf')
plt.show(); plt.clf(); plt.close()



In [None]:
import heatmaper as hm
import xlLoader
importlib.reload(xlLoader)
importlib.reload(hm)

xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
(df, recurrent) = xlLoad.load()
print(df.head())
# Unlike load_general_data_from_literature(), this read keeps rows marked
# 'Skip' and applies no lolipop filter. It rebinds the module-level `info`.
info = pandas.read_excel('/Users/dp/pma/RBP missense mutations/General_data_from_literature.xlsx')
#info = info.loc[[pandas.isna(x) for x in info.Skip], :]
info.index = info.loc[:, 'Protein']
#info_dict = info.to_dict()

# Uses the full table `df`, not `recurrent`.
xl = df.loc[[label=='% XL (minimal region)' for label in df['Label']], :].copy()
#xlLoad.as_fraction_of_wt(xl)

#proteins_ordered_by_xl_rate = xlLoader.xlLoader().proteins_ordered_by_xl_rate('/Users/dp/pma/percentCrosslinked.xlsx')
#print(proteins_ordered_by_xl_rate)
# Remove PCBP1 (uORF)
#if 'PCBP1 (uORF)' in proteins_ordered_by_xl_rate:
#    proteins_ordered_by_xl_rate.pop(proteins_ordered_by_xl_rate.index('PCBP1 (uORF)'))

# Proteins excluded from this plot. The active list holds only 'TDRKH';
# 'EPB41L5' and 'CCIN' are commented out.
to_remove = ['TDRKH']#'EPB41L5', 'CCIN']
xl = xl.loc[[x not in to_remove for x in xl.Protein], :]

# Put in a simplified form.
per_xl_min = hm.subset_by_label_and_convert_to_matrix(
    xl, label='% XL (minimal region)')#, order_by=proteins_ordered_by_xl_rate)
print(per_xl_min, '______')

# Put proteins as the index instead of columns.
raws_with_rbd = per_xl_min.copy().T

# Remove proteins with names not in the general info file (removes mutants).
raws_with_rbd = raws_with_rbd.loc[[ bool(x in info.index) for x in raws_with_rbd.index], :]

# Add a column of RBD domains from the info excel file.
raws_with_rbd.loc[:, 'RBD?'] = [info.loc[x, 'Has RBD?'] for x in raws_with_rbd.index]
raws_with_rbd.loc[:, 'Direct RBP?'] = [info.loc[x, 'Direct RBP?'] for x in raws_with_rbd.index]
#print(raws_with_rbd, '---'*20)
#print(raws_with_rbd['Direct RBP?'].value_counts())

# Writes to the current working directory (same file name as the stability cell).
raws_with_rbd.to_excel('./temp.xlsx')

# Plot on a log10 scale; one sub-table per 'Direct RBP?' category.
raws_with_rbd['% XL (minimal region)'] = np.log10(raws_with_rbd['% XL (minimal region)'])
sub = {}
for cat in set(raws_with_rbd['Direct RBP?']):
    sub[cat] = raws_with_rbd.loc[[x==cat for x in raws_with_rbd['Direct RBP?']], :]
# NOTE(review): colours are assigned by iteration order over a set, and only
# three are listed; more than three categories would raise IndexError.
colors = ['red', 'k', 'purple']#np.random.random((len(sub),3))

plt.figure(figsize=(4,4))

# Plot: one swarm per 'Direct RBP?' category, overlaid on the same axes.
# NOTE(review): `order` is built from a set, so the x-axis category order is
# not fixed between runs.
for n, (cat, _df) in enumerate(sub.items()):
    ax = sns.swarmplot(
        x='RBD?', y='% XL (minimal region)', data=_df, color=colors[n], order=list(set(info['Has RBD?'])),
        alpha=0.5,
    )

# Reference line at log10 = -1, i.e. 0.1 % XL.
ax.axhline(y=-1, c='k', lw=0.4)
ax.set(xlabel='', ylabel='log10 % XL (minimal region)')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
#ax.set_title('')
sns.despine()
plt.savefig(f'{pma_dir}/dataAndScripts/clip/figs/xl_rate_swarmplot.svg')
plt.show()
plt.clf(); plt.close()


In [None]:
import dabest
importlib.reload(xlLoader)
importlib.reload(hm)
from pprint import pprint

xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
(df, recurrent) = xlLoad.load()
# Drop rows whose Category is 'Recurrent, MUT'.
df = df.loc[[x!='Recurrent, MUT' for x in df.Category], :]

# NOTE(review): the result is assigned back to df although inplace=True is
# passed; this presumably relies on the method returning the frame. Confirm.
df = xlLoad.add_total_fmol_column(df=df, inplace=True)

# Hand-curated protein classes used by swarmplot_categories() to label each protein.
# NOTE(review): 'NUFIP' here versus 'NUFIP1' in lolipop_proteins; confirm which
# spelling the data uses.
known_rbps = [
    'U2AF1', 
    'PCBP1',
    'eIF4H',
    'SF3B1',
    'DDX3X', 'NOVA1', 'DDX50', 'RBM39', 'RPL5',   
    #'RARS2',
    'YTHDC2', 'FUBP1',  'KHDRBS2',   'NUFIP',  'A1CF', 
    'SRSF2', 'RBM11', 'DICER1', 
    'RBFOX1',

    'hnRNP C', 'FBL',
    'CELF1', 'hnRNP D', 'STAU1',
    'DHX21', #'NSUN2'
]

# 'SMAD4' appears twice; the list is only used for membership tests.
putative = [
    'SMAD3', 'SMAD4', #'BRCA1', 'BARD1',
    'CNOT9', 'HNRNPCL1', 'CNOT1', 'EIF1AX',
    'PABPC4L', 'CRNKL1', 'DCP1B', 'BCLAF1',
    'SMAD4',
]

non_rbp = [
    'CAPNS2','CCIN',
    'CDK4','CHMP3','DCTN6','EGFP',
    'EPB41L5',
    'ETS2','IDE','ITPA','TPGS2','UBA2'
]

# Proteins labelled 'Incertae sedis' by swarmplot_categories().
incertae_sedis = ['TDRKH', 'EEF1B2']

print(df.head(1))

#xl = df.loc[[label=='fmol RNA (minimal region)' for label in df.Label], :]

def swarmplot_categories(
    xl, fig_name='XL_rate_min_region_swarmplot_RBP_vs_nonRBP.pdf', ylabel='Value',
    log10=True):
    """Plot the per-protein mean of `xl['Value']` for non-RBPs vs. known RBPs.

    Values are averaged per protein, optionally log10-transformed, floored at -3,
    and passed to dabest for a mean-difference estimation plot.

    Relies on the module-level lists `known_rbps`, `putative`, `non_rbp` and
    `incertae_sedis`, on `pma_dir`, and on the `dabest` package being imported.

    Args:
        xl: DataFrame with at least the columns 'Protein', 'Category' and 'Value'.
        fig_name: File name of the figure written under
            `{pma_dir}/dataAndScripts/clip/figs/`.
        ylabel: Y-axis label; also used as the name of the column handed to dabest.
        log10: If true, take log10 of the per-protein means before plotting.

    Returns:
        None.

    Side effects:
        Prints debugging output, overwrites 'test.xlsx' in the working directory
        with the plotted table, saves the figure and shows it.
    """

    def _classify(protein):
        # First matching list wins, in this fixed order of precedence.
        for class_name, members in (
                ('RBP', known_rbps),
                ('Putative', putative),
                ('non-RBP', non_rbp),
                ('Incertae sedis', incertae_sedis)):
            if protein in members:
                return class_name
        return 'Mutant?'

    # One category per protein. The first one in row order is taken so the result
    # is deterministic even when a protein carries more than one category.
    to_cat = xl.groupby(by=['Protein'])['Category'].first().to_dict()

    # NOTE(review): this lists proteins by the table's own 'Category' column, not by
    # the manual `non_rbp` list -- confirm that is what the debug output should show.
    pprint([protein for protein, category in to_cat.items() if category == 'non-RBP'])

    # Per-protein mean across replicates.
    a = pandas.DataFrame(xl.groupby(by=['Protein'])['Value'].apply(np.mean))
    a['Category'] = [to_cat[protein] for protein in a.index]
    a['is_RBP'] = [_classify(protein) for protein in a.index]
    print(ylabel, '------->')

    if log10:
        a['Value'] = np.log10(a['Value'])

    # Drop undefined values; copy so the assignments below act on an independent frame.
    a = a.loc[a['Value'].notna().values, :].copy()
    # Floor at -3, which also maps log10(0) = -inf onto the floor.
    a['Value'] = a['Value'].clip(lower=-3, upper=1E6)
    a[ylabel] = a['Value']

    # 'Incertae sedis' and unclassified ('Mutant?') proteins are excluded.
    a = a.loc[a['is_RBP'].isin(['RBP', 'non-RBP', 'Putative']).values, :]
    a = a.sort_values(by='is_RBP', ascending=False)
    # Debug dump of exactly what is plotted.
    a.to_excel('test.xlsx')

    ##########
    # Plotting.

    my_pal = {
        'RBP': 'red',
        'non-RBP': 'purple',
    }
    # Only the two groups named in `idx` are compared; 'Putative' rows stay in the
    # table and in the Excel dump.
    dabest_xl = dabest.load(
        data=a, x='is_RBP', y=ylabel, idx=('non-RBP', 'RBP'))

    fig = dabest_xl.mean_diff.plot(custom_palette=my_pal)
    # Reference line at log10(0.1) = -1.
    # NOTE(review): drawn for every metric and also when log10=False -- confirm intent.
    fig.axes[0].axhline(y=np.log10(0.1), xmin=0, xmax=1, c='k')
    fig.set_figwidth(3)
    fig.set_figheight(3)
    # Save through the figure object rather than the implicit current figure.
    fig.savefig(f"{pma_dir}/dataAndScripts/clip/figs/{fig_name}")
    plt.show()
    plt.close(fig)

    
# One RBP vs. non-RBP panel per measurement: (Label value in `df`, output file, y-axis label).
# Every panel is log10-transformed, so every y-axis label carries the 'log10 ' prefix.
swarmplot_panels = [
    ('fmol RNA (whole lane)',
     'fmol_RNA_whole_lane_swarmplot_RBP_vs_nonRBP.pdf',
     'log10 fmol RNA (whole lane)'),
    ('% XL (minimal region)',
     'XL_rate_min_region_swarmplot_RBP_vs_nonRBP.pdf',
     'log10 % XL (minimal region)'),
    ('% XL (whole lane)',
     'XL_rate_whole_lane_swarmplot_RBP_vs_nonRBP.pdf',
     'log10 % XL (whole lane)'),
    ('fmol RNA (minimal region)',
     'fmol_RNA_min_region_swarmplot_RBP_vs_nonRBP.pdf',
     'log10 fmol RNA (minimal region)'),
    ('pmol protein',
     'pmol_protein_swarmplot_RBP_vs_nonRBP.pdf',
     'log10 pmol protein'),
]

for panel_label, panel_fig_name, panel_ylabel in swarmplot_panels:
    xl = df.loc[[label == panel_label for label in df.Label], :]
    swarmplot_categories(xl, fig_name=panel_fig_name, ylabel=panel_ylabel, log10=True)


# % XL (minimal region) of the `recurrent` table, passed to the loader's
# as_fraction_of_wt(); left as the last expression so its result is the cell output.
xl = recurrent.loc[[label == '% XL (minimal region)' for label in recurrent['Label']], :]
xlLoad.as_fraction_of_wt(xl)




In [None]:
from pprint import pprint

# Reload the local helper modules so that edits made on disk take effect.
importlib.reload(xlLoader)
importlib.reload(hm)

# Spot-check the total-fmol lookup on a single entry of a freshly loaded table.
xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
df, recurrent = xlLoad.load()
# The key presumably reads (experiment, protein, replicate) -- confirm in xlLoader.
xlLoad.get_total_fmol(df, ('Exp57', 'PCBP1', 'Rep 1'))


In [None]:
# Swarmplot of log10 mean % XL (minimal region) per protein, grouped by RBD class.

# Per-protein mean of % XL (minimal region), recurrent mutants excluded.
xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
df, recurrent = xlLoad.load()
keep_rows = (df.Category != 'Recurrent, MUT').values & (df.Label == '% XL (minimal region)').values
df = df.loc[keep_rows, :]
df = df.groupby(by=['Protein'])['Value'].apply(np.mean).to_frame()
df = df.assign(Protein=df.index)
print(df)

# Literature annotations: relabel 'Ribosomal' as 'Other' and 'No' as 'None'
# (in every column), then index the table by protein.
info = (
    pandas.read_excel(f"{pma_dir}/RBP missense mutations/General_data_from_literature.xlsx")
    .replace(to_replace='Ribosomal', value='Other')
    .replace(to_replace='No', value='None')
    .set_index('Protein', drop=False)
)
to_rbd = dict(zip(info['Protein'], info['Has RBD?']))


#raws_with_rbd = per_xl_min.copy().T
#info_dict = info.to_dict()
# Attach the RBD class; proteins missing from the annotation table get '' and are dropped.
df['RBD?'] = [to_rbd.get(protein, '') for protein in df['Protein']]
has_annotation = (df['RBD?'] != '').values
df = df.loc[has_annotation, :]



df['Value'] = np.log10(df['Value'])
fig = plt.figure()

ax = sns.swarmplot(data=df, x='RBD?', y='Value', order=['RRM', 'KH', "Other", 'None', 'non-RBP'])
# Thin grey band centred on y = -1, i.e. 0.1 % XL on the log10 axis.
wid = 0.01
plt.axhspan(
    -1 - wid, -1 + wid,
    facecolor='0', alpha=0.5)

ax.set_xlabel('')
ax.set_ylabel('log10 % XL (minimal region)')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
sns.despine()
fig.set_figwidth(3)
plt.savefig(f'{pma_dir}/dataAndScripts/clip/figs/xl_rate_by_prot_domain_swarmplot.pdf')
plt.show()
plt.clf(); plt.close()