In [None]:
import os, re, matplotlib, pandas, collections
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
import pandas, re
import seaborn as sns
import scipy as sp
from typing import List, Union, Mapping, Tuple
# Root directory of the project data tree. The default is the path the notebook
# was written against; set the PMA_DIR environment variable to point elsewhere.
pma_dir = os.environ.get('PMA_DIR', '/Users/dp/pma/')

# Figure-export settings applied once for the whole notebook: font type 42 for
# PDF/PS output and Arial as the sans-serif family.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
})

In [None]:
def replicate_names(df: pandas.DataFrame):
    """Collapse replicate-specific protein labels to a shared base name, in place.

    For every string in ``df['Protein']``: remove any ``XL-<digit>`` token,
    normalise `` r`` to `` R``, and keep only the text before the first `` R``.
    Non-string cells (e.g. NaN from empty spreadsheet cells) are left as they are.

    Args:
        df: Frame with a 'Protein' column. The column is overwritten.

    Returns:
        The same ``df`` object, for call chaining.
    """
    def _base_name(name):
        # re.sub raises TypeError on non-strings, so pass those through untouched.
        if not isinstance(name, str):
            return name
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        without_xl_tag = re.sub(r'XL-\d', '', name)
        return re.sub(' r', ' R', without_xl_tag).split(' R')[0]

    df['Protein'] = [_base_name(x) for x in df.Protein]
    return df


def load(edit_names=True) -> pandas.DataFrame:
    """Load the 'XL' sheet of percentCrosslinked.xlsx and annotate it.

    Steps, in order:
      1. Read the 'XL' sheet from ``{pma_dir}/percentCrosslinked.xlsx``.
      2. Keep only rows whose 'Discard' cell is empty.
      3. Optionally collapse replicate names via ``replicate_names``.
      4. Sort by 'Value', largest first.
      5. Add an 'MW' column looked up by protein name in
         ``molecularWeights.xlsx`` ('MW (kDa)'); proteins absent from that
         table get 0.

    Args:
        edit_names: When true, rewrite 'Protein' with ``replicate_names``.

    Returns:
        The filtered, sorted and annotated DataFrame.
    """
    df = pandas.read_excel(f'{pma_dir}/percentCrosslinked.xlsx', sheet_name='XL')

    # Copy the filtered rows so the column writes below act on an independent
    # frame instead of a slice of the raw sheet (avoids SettingWithCopyWarning).
    df = df.loc[df.Discard.isna(), :].copy()

    if edit_names:
        df = replicate_names(df)
    df = df.sort_values(by='Value', ascending=False)

    # Add molecular weights.
    mw = pandas.read_excel(f'{pma_dir}/dataAndScripts/tables/molecularWeights.xlsx')
    to_mw = dict(zip(mw.Protein, mw['MW (kDa)']))
    df['MW'] = [to_mw.get(x, 0) for x in df.Protein]

    return df





In [None]:
# Read the 'XL' sheet of the crosslinking workbook.
df = pandas.read_excel(f'{pma_dir}/percentCrosslinked.xlsx', sheet_name='XL')

# Keep rows whose 'Discard' cell is empty.
df = df.loc[df.Discard.isna(), :]

# Count minimal-region measurements per protein and list the proteins that
# have more than one.
xl = df.loc[df.Label == '% XL (minimal region)', :]
n_reps = xl.groupby('Protein')['Value'].size()
has_reps = n_reps[n_reps > 1].index.tolist()


In [None]:
#df = load()
# Protein names that is_RBP() classifies as 'non-RBP' when the spreadsheet
# gives no usable category.
rando_names = {
    'ITPA',
    'CCIN',
    'CDK4',
    'TPGS2',
    'IDE',
    'UBA2',
    'EPB41L5',
    'DCTN6',
    'CAPNS2',
    'VPS24',
    'ETS2',
    'CHMP3',
    'EGFP',
}

# Read the 'XL' sheet of the crosslinking workbook.
df = pandas.read_excel(f'{pma_dir}/percentCrosslinked.xlsx', sheet_name='XL')

# Keep rows whose 'Discard' cell is empty.
df = df.loc[df.Discard.isna(), :]

# Proteins with more than one minimal-region measurement.
xl = df.loc[df.Label == '% XL (minimal region)', :]
n_reps = xl.groupby('Protein')['Value'].size()
has_reps = n_reps[n_reps > 1].index.tolist()

# Restrict the whole table (every label) to those replicated proteins.
df = df.loc[df.Protein.isin(has_reps), :]

def is_RBP(name, category):
    """Return the RBP class label used to colour a protein in the plots.

    Rules are checked in this order; the first match wins:
      1. 'TDRKH' and 'SMAD4' are always 'Putative RBP'.
      2. A category of 'RBP', 'Putative RBP' or 'non-RBP' is returned as-is.
      3. Names in the module-level ``rando_names`` set are 'non-RBP'.
      4. A fixed list of names is 'Unknown if direct'.
      5. Everything else is 'RBP'.

    Args:
        name: Protein name.
        category: Category value from the spreadsheet (may be a non-string).

    Returns:
        One of 'RBP', 'Putative RBP', 'non-RBP', 'Unknown if direct'.
    """
    # Name-based overrides take precedence over the spreadsheet category.
    if name in ('TDRKH', 'SMAD4'):
        return 'Putative RBP'

    if category in ('RBP', 'Putative RBP', 'non-RBP'):
        return category

    if name in rando_names:
        return 'non-RBP'

    unknown_if_direct = (
        'DCP1B', 'CNOT9', 'BCLAF1', 'CNOT1',
        'EEF1B2', 'EIF1AX', 'KPNB1', 'CRNKL1',
    )
    return 'Unknown if direct' if name in unknown_if_direct else 'RBP'

def pal(n_colors):
    """Return a reversed seaborn cubehelix palette with ``n_colors`` entries.

    Args:
        n_colors: Number of colours to generate.

    Returns:
        The colours from ``sns.cubehelix_palette`` in reverse order.
    """
    helix = sns.cubehelix_palette(
        n_colors=n_colors,
        start=0,
        gamma=3,
        rot=180,
        dark=0,
    )
    return helix[::-1]

# Protein entries excluded from the wild-type plots.
to_remove = [
    'FHH-hnRNP C', 'FHH-hnRNP C F54A',
    'PCBP1 GxxG', 'PCBP1 100P', 'PCBP1 L100P',
    'PCBP1 (uORF)', 'PCBP1 ∆KH2',
    'FBL (old method)', 'BRCA1',
]

# Drop rows whose Category is a string containing 'MUT'; rows with a
# non-string Category (e.g. NaN) are kept.
wts = df.loc[
    [not (isinstance(cat, str) and 'MUT' in cat) for cat in df['Category']], :]

# Copy after filtering so the new 'RBP?' column is written to an independent
# frame rather than to a slice of df (avoids SettingWithCopyWarning).
wts = wts.loc[~wts['Protein'].isin(to_remove), :].copy()
wts['RBP?'] = [is_RBP(x, y) for x, y in zip(wts['Protein'], wts['Category'])]

def order_by_label(df, label=None):
    """Sort ``df`` in place so proteins appear by decreasing mean 'Value'.

    The per-protein mean is taken over rows whose 'Label' equals ``label``,
    or over all rows when ``label`` is None. Each row then gets an 'order' key:

      * the protein's rank (0 = highest mean) when the name is ranked;
      * rank + 0.5 of the first space-separated word when only that word is
        ranked, which places such rows right after the matching protein;
      * -1 for non-string names and for names that match nothing.

    Args:
        df: Frame with 'Protein' and 'Value' columns (and 'Label' when
            ``label`` is given). An 'order' column is added and the rows are
            re-sorted in place.
        label: Optional 'Label' value restricting which rows define the ranking.

    Returns:
        ``(df, protein_order)``: the same frame object, and the list of ranked
        protein names from highest to lowest mean.
    """
    ranked_rows = df if label is None else df.loc[df['Label'] == label, :]
    mean_by_protein = ranked_rows.groupby(by=['Protein'])['Value'].mean().to_dict()
    protein_order = sorted(mean_by_protein, key=mean_by_protein.get, reverse=True)

    # Name -> rank lookup; replaces repeated list.index() scans (quadratic).
    rank = {name: position for position, name in enumerate(protein_order)}

    def get_order(name) -> float:
        if name in rank:
            return rank[name]
        if isinstance(name, str):
            first_word = name.split(' ')[0]
            if first_word in rank:
                return rank[first_word] + 0.5
        return -1

    df['order'] = [get_order(x) for x in df.Protein]
    # Stable sort: rows sharing an 'order' value (replicates of one protein)
    # keep their incoming relative order instead of an arbitrary one.
    df.sort_values(by=['order'], inplace=True, kind='mergesort')
    return df, protein_order

# Marker colour for each 'RBP?' class; passed as the seaborn palette in
# xl_scatterplot().
colors = dict([
    ('RBP', 'red'),
    ('non-RBP', 'purple'),
    ('Unknown if direct', 'blue'),
    ('Putative RBP', 'green'),
])
def xl_scatterplot(xl, cutoff_lines=False, grid_lines=True):
    """Scatter log10('Value') per protein on the current matplotlib axes.

    Proteins are ordered along x by decreasing mean 'Value' (via
    ``order_by_label``) and coloured by their 'RBP?' class using the
    module-level ``colors`` mapping. A one-line summary of how many proteins,
    RBPs and non-RBPs are plotted is printed.

    The input frame is not modified: ordering and the log transform are applied
    to a private copy, so calling this twice on the same frame cannot apply
    log10 twice.

    Args:
        xl: Frame with 'Protein', 'Value' and 'RBP?' columns. 'Value' must be
            positive for log10 to be finite.
        cutoff_lines: When true, draw a darker band at y = -1 (log10 of 0.1)
            and label it as the RBP cutoff.
        grid_lines: When true, draw faint horizontal bands at y = 1, 0, -1, -2
            (the -1 band is skipped when ``cutoff_lines`` draws it instead).
    """
    half_width = 0.01  # half-thickness of every guide band, in data units

    xl, protein_order = order_by_label(xl.copy())
    rbps = xl.loc[xl['RBP?'] == 'RBP', :]
    non_rbps = xl.loc[xl['RBP?'] == 'non-RBP', :]
    print(f"{len(set(xl.Protein))} proteins plotted. {len(set(rbps.Protein))} RBPs and {len(set(non_rbps.Protein))} non-RBPs.")

    xl['Value'] = np.log10(xl.Value)
    # NOTE(review): edgecolors=None is passed through unchanged; whether it
    # actually removes marker outlines depends on seaborn's defaults - confirm.
    sns.scatterplot(
        x='Protein', y='Value', data=xl,
        hue='RBP?', alpha=0.5,
        palette=colors,
        edgecolors=None,
    )

    if grid_lines:
        for y in [1, 0, -1, -2]:
            if cutoff_lines and y == -1:
                continue
            plt.axhspan(y - half_width, y + half_width, facecolor='0', alpha=0.25)

    # Thin vertical guide at every x tick; drawn regardless of grid_lines.
    locs, _ = plt.xticks()
    for x_loc in locs:
        plt.axvspan(x_loc - half_width, x_loc + half_width, facecolor='0', alpha=0.2)

    if cutoff_lines:
        plt.axhspan(-1 - half_width, -1 + half_width, facecolor='0', alpha=0.5)
        plt.text(plt.xlim()[0] + 1, -1 + 0.03, 'XL=0.1% cutoff for RBP')

    
def _plot_xl_rates(region, pdf_name, cutoff_lines=False):
    """Plot and save the cross-linking rate scatter for one measurement region.

    Selects the rows of the module-level ``wts`` frame whose 'Label' is
    ``'% XL (<region>)'``, floors 'Value' at 0.01, draws the scatter with
    ``xl_scatterplot`` on a new 12 x 2 figure, and saves it under
    ``{pma_dir}/dataAndScripts/clip/figs/``.

    Args:
        region: Region text inside the label, e.g. 'minimal region'.
        pdf_name: File name of the PDF to write.
        cutoff_lines: Forwarded to ``xl_scatterplot``.

    Returns:
        ``(xl, fig)``: the plotted data frame and the figure.
    """
    xl = wts[wts['Label'] == f'% XL ({region})'].copy()
    # Assign the clipped column back. The chained form
    # xl['Value'].clip(..., inplace=True) acts on a temporary Series and does
    # not update xl once pandas Copy-on-Write is active.
    xl['Value'] = xl['Value'].clip(lower=0.01)
    fig = plt.figure()

    xl_scatterplot(xl, cutoff_lines=cutoff_lines)

    plt.ylabel(f'log10 % cross-linked molecules RNA \nper molecule of protein ({region})')
    plt.xticks(rotation='vertical')
    sns.despine()
    fig.set_figwidth(12)
    fig.set_figheight(2)
    plt.savefig(f'{pma_dir}/dataAndScripts/clip/figs/{pdf_name}')
    plt.show()
    plt.clf()
    plt.close()  # release the figure, matching the scatter cell further down
    return xl, fig


# Minimal region plot. The 'minmal' spelling in the file name is kept so the
# output path stays the same as before.
xl, fig = _plot_xl_rates(
    'minimal region', 'XL_rates_all_RBPs_minmal_region.pdf', cutoff_lines=True)

# Whole lane plot.
xl, fig = _plot_xl_rates('whole lane', 'XL_rates_all_RBPs_whole_lane.pdf')

#oddsratio, pvalue = scipy.stats.fisher_exact([[23, 0], [1, 10]])
#print(f"P value for Fisher Exact: {pvalue}")

In [None]:
import xlLoader

def make_scatterplot_and_excel_file(stats_df, x='% XL (minimal region)', y='% XL (minimal region)', text=True, log=True):
    """Scatter column ``y`` against column ``x``, coloured by 'Category', and save a PDF.

    A dotted y = x reference line is drawn across the x range and, optionally,
    every point is labelled with its 'Protein' value. The figure is written to
    ``{pma_dir}/dataAndScripts/clip/figs/scatter_protein_vs_RNA.pdf``.
    Despite the name, this function does not write an Excel file.

    Args:
        stats_df: Frame with columns ``x``, ``y``, 'Category' and 'Protein'.
            When ``log`` is true, 'log10 <x>' and 'log10 <y>' columns are
            added to it.
        x: Column plotted on the x axis.
        y: Column plotted on the y axis.
        text: When true, annotate each point with its protein name.
        log: When true, plot log10 of both columns.
    """
    if log:
        stats_df['log10 ' + x] = np.log10(stats_df[x])
        stats_df['log10 ' + y] = np.log10(stats_df[y])
        x = 'log10 ' + x
        y = 'log10 ' + y

    p1 = sns.lmplot(
        x=x, y=y, scatter_kws=dict(edgecolor="none"),
        hue='Category', data=stats_df, fit_reg=False
    )

    # Reference diagonal over the finite x range. NaN and +/-inf (log10 of 0)
    # are ignored so np.arange always receives finite bounds.
    finite_x = stats_df[x].replace([np.inf, -np.inf], np.nan).dropna()
    if len(finite_x):
        lo, hi = finite_x.min(), finite_x.max()
        # Extend 10% past the maximum. Multiplying a negative maximum by 1.1
        # would move the end point left of the data, so scale towards zero.
        upper = hi * 1.1 if hi >= 0 else hi * 0.9
        xarr = np.arange(lo, upper, 0.05)
        plt.plot(xarr, xarr, 'k:')

    if text:
        ax = p1.axes.flatten()[0]
        # Iterate by position: stats_df may be indexed by protein name, where
        # integer keys such as stats_df[x][0] are not valid labels.
        for x_val, y_val, protein in zip(stats_df[x], stats_df[y], stats_df['Protein']):
            if not (np.isfinite(x_val) and np.isfinite(y_val)):
                continue
            ax.text(
                x_val + 0.01, y_val,
                protein, horizontalalignment='left',
                size='small', color='black')

    plt.savefig(f"{pma_dir}/dataAndScripts/clip/figs/scatter_protein_vs_RNA.pdf")
    plt.show(); plt.clf(); plt.close()

# Load the crosslinking table through the project loader, plus the literature
# annotation sheet, which is indexed by protein name for lookups.
xlLoad = xlLoader.xlLoader(f"{pma_dir}/percentCrosslinked.xlsx")
(df, recurrent) = xlLoad.load()
info = pandas.read_excel(f"{pma_dir}/RBP missense mutations/General_data_from_literature.xlsx")
#info = info.loc[[pandas.isna(x) for x in info.Skip], :]
info.index = info.loc[:, 'Protein']
print(info.head(2))
print('--=-' * 10)

# Keep proteins that are annotated in `info` or listed in rando_names. Copy so
# the 'Category' column is written to an independent frame.
df = df.loc[[(
    (x in info.index) or (x in rando_names)) for x in df['Protein']], :].copy()

# Rows kept only because they are in rando_names have no entry in `info`, so
# info.loc would raise KeyError for them. They fall back to 'non-RBP', the
# class is_RBP() gives to rando_names.
df['Category'] = [
    info.loc[x, 'Direct RBP?'] if x in info.index else 'non-RBP'
    for x in df['Protein']
]

print(info.loc['KHDRBS2', 'Direct RBP?'])
print(df['Category'].value_counts())
def reorganize(df):
    """Pivot a long table into one row per protein with one column per label.

    For each distinct 'Label' the mean 'Value' per 'Protein' is computed; the
    per-label means become columns named after the label. 'Protein' (a copy of
    the index) and 'Category' (the category of the protein's first row in
    ``df``) columns are appended.

    Labels whose mean cannot be computed are reported and skipped. Label
    columns follow the order in which labels first appear in ``df``.

    Args:
        df: Long-format frame with 'Protein', 'Label', 'Value' and 'Category'.

    Returns:
        A protein-indexed DataFrame with one column per usable label plus
        'Protein' and 'Category'.
    """
    per_label_means = []
    kept_labels = []

    # First-appearance order is deterministic; iterating a set of strings is not.
    for label in df['Label'].unique():
        rows = df.loc[df['Label'] == label, :]
        try:
            per_label_means.append(rows.groupby(['Protein'])['Value'].mean())
        except Exception:
            print(f"Could not mean() {label}")
            continue
        # Record only labels that produced a column, so the names assigned
        # below always line up with the concatenated columns.
        kept_labels.append(label)

    a = pandas.concat(per_label_means, axis=1)
    a.columns = kept_labels
    a['Protein'] = a.index

    # Protein -> category lookup, using each protein's first row.
    to_cat = {}
    for protein, category in zip(df['Protein'], df['Category']):
        to_cat.setdefault(protein, category)

    a['Category'] = [to_cat[x] for x in a.index]

    # Return the protein-grouped frame with the category column added.
    return a

# Proteins exempt from the 'Recurrent' category filter below.
recurrent_RBPs = [
    'A1CF', 'HNRNPCL1', 'FUBP1', 'DDX50',
    'RBM11', 'NOVA1', 'SRSF2', 'KHDRBS2',
    'PABPC4L', 'DDX3X', 'RBM39', 'RPL5',
    'YTHDC2', 'SF3B1', 'PCBP1', 'U2AF1',
]

# One row per protein, one column per measurement label.
a = reorganize(df)

# Keep a row when its category is a string without 'Recurrent', or when the
# protein is in recurrent_RBPs.
a = a.loc[
    [
        bool((type(cat) == str and 'Recurrent' not in cat) or (prot in recurrent_RBPs))
        for cat, prot in zip(a.Category, a.Protein)
    ],
    :,
]

# NOTE(review): this second filter only admits three exact category values,
# none of which contains 'Recurrent', so it alone decides which rows survive -
# confirm whether the recurrent_RBPs exemption above is meant to have an effect.
a = a.loc[a.Category.isin(['Direct', 'non-RBP', 'Unknown']), :]

make_scatterplot_and_excel_file(
    a,
    x='pmol protein',
    y='fmol RNA (minimal region)',
    text=True,
    log=True,
)