In [None]:
import pandas, os, sys, re, collections, importlib, glob, itertools
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import numpy as np
from typing import List, Tuple, Union, Mapping
import nameUtils
importlib.reload(nameUtils)
import stabilityLoader
importlib.reload(stabilityLoader)
figsdir = '/Users/dp/pma/RBP missense mutations//figs/'

In [None]:


# function to calculate Cohen's d for independent samples
def cohend(d1, d2):
    """Return Cohen's d for two independent samples.

    The difference between the mean of ``d1`` and the mean of ``d2`` is
    divided by the pooled standard deviation, which is built from the two
    sample variances (``ddof=1``) weighted by their degrees of freedom.
    """
    size_a = len(d1)
    size_b = len(d2)

    # Unbiased sample variances.
    var_a = np.var(d1, ddof=1)
    var_b = np.var(d2, ddof=1)

    pooled_sd = np.sqrt(
        ((size_a - 1) * var_a + (size_b - 1) * var_b) / (size_a + size_b - 2))

    mean_a = np.mean(d1)
    mean_b = np.mean(d2)
    return (mean_a - mean_b) / pooled_sd



def get_pval(prot: str, df: pandas.DataFrame, _stats: dict) -> float:
    """Get the lowest p value for comparing each of the mutant forms of 'prot' to the wild-type.

    Args:
        prot: Either a value of the 'Reference' column, or a value of the
            'Protein' column whose 'Reference' entry names the wild-type.
        df: Table with at least the columns 'Protein' and 'Reference'.
        _stats: Mapping whose 'pval' entry maps each mutant 'Protein' name
            to its p value.

    Returns:
        The smallest p value among all proteins that share the wild-type's
        'Reference' (the wild-type itself excluded).

    Raises:
        IndexError: if 'prot' is found in neither column, or if the
            wild-type has no mutant rows.
        KeyError: if a mutant has no entry in ``_stats['pval']``.
    """
    # NOTE: `prot in df.Reference` would test the Series *index* labels, not
    # its values, so the membership test has to be done on the values.
    if (df['Reference'] == prot).any():
        wt = prot
    else:
        references = df.loc[df['Protein'] == prot, 'Reference'].tolist()
        if not references:
            raise IndexError(f"{prot!r} not found in 'Protein' or 'Reference'")
        wt = references[0]

    muts = set(df.loc[df['Reference'] == wt, 'Protein']) - {wt}
    pvals = [_stats['pval'][mut] for mut in muts]
    if not pvals:
        raise IndexError(f"no mutant rows found for reference {wt!r}")

    return min(pvals)  # Lowest p value.


In [None]:
###################
# Load data and calculate stats.
input_filename = '/Users/dp/pma/dataAndScripts/clip/experiments/Exp85_missense_and_stability_trial_using_control_vector/Exp85_missense.xlsx'
input_sheet = 'Stability'

# Drop every row whose Protein name contains 'SF3B1'. The explicit .copy()
# makes the filtered frame independent of the loaded one, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
df = stabilityLoader.load_data(input_filename, input_sheet)
df = df.loc[['SF3B1' not in x for x in df.Protein], :].copy()
_stats = stabilityLoader.get_stats(df)

# Protein name -> list of its 'Abundance/WT normalized by image' values.
by_prot = df.groupby(by=['Protein'])['Abundance/WT normalized by image'].apply(list).to_dict()

###################
# Put things in order.
# Every row gets the lowest mutant-vs-WT p value of its protein (see get_pval).
df['lowest'] = [get_pval(x, df, _stats) for x in df['Protein']]
df = df.sort_values(by='lowest')
ordered = df.Protein.to_list()

# Position of the first row of each protein in the p-value ordering
# (same value as ordered.index(name), without rescanning the list per row).
first_position = {}
for rank, prot in enumerate(ordered):
    first_position.setdefault(prot, rank)

# A row is placed at the position of its Reference; rows flagged 'Mut' get +0.5.
wt_offset = {'WT': 0, 'Mut': 0.5}
df['position'] = [
    first_position[reference] + wt_offset[is_wt]
    for is_wt, reference in zip(df['WT or mut'], df.Reference)]
df = df.sort_values(by='position')

# Stats table, with the index copied into a 'Mutation' column, ordered by the
# same lowest-p-value key.
sdf = pandas.DataFrame(_stats)
sdf['Mutation'] = sdf.index
sdf['lowest'] = [get_pval(x, df, _stats) for x in sdf['Protein group']]
sdf = sdf.sort_values(by='lowest')

###################
# Plotting functions
def pal(n_colors):
    """Return a seaborn cubehelix palette of ``n_colors`` colors, in reversed order."""
    forward = sns.cubehelix_palette(
        n_colors=n_colors, start=0, gamma=3, rot=180, dark=0)
    return forward[::-1]

def f7(seq: List) -> List:
    """Collapse each run of consecutive equal items in ``seq`` to a single item.

    Only adjacent duplicates are merged; an item may still appear again
    later in the result.
    """
    collapsed = []
    for key, _run in itertools.groupby(seq):
        collapsed.append(key)
    return collapsed

def get_xlabels(df: pandas.DataFrame, initial_space=False) -> List:
    """Build the x-axis label order, with an empty label between protein groups.

    Rows are walked in their current order. While the 'Protein group' value
    stays the same, each 'Protein' not yet in the label list is appended.
    When the group changes, an empty-string spacer is appended, followed by
    the protein if it has not already opened an earlier group. Consecutive
    duplicate labels are collapsed before returning.

    Args:
        df: Table with the columns 'Protein group' and 'Protein'.
        initial_space: When true, the label list starts with an empty label.
    """
    labels = [''] if initial_space else []
    group_openers = ['']  # proteins that were appended on a group change

    groups = df['Protein group'].to_list()
    last_group = groups[0]
    for group, protein in zip(groups, df.Protein):
        if group != last_group:
            # New group: insert the spacer, then the protein (once only).
            labels.append('')
            if protein not in group_openers:
                labels.append(protein)
                group_openers.append(protein)
        elif protein not in labels:
            labels.append(protein)
        last_group = group

    return f7(labels)

def get_xlabels_as_tuples(df: pandas.DataFrame) -> List:
    """Return the hard-coded list of two-element label pairs.

    Each pair is ``[protein, protein followed by a substitution]``.
    ``df`` is accepted but not used: the pairs are hard-coded for now.
    """
    label_pairs = [
        ['RARS2', 'RARS2 R6C'],
        ['RBFOX1', 'RBFOX1 A49T'],
        ['KPNB1', 'KPNB1 L552F'],
        ['U2AF1', 'U2AF1 S34F'],
        # ['SF3B1', 'SF3B1 K700E'],  (commented out of the list)
        ['DICER1', 'DICER1 R944Q'],
        ['EIF1AX', 'EIF1AX G9D'],
        ['SRSF2', 'SRSF2 P95L'],
        ['NUFIP1', 'NUFIP1 R475W'],
        ['PCBP1', 'PCBP1 L100P'],
        ['PCBP1', 'PCBP1 L100Q'],
        ['CNOT9', 'CNOT9 P131L'],
        ['PABPC4L', 'PABPC4L E312K'],
        ['YTHDC2', 'YTHDC2 E635K'],
        ['YTHDC2', 'YTHDC2 E185K'],
        ['NOVA1', 'NOVA1 R146C'],
        ['DCP1B', 'DCP1B Q252H'],
        ['KHDRBS2', 'KHDRBS2 R168C'],
        ['HNRNPCL1', 'HNRNPCL1 R99Q'],
    ]
    return label_pairs


###################
# Plotting
sns.set_style('whitegrid')

# One point per row of the stats table: the 'cohend' column against
# 'Mutation', colored by the 'ttest sig' column.
sns.stripplot(
    data=sdf,
    x='Mutation', y='cohend', hue='ttest sig',
    s=10, palette='Set2')
plt.xticks(rotation=90, fontweight='light', fontsize='medium')

plt.show()
plt.clf()
plt.close()

# Protein name -> its Reference value.
to_reference = dict(zip(df['Protein'], df['Reference']))

print(df.head(2))

def make_plot():
    """Draw the per-protein strip plot of log2 abundance and return its Axes.

    Works on the module-level ``df`` and adds two columns to it:
    'WT Protein' (the part of 'Protein' before the first space) and
    'log2 Abundance/WT normalized by image'. Points are colored by the
    'WT or mut' column and ordered along x by ``get_xlabels``.

    Also prints, per (Protein, WT or mut) group, the raw
    'Abundance/WT normalized by image' values and then their medians.

    Returns:
        The Axes returned by ``sns.stripplot``.
    """
    value_col = 'Abundance/WT normalized by image'
    df['WT Protein'] = [x.split(' ')[0] for x in df.Protein]
    df['log2 Abundance/WT normalized by image'] = np.log2(df[value_col])

    my_pal = {'WT': 'grey', 'Mut': 'lightblue'}
    ax = sns.stripplot(
        x='Protein', y='log2 Abundance/WT normalized by image',
        hue='WT or mut',
        data=df, palette=my_pal,
        order=get_xlabels(df, initial_space=True),
    )
    # NOTE(review): the plotted values are log2-transformed while the label
    # reads as plain abundance; confirm the intended axis label.
    plt.ylabel('Protein abundance (AU)')
    plt.xticks(rotation=90, fontweight='light', fontsize=8)

    # Diagnostic output: raw values per group, then the per-group medians.
    g = df.groupby(by=['Protein', 'WT or mut'])[value_col].apply(np.array)
    print(g)
    g = pandas.DataFrame(g)
    g[value_col] = [np.median(x) for x in g[value_col]]
    g.reset_index(inplace=True)
    print(g)

    return ax

# Draw the abundance strip plot on a fresh figure, widen it, and save it as a
# PDF under `figsdir` before showing and releasing the figure.
fig = plt.figure()
ax = make_plot()
fig.set_figwidth(12)
# y-range of the finished plot; in the visible code it is only referenced by
# the disabled snippet stored in `skip` below.
y_extremes = plt.ylim()
plt.savefig(f"{figsdir}/protein_abundance_stripplot.pdf")
plt.show(); plt.clf(); plt.close()

#make_plot()
# Disabled code kept as an unused string (never executed): it would save
# separate "lower" and "higher" versions of the strip plot.
skip = """
#plt.ylim(y_extremes[0], 3.1)
plt.savefig(f"{figsdir}/protein_abundance_stripplot_lower.pdf")
plt.show(); plt.clf(); plt.close()

sns.set(rc={'figure.figsize':(7,1)})
sns.set_style('whitegrid')
make_plot()
#plt.ylim(3.1, y_extremes[1])
plt.yticks(ticks=[3.1, y_extremes[1]])

#plt.figure(figsize=(0.5, 4))
plt.savefig(f"{figsdir}/protein_abundance_stripplot_higher.pdf")
plt.show(); plt.clf(); plt.close()
"""
# Report the number of distinct values in the Reference column.
print(f'{len(set(df.Reference))} proteins assayed.')

In [None]:
import dabest
importlib.reload(dabest)
importlib.reload(nameUtils)

# Reload the data, again dropping every row whose Protein contains 'SF3B1'.
# The .copy() makes the filtered frame independent, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = stabilityLoader.load_data(input_filename, input_sheet)
df = df.loc[['SF3B1' not in x for x in df.Protein], :].copy()
_stats = stabilityLoader.get_stats(df)
df['log2 Abundance/WT normalized by image'] = np.log2(df['Abundance/WT normalized by image'])

# Protein name -> list of its log2 abundance values.
by_prot = df.groupby(by=['Protein'])['log2 Abundance/WT normalized by image'].apply(list).to_dict()

###################
# Put things in order.
df['lowest'] = [get_pval(x, df, _stats) for x in df['Protein']]
df = df.sort_values(by='lowest')
ordered = df.Protein.to_list()

# Position of the first row of each protein in the p-value ordering
# (same value as ordered.index(name), without rescanning the list per row).
first_position = {}
for rank, prot in enumerate(ordered):
    first_position.setdefault(prot, rank)

# A row is placed at the position of its Reference; rows flagged 'Mut' get +0.5.
wt_offset = {'WT': 0, 'Mut': 0.5}
df['position'] = [
    first_position[reference] + wt_offset[is_wt]
    for is_wt, reference in zip(df['WT or mut'], df.Reference)]
df = df.sort_values(by='position')


print(df.head(2))
ex = """
        Create a multi-group Cumming plot.

        >>> my_multi_groups = dabest.load(df, idx=(("Control 1", "Test 1"),
        ...                                        ("Control 2", "Test 2"))
        ...                               )
        >>> fig5 = my_multi_groups.mean_diff.plot()
"""

# Wide table for dabest: one float column per protein, one row per
# observation. Grouping yields a list even for a protein with a single
# observation, and aligning the per-protein Series on their integer index
# pads the shorter columns with NaN.
observations = df.groupby('Protein', sort=False)['log2 Abundance/WT normalized by image'].apply(list)
max_obs = max((len(values) for values in observations), default=0)
print(max_obs)

rows = pandas.DataFrame(
    {protein: pandas.Series(values, dtype='float')
     for protein, values in observations.items()})
rows.columns.name = 'Protein'
print(rows.head())

tup_label = get_xlabels_as_tuples(df=df)
print(tup_label)


# One Cumming plot per chunk of `chunk_len` label pairs. The range runs to
# len(tup_label) (not len - 1) so that a final chunk holding a single pair is
# still plotted.
chunk_len = 4
for left in range(0, len(tup_label), chunk_len):
    iris_dabest = dabest.load(rows, idx=tup_label[left:left+chunk_len])
    a_plot = iris_dabest.mean_diff.plot(
        halfviolin_desat=0.,
    )
    plt.savefig(f'./figs/dabest_{left}.pdf')

# Keep a CSV copy of the wide table.
rows.to_csv('/Users/dp/Desktop/temp.csv', sep=',')



In [None]:
import dabest
# Reload the data, dropping every row whose Protein contains 'SF3B1'. The
# .copy() makes the filtered frame independent, so adding the log2 column
# does not raise pandas' SettingWithCopyWarning.
df = stabilityLoader.load_data(input_filename, input_sheet)
df = df.loc[['SF3B1' not in x for x in df.Protein], :].copy()
_stats = stabilityLoader.get_stats(df)
df['log2 Abundance/WT normalized by image'] = np.log2(df['Abundance/WT normalized by image'])

# One mean log2 abundance per (Protein, WT or mut) combination.
g = (
    df.groupby(by=['Protein', 'WT or mut'])['log2 Abundance/WT normalized by image']
    .apply(np.mean)
    .reset_index()
)
print(g)

my_pal = {
    'WT': 'grey',
    'Mut': 'lightblue'
}
# Compare the per-protein means of all 'WT' rows against all 'Mut' rows.
dabest_xl = dabest.load(
    data=g, x='WT or mut', y='log2 Abundance/WT normalized by image', idx=('WT', 'Mut'))
fig = dabest_xl.mean_diff.plot(custom_palette=my_pal)
fig.set_figwidth(3)
fig.set_figheight(3)
plt.savefig('./figs/dabest_mean_of_all_mutants_vs_all_proteins.pdf')

In [None]:
import statsmodels.stats.api as sms
import statsmodels

print(rows['NUFIP1 R475W'])
print(tup_label)

# For every (a, b) label pair, compute the 95% confidence interval of
# mean(b) - mean(a) without assuming equal variances. NaN padding in the wide
# `rows` table is dropped first.
cis = []
for (a, b) in tup_label:
    print(a, b)
    x1 = np.array(rows[a], dtype=float)
    x2 = np.array(rows[b], dtype=float)
    x1 = x1[~np.isnan(x1)]
    x2 = x2[~np.isnan(x2)]
    print(x1, x2)
    cm = sms.CompareMeans(sms.DescrStatsW(x2), sms.DescrStatsW(x1))
    interv = cm.tconfint_diff(usevar='unequal', alpha=0.05)
    cis.append({'Label': (a, b), 'CI': interv, 'N': min([len(x1), len(x2)])})

sns.set(rc={'figure.figsize':(6,6)})
sns.set_style('ticks')
for n, _d in enumerate(cis):
    label, ci = _d['Label'], _d['CI']
    print(n, ci)
    # Black when the interval contains zero, red otherwise.
    color = 'k' if ci[0] <= 0 <= ci[1] else 'r'
    plt.plot((n, n), ci, c=color, label=label[-1])
    # Annotate each interval with the smaller of the two sample sizes.
    plt.text(n-0.25, ci[1]+0.25, str(_d['N']), fontsize=10)

plt.xticks(range(len(cis)), [x['Label'][-1] for x in cis], rotation=90)
# NOTE(review): `rows` appears to hold log2 values, so these are differences
# of log2 means; confirm the wording of this axis label.
plt.ylabel('Mean fold difference in abundance of MUT/WT (95% CI)')
sns.despine()
# Use the existing Axes: plt.axes() would add a new Axes to the figure.
plt.gca().yaxis.grid(True)
plt.savefig(f"{figsdir}/mean_fold_diff_abundance_ci_line_graph.pdf")
plt.show()
plt.clf()

In [None]:
print(f"# proteins {len(set(df['Protein group']))}")  # 18

# One estimation plot per protein group, laid out four to a figure (2 x 2).
# Groups are taken in order of first appearance so the panel layout is the
# same on every run.
prot_groups = list(df['Protein group'].unique())
for n, prot_group in enumerate(prot_groups):

    if n % 4 == 0:
        fig, axes = plt.subplots(2, 2)

    # Panel slot inside the current figure: 0 -> (0, 0), 1 -> (0, 1),
    # 2 -> (1, 0), 3 -> (1, 1).
    row, col = divmod(n % 4, 2)
    print(f"row {row} col {col}")

    sub = df.loc[df['Protein group'] == prot_group, :]
    iris_dabest = dabest.load(
        data=sub, x="WT or mut", y="Norm signal",
        idx=["WT", "Mut"]
        )

    # Produce a Cumming estimation plot.
    iris_dabest.mean_diff.plot(
        ax=axes[row][col],
        halfviolin_desat=0.,
    )
    axes[row][col].set_title(prot_group)

    # Flush the figure once it is full, and also after the very last group so
    # that a partially filled final figure is shown and closed too.
    if n % 4 == 3 or n == len(prot_groups) - 1:
        plt.show()
        plt.clf()
        plt.close()
