In [None]:
import os, sys, re, pandas, collections
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.stats as scs

import sameRiver
import sameRiver.parameters_for_hyperbolic_curve as pfhc
import importlib
import utils

importlib.reload(utils)
importlib.reload(pfhc)

In [None]:

import statsmodels.api as sm
import math

def to_perc_error(fu, obj, fmols, K):
    """Percent error of a signal-derived estimate relative to a reference amount.

    The signal `fu` is divided by the per-object constant `K[obj]` to give an
    estimate, which is then compared with `fmols`.

    Args:
        fu: Measured signal.
        obj: Key used to look up the conversion constant in `K`.
        fmols: Reference amount the estimate is compared against.
        K: Mapping from object key to conversion constant.

    Returns:
        100 * (estimate - fmols) / fmols, or 0 when `obj` has no entry in `K`.
    """
    if obj not in K:
        return 0
    estimate = fu/K[obj]
    relative_error = (estimate - fmols)/fmols
    return 100*relative_error

def print_L5_L3_amounts(_df, Ka3=None, Ka5=None):
    """Print per-lane αL3/αL5 amounts derived from their signals.

    For each group of rows sharing a 'Lane' value this prints the rows and
    then, unless the lane's αL3 row has Complex == 'Staple', prints the αL3
    'fmols' value, Signal/Ka for each oligo, and the Ka-corrected L3/L5 ratio.
    This function only prints; it returns None.

    Args:
        _df: DataFrame with 'Lane', 'Object', 'Complex', 'Signal' and 'fmols'
            columns. Every reported lane needs an 'αL3' and an 'αL5' row.
        Ka3: Constant the αL3 signal is divided by. When None, the
            module-level name `Ka3` is used (the original behaviour).
        Ka5: Constant the αL5 signal is divided by. When None, the
            module-level name `Ka5` is used.

    Raises:
        NameError: if a constant is needed but was neither passed in nor
            defined at module level.
        KeyError: if a lane has no 'αL3' row, or a reported lane has no
            'αL5' row.
    """
    for _, lane in _df.groupby(by='Lane'):
        print(lane)

        # Index a copy by object name instead of overwriting the group's index.
        by_object = lane.set_index('Object', drop=False)
        if by_object.loc['αL3']['Complex']=='Staple':
            continue

        # Resolve the constants lazily so frames containing only staple lanes
        # never need them, exactly as before.
        if Ka3 is None or Ka5 is None:
            module_names = globals()
            missing = [name for name, value in (('Ka3', Ka3), ('Ka5', Ka5))
                       if value is None and name not in module_names]
            if missing:
                raise NameError(
                    'print_L5_L3_amounts needs {0}: pass as keyword argument(s) '
                    'or define at module level'.format(' and '.join(missing)))
            Ka3 = module_names['Ka3'] if Ka3 is None else Ka3
            Ka5 = module_names['Ka5'] if Ka5 is None else Ka5

        FU3 = by_object.loc['αL3']['Signal']
        FU5 = by_object.loc['αL5']['Signal']
        print('fmols aL3', by_object.loc['αL3']['fmols'])
        print('L3:', FU3/Ka3)
        print('L5:', FU5/Ka5)
        print('L3/L5:', FU3/FU5 * Ka5/Ka3)
        
        
# Load the '180428_1' sheet of the shift-based quantification workbook.
df_1 = utils.load_sheet(
    fname='../Shift based quantification protocol and results.xlsx',
    sheet_name='180428_1')

# utils.get_Ka returns a frame plus a second value, kept here as K_1.
# NOTE(review): neither df_1 nor K_1 is read again in the visible cells —
# confirm whether this sheet is still meant to be part of the analysis.
df_1, K_1 = utils.get_Ka(df_1)

# Load the '180517_1' sheet; the rest of this cell works from this frame.
df_3 = utils.load_sheet(
    fname='../Shift based quantification protocol and results.xlsx',
    sheet_name='180517_1')

def correct_for_dilutions(total, factor=0.8):
    """Scale amounts below 50 by `factor`; return larger amounts unchanged.

    Args:
        total: Amount to correct.
        factor: Multiplier applied when `total` is less than 50.

    Returns:
        `total * factor` if `total < 50`, otherwise `total` itself.
    """
    return total*factor if total < 50 else total

# Apply the dilution correction to the nominal amounts, and shift the row
# labels by 100.
df_3['fmols'] = [correct_for_dilutions(x) for x in df_3['fmols'].tolist()]
df_3.index = [100 + x for x in df_3.index]

df_3, K_3 = utils.get_Ka(df_3)

# Only df_3 is analysed; the concat call is kept so further sheets can be
# appended to the list.
df = pandas.concat([df_3], axis=0)
# .copy() makes the filtered rows an independent frame, so the column
# assignments below never write into a slice of another frame.
df = df[df['fmols']>0].copy()

# Absolute errors of the estimate, relative (%) and in fmols. The filter above
# guarantees 'fmols' > 0, so the division is safe.
df['% error'] = (100*(df['Est. fmols'] - df['fmols'])/df['fmols']).abs()
df['fmols error'] = (df['Est. fmols'] - df['fmols']).abs()
#print_L5_L3_amounts(df)

# Staple lanes versus everything else; `dup` keeps only the two antisense
# oligos from the non-staple rows.
staple = df[df['Complex']=='Staple'].copy()
dup = df[(df['Complex']!='Staple') & df['Object'].isin(['αL5', 'αL3'])].copy()

def print_linreg(_df, channel):
    """Print two regression summaries for the rows of one 'Channel' value.

    First prints scipy's simple linear regression of 'Signal' on 'fmols'.
    Then prints the statsmodels OLS summary of 'Est. fmols' regressed on
    'fmols'. The OLS design matrix is the bare 'fmols' column (no constant is
    added), so that fit has no intercept term.

    This function only prints; it returns None.

    Args:
        _df: DataFrame with 'Channel', 'Signal', 'fmols' and 'Est. fmols'
            columns.
        channel: Value of 'Channel' selecting the rows to fit.
    """
    sub = _df[_df['Channel']==channel]

    fit = scs.linregress(x=sub['fmols'], y=sub['Signal'])
    print('linregress {0}:'.format(channel))
    print("slope {0} intercept {1} r {2} P {3} std_err {4}".format(
        fit.slope, fit.intercept, fit.rvalue, fit.pvalue, fit.stderr))

    result = sm.OLS(sub['Est. fmols'], sub['fmols']).fit()
    print(result.summary())

print_linreg(dup, 800)
print_linreg(dup, 700)

# Plot signal vs fmols.
plt.clf()

g = dup[dup['Channel']==800].copy()
sns.set_style('whitegrid')
sns.lmplot(
    data=g, y='Signal', x='fmols', hue='Image Name', scatter_kws={'alpha':0.3})
plt.show()
plt.clf()

# Plot estimation vs fmols.
# sns.lmplot is a figure-level function that builds its own figure, so no
# plt.subplots() call precedes it: that only produced an extra, empty figure.
sns.lmplot(
    data=g, y='Est. fmols', x='fmols', hue='Image Name', scatter_kws={'alpha':0.3})
plt.show()
plt.clf()

# Plot % error vs fmols, zoomed in on the low-amount range.
sns.lmplot(
    data=dup, y='% error', x='fmols', hue='Object', scatter_kws={'alpha':0.3})

plt.xlim(0, 20)
plt.ylim(-100, 100)
plt.show()
plt.clf()

# Plot absolute fmols error vs fmols, with default axis limits.
sns.lmplot(
    data=dup, y='fmols error', x='fmols', hue='Object', scatter_kws={'alpha':0.3})

#plt.xlim(0, 200)
#plt.ylim(-200, 200)
plt.show()
plt.clf()

# Estimated ('Est. fmols') and nominal ('fmols') amounts for the αL3 rows.
l3 = dup[dup['Object']=='αL3'].copy()
est_l3 = l3['Est. fmols'].tolist()
true_l3 = l3['fmols'].tolist()

# Same two columns for the αL5 rows.
l5 = dup[dup['Object']=='αL5'].copy()
est_l5 = l5['Est. fmols'].tolist()
true_l5 = l5['fmols'].tolist()

# Scatter of estimate against nominal amount: αL3 as black dots, αL5 as red dots.
plt.plot(true_l3, est_l3, 'k.', )
plt.plot(true_l5, est_l5, 'r.')
# Green y = x reference line: points on it are perfect estimates.
plt.plot(range(0, 350), range(0, 350), 'g-')
plt.xlim(0, 350)
plt.ylim(0, 350)
plt.show()
plt.clf()

In [None]:
# Signal per fmol of each antisense oligo, grouped by the 'Free' column, from
# the '180420 10 fmol staple' sheet.
df = utils.load_sheet(
    fname='../Shift based quantification protocol and results.xlsx',
    sheet_name='180420 10 fmol staple')
df = df[df['Object']!='None']

# numeric_only=True: from pandas 2.0 GroupBy.mean() no longer drops text
# columns silently and raises a TypeError on them instead.
_df = df.groupby(['Free', 'Object']).mean(numeric_only=True)
print(_df)
print('---')

# Ratio of mean Signal/fmol where 'Free' is 'No' to where it is 'Yes'.
# Explicit tuples make it unambiguous that both labels address the row
# MultiIndex.
fold_loss_aL5 = (_df.loc[('No', 'αL5'), 'Signal/fmol']
                 / _df.loc[('Yes', 'αL5'), 'Signal/fmol'])
fold_loss_aL3 = (_df.loc[('No', 'αL3'), 'Signal/fmol']
                 / _df.loc[('Yes', 'αL3'), 'Signal/fmol'])
print("aL5 fluorescence in complex = free aL5 * {0}".format(fold_loss_aL5))
print("aL3 fluorescence in complex = free aL3 * {0}".format(fold_loss_aL3))

# Colors keyed by oligo name and by channel number.
obj_color = {
    'αL3': '#DF587A',
    'αL5': '#74C6A0',
    700: '#DF587A',
    800: '#74C6A0',
}
plt.clf()
fig, ax = plt.subplots()
# Draw on the axes created above explicitly instead of relying on the
# implicit "current axes".
sns.barplot(
    data=df, x='Signal/fmol', y='Free', hue='Object',
    palette=obj_color, ax=ax)
fig.savefig('../figs/staple_unlabel_vs_label.pdf')
plt.show()

In [None]:
# Reload the '180420 10 fmol staple' sheet (the previous cell filtered its copy).
df = utils.load_sheet(
    fname='../Shift based quantification protocol and results.xlsx',
    sheet_name='180420 10 fmol staple')

# `peg` is a second name for the same DataFrame object; no copy is made.
peg = df
def normalize(df):
    """Add a 'FU/fmol (normalized)' column scaled to each oligo's 200-fmol mean.

    For 'αL3' and 'αL5' separately, the mean 'Signal/fmol' of the rows with
    fmols == 200 is used as the reference; every row's 'Signal/fmol' is then
    divided by the reference for its 'Object'.

    Args:
        df: DataFrame with 'fmols', 'Object' and 'Signal/fmol' columns. It is
            modified in place (the new column is added to it).

    Returns:
        The same DataFrame object, with the extra column.

    Raises:
        KeyError: if there is no fmols == 200 group for 'αL3' or 'αL5', or if
            'Object' contains a value other than those two.
    """
    # Average only the column that is needed, and only once. Averaging the
    # whole frame raises a TypeError on text columns from pandas 2.0 onwards.
    mean_signal = df.groupby(['fmols', 'Object'])['Signal/fmol'].mean()
    to_k = {
        'αL3': mean_signal.loc[(200, 'αL3')],
        'αL5': mean_signal.loc[(200, 'αL5')],
    }
    df['FU/fmol (normalized)'] = [fu/to_k[obj] for fu, obj in zip(
        df['Signal/fmol'], df['Object'])]
    return df

# Colors keyed by oligo name and by channel number.
# NOTE(review): this dict is not passed to the plot below, and an identical
# dict is defined in the preceding cell — confirm it is still needed here.
obj_color = {
    'αL3': '#DF587A',
    'αL5': '#74C6A0',
    700: '#DF587A',
    800: '#74C6A0',
}



# Bar plot of the 'FU/fmol' column against 'fmols' for the sheet loaded above.
# NOTE(review): no visible code creates a 'FU/fmol' column (normalize() adds
# 'FU/fmol (normalized)' and is not called; other cells use 'Signal/fmol'), so
# this presumably relies on the column existing in the spreadsheet — verify.
plt.clf()
sns.barplot(
    data=peg, y='FU/fmol', x='fmols')#, hue='fmols')

plt.show()

In [None]:
def purification_control_oligos(df):
    """Estimate purification efficiency from the purification control oligo rows.

    PCO rows are those with Object == 'PCO' and Channel == 800. A line
    fmols = m*signal + b is fitted to the PCO rows marked STD? == 'STD', plus
    an added (0, 0) point. The line converts the signal of the remaining PCO
    rows to 'Est. fmols', and efficiency is
    100 * 'Est. fmols' / 'Input PCO fmols'.

    Args:
        df: DataFrame with 'Object', 'Channel', 'STD?', 'Signal', 'fmols' and
            'Input PCO fmols' columns. It is modified in place: a
            'Purification efficiency (%)' column is added, holding the
            estimate on non-standard PCO rows and 0 everywhere else.

    Returns:
        (df, pco_stds, pco_exp): the input frame, the PCO standard rows, and
        an independent copy of the non-standard PCO rows carrying 'Est. fmols'
        and 'Purification efficiency (%)'.
    """
    # All purification oligos, which are all on the 800 wavelength.
    is_pco = (df['Object']=='PCO') & (df['Channel']==800)
    is_std = df['STD?']=='STD'

    # Fit a line to the standards, if possible.
    pco_stds = df[is_pco & is_std]
    print('PCO: {0} PCO standards at {1} fmols'.format(
        len(pco_stds), set(pco_stds['fmols'])))

    # Line fit; the extra (0, 0) point pulls the line towards the origin.
    signals = pco_stds['Signal'].tolist() + [0]
    fmols = pco_stds['fmols'].tolist() + [0]
    fit = scs.linregress(signals, fmols)
    m = fit.slope
    b = fit.intercept
    print('fmols=m*signal+b for PCO: m {0} b {1}'.format(m, b))

    # Estimate the purification efficiency. Work on a copy so the new columns
    # are not written into a slice of `df`.
    is_exp = is_pco & ~is_std
    pco_exp = df[is_exp].copy()
    pco_exp['Est. fmols'] = m*pco_exp['Signal'] + b
    pco_exp['Purification efficiency (%)'] = 100*(
        pco_exp['Est. fmols']/pco_exp['Input PCO fmols'])

    print('--- pco_exp:')
    print(pco_exp)

    # Initialise as float so the efficiencies fit the column dtype, then fill
    # the experimental rows positionally (safe even with repeated row labels).
    df['Purification efficiency (%)'] = 0.0
    df.loc[is_exp.to_numpy(), 'Purification efficiency (%)'] = (
        pco_exp['Purification efficiency (%)'].to_numpy())

    # Debug output for row label 41, kept from the original.
    print('\n\n\n41:')
    if 41 in df.index:
        print(df.loc[41])
    else:
        print('not in index')

    return df, pco_stds, pco_exp


def total_L3_on_gel(df):
    """Back-calculate total L3 amounts from the αL3 estimates.

    Runs purification_control_oligos(df) (which adds a column to `df` in
    place), then for every αL3 row that is neither a staple complex nor a
    standard:

      * 'Total L3 input to qRNA purification' =
            'Est. fmols' / (efficiency of the row's 'Purification' / 100)
      * 'Total L3 on gel' =
            'Total L3 input to qRNA purification' / 'Input % PK extract'

    Intermediate values are printed along the way.

    Args:
        df: DataFrame accepted by purification_control_oligos that also has
            'Complex', 'Est. fmols', 'Purification' and 'Input % PK extract'
            columns.

    Returns:
        An independent DataFrame of the selected αL3 rows with the two
        columns above added.

    Raises:
        KeyError: if a selected row's 'Purification' value has no efficiency
            entry.
    """
    df, pco_stds, pco_exp = purification_control_oligos(df)

    print("----\n" * 5)
    print('total_L3_on_gel():')
    purification_to_eff = dict(zip(
        pco_exp['Purification'], pco_exp['Purification efficiency (%)']))

    print("purification_to_eff:")
    print(purification_to_eff)

    # αL3 rows from real samples only. .copy() so the new columns are added to
    # an independent frame rather than to a slice of `df`.
    wanted = (
        (df['Object']=='αL3') & (df['Complex']!='Staple') & (df['STD?']!='STD'))
    aLL = df[wanted].copy()

    # NOTE(review): the values in this dict are percentages (they are divided
    # by 100 below), so this fallback treats 'L5' purifications as 1 %
    # efficient, i.e. a 100-fold upward correction. If "no correction" was
    # intended, the fallback should be 100. Left unchanged pending confirmation.
    if 'L5' not in purification_to_eff:
        purification_to_eff['L5'] = 1.
    aLL['Total L3 input to qRNA purification'] = [
        fmols/(purification_to_eff[purified]/100) for fmols, purified in zip(
            aLL['Est. fmols'], aLL['Purification'])]

    print('est fmols, purification:')
    print(list(zip(aLL['Est. fmols'], aLL['Purification'])))

    print("aLL['Total L3 input to qRNA purification'] : ")
    print(aLL['Total L3 input to qRNA purification'])

    # 'Input % PK extract' is used as a plain divisor here.
    # NOTE(review): confirm whether the column holds a fraction or a percentage.
    aLL['Total L3 on gel'] = (
        aLL['Total L3 input to qRNA purification']/aLL['Input % PK extract'])
    print('total L3 input to qRNA purification, input % PK extract:')
    print(list(zip(
        aLL['Total L3 input to qRNA purification'], aLL['Input % PK extract'])))

    print('Total L3 on gel:')
    print(aLL['Total L3 on gel'])

    print("end total_L3_on_gel----\n")
    return aLL

def ligation_efficiency(df):
    """Compute 5' and 3' ligation efficiencies (%) for each replicate.

    Rows are grouped by 'Protein CLIP replicate'; within a replicate the
    'Est. fmols' of each ('Purification', 'Object') pair is looked up and

      * 5' efficiency = 100 * (L3, αL5) / (L3, αL3)
      * 3' efficiency = 100 * (L5, αL3) / (L5, αL5)

    An efficiency is left out when one of its two readings is missing or the
    division raises ZeroDivisionError. A replicate with no (L3, αL5) reading
    is skipped entirely, including its 3' efficiency (preserved from the
    original code).

    Args:
        df: DataFrame with 'Protein CLIP replicate', 'Purification', 'Object'
            and 'Est. fmols' columns.

    Returns:
        DataFrame with 'Replicate', 'Ligation' ("5'" or "3'") and 'Efficiency'
        columns, one row per efficiency. The columns exist even when there
        are no rows.
    """
    by_rep = collections.defaultdict(dict)
    for row in df.to_dict('records'):
        by_rep[row['Protein CLIP replicate']][
            (row['Purification'], row['Object'])] = row['Est. fmols']

    # (label, numerator key, denominator key) for each reported efficiency.
    ratios = (
        ("5'", ('L3', 'αL5'), ('L3', 'αL3')),
        ("3'", ('L5', 'αL3'), ('L5', 'αL5')),
    )

    results = []
    for rep, amounts in by_rep.items():
        if ('L3', 'αL5') not in amounts:
            continue
        for label, numerator, denominator in ratios:
            try:
                efficiency = 100 * amounts[numerator]/amounts[denominator]
            except (KeyError, ZeroDivisionError):
                # Reading missing for this replicate, or a zero denominator:
                # leave this efficiency out instead of hiding every error.
                continue
            results.append({
                'Replicate': rep,
                'Ligation': label,
                'Efficiency': efficiency,
            })

    return pandas.DataFrame(
        results, columns=['Replicate', 'Ligation', 'Efficiency'])

In [None]:
import sameRiver
import sameRiver.adapter_fluorescence as af
import importlib
importlib.reload(af)

# Use the shared palette from sameRiver.adapter_fluorescence; this rebinds the
# obj_color name defined in earlier cells.
obj_color = af.obj_color

# NOTE(review): absolute, machine-specific path — the cell only runs where this
# folder exists. The trailing '/' plus the '/exp35.xlsx' suffix used below
# gives a doubled slash in the final path.
exp_folder = '/Users/dfporter/pma/dataAndScripts/clip/experiments/exp35 hnRNPC FBL AURKA RPS3/'

# Replicate 3 from 180525. Loaded but never referenced again in this notebook.
df_r3 = af.load_sheet(
    fname=exp_folder+'/exp35.xlsx',
#    fname='../Shift based quantification protocol and results.xlsx',
    sheet_name='180525 hnRNPC R3 qRNA')

# Replicate 3 ('qRNA hnRNPC R3' sheet). This frame IS used below: it is pooled
# with df3 and is the input to total_L3_on_gel().
df2 = af.load_sheet(
    fname=exp_folder+'/exp35.xlsx',
#    fname='../Shift based quantification protocol and results.xlsx',
    sheet_name='qRNA hnRNPC R3'
    #sheet_name='qRNA hnRNPC R1,R2'
)

# Replicates 1, 2 and 3 from 180501.
df3 = af.load_sheet(
    fname=exp_folder+'/exp35.xlsx',
    sheet_name='180501 hnRNPC R1,2,3 qRNA')
#df = pandas.concat([df2, df3])

# Visual separator in the cell output.
print('&' * 14)

def ests_Kas(df):
    """Relabel replicates and add fmol estimates from the 180522_1 parameters.

    The two [slope, intercept] pairs are read from sheet '180522_1' of
    '180522.xlsx'. 'Replicate' is rewritten as 'R' plus the first character of
    the old value, and af.est_fmols_linear_using_params is then run on `df`
    with staples=True.

    Args:
        df: DataFrame with a 'Replicate' column; it is modified in place.

    Returns:
        The same DataFrame object.
    """
    [[slope5, icpt5], [slope3, icpt3]] = af.get_parameters(
        fname='180522.xlsx', sheet_name='180522_1')
    df['Replicate'] = ['R' + str(label)[:1] for label in df['Replicate']]

    # The staples parameter is very important: ligation efficiencies move by
    # about +/- 20% depending on whether staples=None. With staples=None the
    # fluorescence values are compared to the run used for the parameters
    # (180522_1) without normalization. Otherwise values are scaled by the
    # relative fluorescence of the antisense oligos in the 50 fmol staples
    # lanes. It is not clear which method is better; scaling was chosen as it
    # makes perhaps a little more sense.
    af.est_fmols_linear_using_params(
        df, [[slope5, icpt5], [slope3, icpt3]], staples=True)

    return df

# Add estimates to both sheets, then pool them.
df2 = ests_Kas(df2)
df3 = ests_Kas(df3)
df = pandas.concat([df2, df3])

# Ligation efficiencies for the pooled data and for each sheet on its own.
eff = ligation_efficiency(df)
eff2 = ligation_efficiency(df2)
eff3 = ligation_efficiency(df3)
print("Lig efficiencies:")
print(eff)
print(eff2)
print(eff3)

aLL = total_L3_on_gel(df2)

# Just get antisense oligos in αL/L complexes. The mask is applied as a plain
# boolean array, so repeated row labels in the pooled frame do not matter.
in_complex = df['Object'].isin(['αL5', 'αL3']) & (df['Complex']!='Staple')
df = df.loc[in_complex.to_numpy()]

# PCO plots, kept as an inert string because they are not currently used.
not_using_pco = """
plt.clf()
fig, ax = plt.subplots()
sns.barplot(
    data=pco_exp, x='Purification efficiency (%)', y='Purification', #hue='Ligation',
)#palette=obj_color)
fig.savefig('./figs/hnRNPC_lig_eff.pdf')
plt.show()
plt.clf()
print('----')

fig = plt.figure()
print('====')
sns.lmplot(
    data=pco_stds, x='Signal', y='fmols'#, hue='Ligation',
)#palette=obj_color)
fig.savefig('./figs/PCO.pdf')
plt.show()
plt.clf()
"""

# Box plot of the pooled ligation efficiencies.
plt.clf()
fig, ax = plt.subplots()
sns.boxplot(data=eff, x='Efficiency', y='Ligation', ax=ax)
fig.savefig('./figs/hnRNPC_lig_eff.pdf')
plt.show()
plt.clf()

# Strip plot of estimated fmols per oligo, split by purification, standards
# excluded.
fig, ax = plt.subplots()
non_standards = df[df['STD?']!='STD']
sns.stripplot(
    data=non_standards, x='Est. fmols', y='Object', hue='Purification',
    dodge=True, palette=obj_color, ax=ax)
fig.savefig('./figs/hnRNPC_quant_RNA.pdf')
plt.show()
plt.clf()

In [None]:
# RNAse digestion trial.

def _drop_skipped_rows(table):
    """Return `table` without the rows whose 'Skip' value is 'Yes' or 'Skip'.

    The table is returned untouched when it has no 'Skip' column, or when that
    column contains no strings at all. Otherwise an independent, filtered copy
    is returned.
    """
    if 'Skip' not in table.columns:
        return table
    if not any(isinstance(flag, str) for flag in table['Skip'].tolist()):
        return table
    return table[~table['Skip'].isin(['Yes', 'Skip'])].copy()


df = _drop_skipped_rows(pandas.read_excel(
    '../tables/RNAse digestion trial.xlsx', sheet_name='Trial 2'))
print(df)
print(df['Fraction shifted'].mean())

# Raw band intensity per RNAse concentration (displayed only, not saved).
plt.clf()
fig = plt.figure()
sns.barplot(data=df, x='RNAse U/ul', y='Band intensity (1000s)')
fig.set_figwidth(1.5)
fig.set_figheight(3)
plt.show()
plt.clf()

# Normalized intensity per RNAse concentration.
sns.set_style('ticks')
fig = plt.figure()
sns.barplot(data=df, x='RNAse U/ul', y='Normalized intensity')
fig.set_figwidth(1.5)
fig.set_figheight(3)
fig.savefig('../figs/RNAse digestion trial barplot.pdf')
plt.show()
plt.clf()

# 'qWB' sheet of the exp36 workbook, filtered by the same Skip rule.
df = _drop_skipped_rows(pandas.read_excel(
    '../clip/experiments/exp36/exp36.xlsx', sheet_name='qWB'))

plt.clf()
fig = plt.figure()
df['Protein (ng)'] = df['Est. ng (BSA equivalents)']
sns.barplot(data=df, x='Object', hue='RNAse U/ul', y='Protein (ng)')
fig.set_figwidth(1.5)
fig.set_figheight(3)
fig.savefig('../figs/RNAse digestion qWB Exp36 barplot.pdf')
plt.show()
plt.clf()

# NOTE(review): this plot reads a 'Protein' column, not the 'Protein (ng)'
# column created above — presumably a separate column in the sheet; verify.
plt.clf()
fig = plt.figure()
sns.stripplot(data=df, x='Protein', y='% crosslinked', alpha=1, jitter=True)
plt.ylim(0, 25)
fig.set_figwidth(1.5)
fig.set_figheight(3)
fig.savefig('../figs/RNAse digestion qWB Exp36 stripplot.pdf')
plt.show()
plt.clf()