In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import plotly.offline as py
import plotly.graph_objs as go
from sklearn.metrics import auc
from collections import defaultdict

# Initialise plotly's offline notebook mode so plotly figures can be shown
# in the notebook output (used by py.iplot further down).
py.init_notebook_mode()

# Render matplotlib figures inline and use seaborn's "talk" plotting context.
%matplotlib inline
sns.set_context("talk")

In [None]:
import mygene
import textwrap


def get_mygene_strings(genes, scope='symbol'):
    """Return one description string per entry of ``genes``, in input order.

    Genes with a MyGene record are rendered with
    ``mygene_summary_to_string``; genes without one yield ``'not found'``.
    ``scope`` is forwarded to ``get_mygene_summaries``.
    """
    summaries = get_mygene_summaries(genes, scope=scope)
    descriptions = []
    for gene in genes:
        if gene in summaries:
            descriptions.append(mygene_summary_to_string(summaries[gene]))
        else:
            descriptions.append('not found')
    return descriptions

def get_mygene_summaries(genes, scope='symbol'):
    """Query MyGene.info for ``genes`` and return ``{query term: record}``.

    The query is restricted to species 9606 and requests only the
    ``summary`` and ``name`` fields. Records flagged with a ``'notfound'``
    key are left out; if a query term appears in several records the last
    one wins.
    """
    client = mygene.MyGeneInfo()
    records = client.querymany(genes,
                               scopes=scope,
                               species=9606,
                               fields='summary,name',
                               )
    found = {}
    for record in records:
        if 'notfound' not in record:
            found[record['query']] = record
    return found

def mygene_summary_to_string(x):
    """Format a MyGene record as ``"<name>: <summary>"``.

    Either field is replaced by ``'N/A'`` when it is missing from ``x``.
    """
    name = x.get('name', 'N/A')
    summary = x.get('summary', 'N/A')
    return "{}: {}".format(name, summary)

In [None]:
import operator
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error


def plot_scores(df, x_col, y_col, score, jacks=False, min_sd_diff=3, plot=True,
                sd_lines=None, x_fdr_threshold=None, y_fdr_threshold=None,
                x_fdr_label='FDR threshold', y_fdr_label='FDR threshold',
                other_genes=None, other_genes_cmap=None,
                other_genes_color=(0.9254901960784314, 0.8823529411764706, 0.2),
                get_gene_descriptions=False):
    """Regress one sample's gene scores on another's and flag residual outliers.

    ``df`` is pivoted to one row per ``Gene`` and one column per ``Sample``
    for the ``score`` column, then ``y_col`` is fitted against ``x_col`` with
    a ``LinearRegression``. The residual of a gene is ``predicted - observed``.
    Genes whose residual is more than ``min_sd_diff`` residual standard
    deviations away from zero are reported as hits and highlighted in a
    plotly scatter plot.

    Hit direction:
      * default: residual < -k*SD -> 'Synthetic Lethal',
                 residual > +k*SD -> 'Suppresor';
      * ``jacks=True``: both comparisons are flipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``Gene`` and ``Sample`` columns and a column
        named ``score``.
    x_col, y_col : str
        Sample names plotted on the x and y axes.
    score : str
        Name of the score column. ``'JACKS'`` forces ``jacks=True``;
        ``'BF'`` is shown as ``'BAGEL'`` in the figure title.
    jacks : bool
        Flip the hit directions (see above).
    min_sd_diff : number
        Residual cutoff in units of the residual standard deviation.
    plot : bool
        When true the figure is displayed with ``py.iplot``.
    sd_lines : list of numbers, optional
        SD multiples for which guide lines are drawn; defaults to
        ``[min_sd_diff]``. Only the ``min_sd_diff`` lines start visible.
    x_fdr_threshold, y_fdr_threshold : number, optional
        When given, suppressor hits are additionally filtered on their
        ``x_col`` score and synthetic-lethal hits on their ``y_col`` score
        (``> threshold`` by default, ``< threshold`` with ``jacks``), and the
        threshold is drawn as a line. ``None`` disables the filter.
    x_fdr_label, y_fdr_label : str
        Legend label suffix for the threshold lines.
    other_genes, other_genes_cmap, other_genes_color :
        Accepted for interface compatibility; not used by this function.
    get_gene_descriptions : bool
        When true, hits get a ``Description`` column from
        ``get_mygene_strings`` which is also used as hover text.

    Returns
    -------
    tuple
        ``(hits_df, fig, res_stats, pivot_df)``: the hit table (with
        ``Residual`` and ``Hit_Type`` columns, index reset), the plotly
        figure, a dict of fit statistics, and the pivoted score table with an
        added ``residuals`` column.
    """
    if score == 'JACKS':
        jacks = True
    if not sd_lines:
        sd_lines = [min_sd_diff]
    # Explicit None checks so that a legitimate threshold of 0 is not ignored.
    use_x_fdr = x_fdr_threshold is not None
    use_y_fdr = y_fdr_threshold is not None

    pivot_df = df.pivot(index="Gene", columns="Sample")[score].reset_index()

    x = np.array(pivot_df[x_col]).reshape((-1, 1))
    y = np.array(pivot_df[y_col])

    regression_model = LinearRegression()
    regression_model.fit(x, y)
    y_predicted = regression_model.predict(x)

    # model evaluation
    res_stats = dict()
    res_stats['r2'] = r2_score(y, y_predicted)
    res_stats['coefficient'] = regression_model.coef_[0]
    # mean_squared_error returns the MSE; take the root so the value matches
    # the 'RMSE' key and the "Root mean squared error" message below.
    res_stats['RMSE'] = np.sqrt(mean_squared_error(y, y_predicted))
    res_stats['Mean Absolute Error'] = mean_absolute_error(y, y_predicted)
    res_stats['Median Absolute Error'] = median_absolute_error(y, y_predicted)
    print("Comparing {} vs {} with residual cutoff at {}xSD".format(x_col, y_col, min_sd_diff))
    print('The coefficient is {}'.format(res_stats['coefficient'] ))
    print('The intercept is {}'.format(regression_model.intercept_))
    print('Root mean squared error of the model is {}.'.format(res_stats['RMSE']))
    print('Mean absolute error of the model is {}.'.format(res_stats['Mean Absolute Error']))
    print('Median absolute error of the model is {}.'.format(res_stats['Median Absolute Error']))
    print('R-squared score is {}.'.format(res_stats['r2']))

    # residuals (predicted - observed) and their spread
    res = y_predicted - y
    pivot_df['residuals'] = res
    std = res.std()
    res_stats['STDEV of the residuals'] = std

    # Comparators / offsets for the two hit classes; flipped for JACKS.
    sp_cmp = operator.gt  # suppressor comparator
    sl_cmp = operator.lt  # synthetic lethal comparator
    sp_op = operator.add  # suppressor residual offset
    sl_op = operator.sub  # synthetic lethal residual offset
    if jacks:
        sp_cmp = operator.lt
        sl_cmp = operator.gt
        sl_op = operator.add
        sp_op = operator.sub
    sl_genes = set(pivot_df[sl_cmp(res, sl_op(0, min_sd_diff*std))]['Gene'])
    sp_genes = set(pivot_df[sp_cmp(res, sp_op(0, min_sd_diff*std))]['Gene'])
    if use_y_fdr:
        sl_genes = set(pivot_df[(pivot_df.Gene.isin(sl_genes)) &
                                (sp_cmp(pivot_df[y_col], y_fdr_threshold))]['Gene'])
    if use_x_fdr:
        sp_genes = set(pivot_df[(pivot_df.Gene.isin(sp_genes)) &
                                (sp_cmp(pivot_df[x_col], x_fdr_threshold))]['Gene'])
    print("{} synthetic lethal genes".format(len(sl_genes)))
    print("{} supressor genes".format(len(sp_genes)))

    def _build_hits(genes, hit_type):
        """Return (hit table, hover text, row mask) for one hit class."""
        mask = pivot_df.Gene.isin(genes)
        hit_df = pivot_df[mask].copy()
        hit_df['Residual'] = res[mask.values]
        hit_df['Hit_Type'] = hit_type
        if get_gene_descriptions:
            hit_df['Description'] = get_mygene_strings(hit_df.Gene)
            # Hover text: gene name followed by the description wrapped at 75 chars.
            text = hit_df.apply(
                lambda row: "<br>".join([row['Gene'],
                                         '<br>'.join(textwrap.wrap(row['Description'], 75))]),
                axis=1)
        else:
            text = hit_df['Gene']
        return hit_df, text, mask

    sl_df, sl_text, sl_mask = _build_hits(sl_genes, 'Synthetic Lethal')
    sp_df, sp_text, sp_mask = _build_hits(sp_genes, 'Suppresor')
    # DataFrame.append was removed in pandas 2.0; concat keeps the same
    # row and column order (synthetic lethal rows first).
    return_df = pd.concat([sl_df, sp_df]).reset_index()

    pal = sns.color_palette("Set1")
    sl_trace = go.Scatter( #synthetic lethal genes
        x=sl_df[x_col],
        y=sl_df[y_col],
        mode = 'markers',
        text=sl_text,
        hoverinfo='x+y+text',
        name='Synthetic Lethal',
        marker = dict(
            line = dict(width = 1),
            colorscale='Reds',
            reversescale=False,
            showscale=False,
            opacity=0.8,
            size=10,
            color=np.abs(res[sl_mask.values]),
        )
    )
    sp_trace = go.Scatter( #suppresor genes
        x=sp_df[x_col],
        y=sp_df[y_col],
        text=sp_text,
        mode = 'markers',
        name='Suppresors',
        hoverinfo='x+y+text',
        marker = dict(
            line = dict(width = 1),
            colorscale='Blues',
            reversescale=False,
            showscale=False,
            opacity=0.8,
            size=10,
            color=np.abs(res[sp_mask.values]),
        )
    )

    other_mask = ~pivot_df.Gene.isin(sl_genes.union(sp_genes))
    other_trace = go.Scattergl( #all other genes
        x=pivot_df[other_mask][x_col],
        y=pivot_df[other_mask][y_col],
        mode = 'markers',
        name='non-hits',
        hoverinfo='x+y+text',
        text=pivot_df[other_mask]['Gene'],
        marker = dict(
            showscale=False,
            color = np.abs(res[other_mask.values]),
            colorscale='Greys',
            reversescale=True,
            line = dict(width = 1),
            opacity=0.5,
        )
    )

    # Every 10th x value, sorted, is enough to draw the straight guide lines.
    sdx = np.sort(x[::10].flatten())
    # Predict on the sorted x values instead of sorting the predictions
    # independently, which would pair x and y wrongly for a negative slope.
    sdy = regression_model.predict(sdx.reshape((-1, 1)))
    reg_trace = go.Scatter( #regression line
        x=sdx,
        y=sdy,
        visible='legendonly',
        name='Regression',
        line = dict(
            color = 'rgba(0.1, 0.1, 0.1, 0.75)',
            width = 4,
            dash = 'dot'
        )
    )
    sd_traces = []
    for i in sd_lines:
        # Only the line at the actual hit cutoff is shown by default.
        if i == min_sd_diff:
            v = True
        else:
            v = 'legendonly'
        if jacks:
            sl_sd_name = '-{}SD'.format(i)
        else:
            sl_sd_name = '+{}SD'.format(i)
        # residual = predicted - observed, so the synthetic-lethal boundary in
        # score space is the regression line shifted with sp_op (and vice versa).
        sl_sd_trace = go.Scatter(
            x=sdx,
            y=sp_op(sdy, i*std),
            name=sl_sd_name,
            visible=v,
            line = dict(
                color = "rgba{}".format(pal[0] + (0.75,)),
                width = 3,
                dash = 'dot',
            )
        )
        sd_traces.append(sl_sd_trace)
        if jacks:
            sp_sd_name = '+{}SD'.format(i)
        else:
            sp_sd_name = '-{}SD'.format(i)
        sp_sd_trace = go.Scatter(
            x=sdx,
            y=sl_op(sdy, i*std),
            name=sp_sd_name,
            visible=v,
            line = dict(
                color = "rgba{}".format(pal[1] + (0.75,)),
                width = 3,
                dash = 'dot',
            )
        )
        sd_traces.append(sp_sd_trace)

    fdr_traces = []
    if use_x_fdr:
        ymin = pivot_df[y_col].min()
        ymax = pivot_df[y_col].max()
        f_trace = go.Scatter(
            x=(x_fdr_threshold, x_fdr_threshold),
            y=(ymin, ymax),
            visible=True,
            name="{} {}".format(x_col, x_fdr_label),
            line = dict(
                color = 'rgba(0.3, 0.3, 0.3, 0.75)',
                width = 3,
                dash = 'dot'
            )
        )
        fdr_traces.append(f_trace)
    if use_y_fdr:
        xmin = pivot_df[x_col].min()
        xmax = pivot_df[x_col].max()
        f_trace = go.Scatter(
            y=(y_fdr_threshold, y_fdr_threshold),
            x=(xmin, xmax),
            visible=True,
            name="{} {}".format(y_col, y_fdr_label),
            line = dict(
                color = 'rgba(0.3, 0.3, 0.3, 0.75)',
                width = 3,
                dash = 'dot'
            )
        )
        fdr_traces.append(f_trace)

    if score == 'BF':
        algorithm = 'BAGEL'
    else:
        algorithm = score
    fig = go.Figure(data=[other_trace, sl_trace, sp_trace, reg_trace,] + sd_traces + fdr_traces,
                    layout=go.Layout(
                        hovermode='closest',
                        xaxis = dict(title ='{} {}'.format(x_col, score)),
                        yaxis = dict(title ='{} {}'.format(y_col, score)),
                        title="{} vs {} {}".format(x_col, y_col, algorithm),
                        titlefont=dict(size=16),
                    ))
    if plot:
        py.iplot(fig,)
    return return_df, fig, res_stats, pivot_df

In [None]:
# Tab-separated JACKS scores; every column other than 'Gene' is taken to be a sample.
jacks_scores = pd.read_csv(snakemake.input[0], sep='\t')
samp_order = list(jacks_scores.columns[jacks_scores.columns != 'Gene'])
jacks_scores.head()

In [None]:
def _read_gene_set(path):
    """Return the set of first whitespace-delimited tokens, one per line of ``path``.

    Blank or whitespace-only lines are skipped instead of raising an
    IndexError on ``line.split()[0]``.
    """
    genes = set()
    with open(path, 'rt') as handle:
        for line in handle:
            fields = line.split()
            if fields:
                genes.add(fields[0])
    return genes


# Gene lists supplied through the Snakemake params 'neg' and 'ess'.
neg_genes = _read_gene_set(snakemake.params['neg'])
ess_genes = _read_gene_set(snakemake.params['ess'])

In [None]:
# For each sample, the fraction of the negative gene list scoring at or below
# a range of JACKS cutoffs.
is_neg_gene = jacks_scores['Gene'].isin(neg_genes)
n_neg_genes = len(neg_genes)
fdr_records = dict(cutoff=[], fdr=[], Sample=[])
for sample in samp_order:
    sample_scores = jacks_scores[sample]
    for cutoff in np.linspace(-3, 2.0):
        fp = len(jacks_scores[(sample_scores <= cutoff) & is_neg_gene])
        fdr_records['cutoff'].append(cutoff)
        fdr_records['Sample'].append(sample)
        fdr_records['fdr'].append(fp/n_neg_genes)
fdr_df = pd.DataFrame.from_dict(fdr_records)

fig, ax = plt.subplots(figsize=(6, 5))
sns.lineplot(data=fdr_df, ax=ax, x='cutoff', y='fdr', hue='Sample',)
# Dashed reference lines at 0.05 and 0.005.
for level, line_color in ((0.05, 'k'), (0.005, 'grey')):
    ax.plot((-4, 1), (level, level), '--', color=line_color)
ax.set_ylabel("FDR")
ax.set_xlabel("JACKS score")

In [None]:
# Precision-recall curve per sample: essential-list genes count as true
# positives, negative-list genes as false positives, at 2000 cutoffs spanning
# the sample's score range.
auc_records = defaultdict(list)
pal = sns.color_palette("Set2", len(samp_order))
is_neg_gene = jacks_scores['Gene'].isin(neg_genes)
is_ess_gene = jacks_scores['Gene'].isin(ess_genes)
for sample in samp_order:
    sample_scores = jacks_scores[sample]
    precisions = []
    recalls = []
    for cutoff in np.linspace(sample_scores.min(), sample_scores.max(), 2000):
        at_or_below = sample_scores <= cutoff
        fp = len(jacks_scores[at_or_below & is_neg_gene])
        tp = len(jacks_scores[at_or_below & is_ess_gene])
        # With nothing called yet, precision is defined as 1.0.
        precision = tp/(tp + fp) if (tp + fp) != 0 else 1.0
        precisions.append(precision)
        recalls.append(tp/len(ess_genes))
    pr_df = pd.DataFrame.from_dict(dict(Precision=precisions,
                                        Recall=recalls,
                                        Sample=[sample] * len(recalls)))
    area_under_curve = auc(pr_df.Recall, pr_df.Precision)
    auc_records['Sample'].append(sample)
    auc_records['auc'].append(area_under_curve)

    plt.text(0.55, 0.85,
             "AUC = {:.3g}".format(area_under_curve),
             fontsize=15)
    clr = pal.pop(0)
    plt.plot(pr_df.Recall, pr_df.Precision, color=clr)
    plt.ylim(0, 1.1)
    plt.xlabel('Recall')
    plt.ylabel('Precision (1-FDR)')
    plt.title('{}: Precision-Recall'.format(sample))
    plt.show()
auc_df = pd.DataFrame.from_dict(auc_records)
auc_df.head()

In [None]:
# Score density of the negative and essential gene lists, one figure per sample.
sns.set_style("darkgrid")
pal = sns.color_palette("colorblind")
gene_groups = (
    ('Neg', jacks_scores.Gene.isin(neg_genes), pal[0]),
    ('Essential', jacks_scores.Gene.isin(ess_genes), pal[1]),
)
for samp in samp_order:
    for group_label, group_rows, group_color in gene_groups:
        sns.kdeplot(jacks_scores[group_rows][samp],
                    color=group_color,
                    shade=True, label=group_label,)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.title("{}".format(samp))
    plt.xlabel("JACKS score")
    plt.xlim((-7.5, 5))
    plt.show()

In [None]:
# Same density plots as the previous cell, with a third curve for every gene
# that is in neither the negative nor the essential list.
sns.set_style("darkgrid")
pal = sns.color_palette("colorblind")
gene_groups = (
    ('Neg', jacks_scores.Gene.isin(neg_genes), pal[0]),
    ('Essential', jacks_scores.Gene.isin(ess_genes), pal[1]),
    ('Other', ~jacks_scores.Gene.isin(ess_genes.union(neg_genes)), pal[2]),
)
for samp in samp_order:
    for group_label, group_rows, group_color in gene_groups:
        sns.kdeplot(jacks_scores[group_rows][samp],
                    color=group_color,
                    shade=True, label=group_label,)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.title("{}".format(samp))
    plt.xlabel("JACKS score")
    plt.xlim((-7.5, 5))
    plt.show()

In [None]:
# Gene names that are filtered out of the combined BF/JACKS table in a later
# cell (presumably reporter/control constructs rather than real targets —
# TODO confirm).
control_genes = ['EGFP', 'LacZ', 'luciferase']

In [None]:
import glob
import os  # used below for the sample name; was never imported in this notebook

# Read every BAGEL .pr table and tag its rows with the sample name taken from
# the file name. Frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and grows quadratically in a loop.
pr_frames = []
# sorted() makes the row order independent of filesystem listing order.
for prfile in sorted(glob.glob("results/bagel/*.pr")):
    pr_df = pd.read_csv(prfile, sep='\t')
    # splitext strips only the trailing extension, whereas
    # str.replace(".pr", "") would also eat a ".pr" inside the sample name.
    smp = os.path.splitext(os.path.basename(prfile))[0]
    pr_df['Sample'] = smp
    pr_frames.append(pr_df)
# Keep the previous "empty frame when nothing matched" behaviour.
bootstrap_pr_df = pd.concat(pr_frames) if pr_frames else pd.DataFrame()

# One BF histogram per sample, stacked in rows.
g = sns.FacetGrid(bootstrap_pr_df, row="Sample", height=5, aspect=2, hue="Sample",
                  row_order=samp_order, hue_order=samp_order,
                  palette=sns.color_palette("Set2"), sharey=True)
g = g.map(plt.hist, "BF", bins=50, range=(-100, 100),)
for ax in g.axes.flatten():
    ax.tick_params(labelbottom=True, labelleft=True)
    ax.set_ylabel("No. Genes")
g.fig.tight_layout()
g.fig.tight_layout()

In [None]:
# Reshape the JACKS table to long format (Gene, Sample, JACKS) and attach the
# BAGEL columns for the same gene/sample pair; control genes are dropped.
merge_keys = ['Gene', 'Sample']
jacks_scores_melt = jacks_scores.melt(
    id_vars='Gene',
    value_vars=jacks_scores.columns[1:],
    var_name='Sample',
    value_name='JACKS')
bagel_by_gene_sample = bootstrap_pr_df.set_index(merge_keys)
bf_plus_jacks = jacks_scores_melt.join(bagel_by_gene_sample, on=merge_keys)
is_control_gene = bf_plus_jacks.Gene.isin(control_genes)
bf_plus_jacks = bf_plus_jacks[~is_control_gene]
bf_plus_jacks.head()

In [None]:
# Fraction of the negative gene list called at each cutoff, per sample and
# algorithm (BF from the BAGEL tables, JACKS from the JACKS table).
fdr_records = dict(cutoff=[], fdr=[], Sample=[], algorithm=[])
n_neg_genes = len(neg_genes)

for sample in bootstrap_pr_df.Sample.unique():
    tmp_df = bootstrap_pr_df[bootstrap_pr_df.Sample == sample]
    bf_is_neg = tmp_df['Gene'].isin(neg_genes)
    for cutoff in np.linspace(-50, 10, 1000):
        fp = len(tmp_df[(tmp_df.BF <= cutoff) & bf_is_neg])
        fdr_records['cutoff'].append(cutoff)
        fdr_records['Sample'].append(sample)
        fdr_records['fdr'].append(fp/n_neg_genes)
        fdr_records['algorithm'].append('BF')

jacks_is_neg = jacks_scores['Gene'].isin(neg_genes)
for sample in jacks_scores.columns[1:]:
    sample_scores = jacks_scores[sample]
    for cutoff in np.linspace(sample_scores.min(), 1.0, 1000):
        fp = len(jacks_scores[(sample_scores <= cutoff) & jacks_is_neg])
        fdr_records['cutoff'].append(cutoff)
        fdr_records['Sample'].append(sample)
        fdr_records['fdr'].append(fp/n_neg_genes)
        fdr_records['algorithm'].append('JACKS')
fdr_df = pd.DataFrame.from_dict(fdr_records)

# neg_cutoffs[fdr level][sample][algorithm] = largest cutoff whose rate is
# still strictly below the level.
neg_cutoffs = dict()
for fdr in [0.05, 0.005]:
    cutoffs_by_sample = defaultdict(dict)
    for alg in ['JACKS', 'BF']:
        for samp in fdr_df.Sample.unique():
            below_level = fdr_df[(fdr_df.Sample == samp) &
                                 (fdr_df.fdr < fdr) &
                                 (fdr_df.algorithm == alg)]
            cutoffs_by_sample[samp][alg] = below_level['cutoff'].max()
    neg_cutoffs[fdr] = cutoffs_by_sample
neg_cutoffs

In [None]:
# Build (control sample, test sample) name pairs: for every timepoint after
# the earliest one, pair the control genotype with each other genotype
# present at that timepoint. Sample names have the form "<genotype>-<timepoint>".
control_gt = snakemake.config['control_genotype']
sample_df = pd.read_csv(snakemake.config['samples'], sep='\t')
t0 = sample_df.timepoint.min()
comps = []
for tp in sample_df.timepoint.unique():
    if tp != t0:
        genotypes_at_tp = sample_df[sample_df.timepoint == tp].genotype.unique()
        for gt in genotypes_at_tp:
            if gt != control_gt:
                control_sample = "{}-{}".format(control_gt, tp)
                test_sample = "{}-{}".format(gt, tp)
                comps.append((control_sample, test_sample))
comps

In [None]:
# Interactive BF regression plot for every control/test pair, using the
# test sample's 0.005 BF cutoff as the y threshold.
for cmp in comps:
    control_sample, test_sample = cmp
    bf_threshold = neg_cutoffs[0.005][test_sample]['BF']
    hits, fig, stats, res = plot_scores(
        df=bf_plus_jacks,
        x_col=control_sample,
        y_col=test_sample,
        score='BF',
        min_sd_diff=3,
        sd_lines=[3, 4, 5],
        y_fdr_threshold=bf_threshold,
        get_gene_descriptions=True)

In [None]:
# Interactive JACKS regression plot for every control/test pair, using the
# test sample's 0.005 JACKS cutoff as the y threshold.
for cmp in comps:
    control_sample, test_sample = cmp
    jacks_threshold = neg_cutoffs[0.005][test_sample]['JACKS']
    hits, fig, stats, res = plot_scores(
        df=bf_plus_jacks,
        x_col=control_sample,
        y_col=test_sample,
        score='JACKS',
        min_sd_diff=3,
        sd_lines=[3, 4, 5],
        y_fdr_threshold=jacks_threshold,
        get_gene_descriptions=True)

In [None]:
import os  # needed for makedirs below

scores = ['BF' ,'JACKS']

# Per-algorithm hit-calling settings: residual cutoff (in SDs) and the
# negative-gene rate whose cutoff is used as the y threshold. Both algorithms
# currently share the same values.
hit_settings = {
    'BF': dict(sd_diff=3, cutoff=0.005),
    'JACKS': dict(sd_diff=3, cutoff=0.005),
}

# Flatten neg_cutoffs into a table: one row per (level, sample, algorithm).
cutoff_df = defaultdict(list)
for cutoff, d in neg_cutoffs.items():
    for sample, alg2threshold in d.items():
        for alg, threshold in alg2threshold.items():
            cutoff_df['Sample'].append(sample)
            cutoff_df['Algorithm'].append(alg)
            cutoff_df['Cutoff'].append(cutoff)
            cutoff_df['Value'].append(threshold)
cutoff_df = pd.DataFrame.from_dict(cutoff_df)

all_stats = defaultdict(list)
all_hits = defaultdict(dict)
hit_genes = dict()

# Make sure the output directories exist before anything is written to them.
os.makedirs('results/regression_analyses', exist_ok=True)
os.makedirs('results/plots', exist_ok=True)

# NOTE(review): Excel limits sheet names to 31 characters; the
# "..._intersected_hits" names below can exceed that depending on the sample
# names. Names are left unchanged here — confirm against the Excel engine in use.
with pd.ExcelWriter('results/regression_analyses/regression_hits.xlsx') as writer:
    for cmp in comps:
        for sc in scores:
            sd_diff = hit_settings[sc]['sd_diff']
            cutoff = hit_settings[sc]['cutoff']

            # get_gene_descriptions=True already adds the 'Description'
            # column to `hits`, so no second mygene query is made here.
            hits, fig, stats, res = plot_scores(
                df=bf_plus_jacks,
                x_col=cmp[0],
                y_col=cmp[1],
                score=sc,
                min_sd_diff=sd_diff,
                sd_lines=[3,4,5],
                plot=False,
                y_fdr_threshold=neg_cutoffs[cutoff][cmp[1]][sc],
                y_fdr_label = 'FDR = {}'.format(cutoff),
                get_gene_descriptions=True)
            hit_genes[cmp + (sc,)] = set(hits.Gene)
            all_stats['Comparison'].append("{}_{}_{}".format(cmp[0], cmp[1], sc))
            for k,v in stats.items():
                all_stats[k].append(v)
            hits.to_excel(writer,
                          sheet_name="{}_{}_{}_hits".format(cmp[0], cmp[1], sc),
                          index=False)
            res.to_csv("results/regression_analyses/{}_{}_{}_residuals.csv".format(cmp[0],
                                                               cmp[1],
                                                               sc),
                       index=False)
            py.plot(fig,
                    filename="results/plots/{}_{}_{}_hits.html".format(cmp[0], cmp[1], sc),
                    auto_open=False)
            all_hits[cmp][sc] = hits

    # Hits called by both algorithms for the same comparison, side by side.
    for cmp in comps:
        shared_genes = hit_genes[cmp + (scores[0],)].intersection(hit_genes[cmp + (scores[1],)])
        gt, tp = cmp[1].rsplit("-", 1)
        h1 = all_hits[cmp][scores[0]]
        h2 = all_hits[cmp][scores[1]]
        print("{} genes intersect for {}".format(len(shared_genes), tp))
        # First table: drop the first and last columns, then suffix every
        # remaining non-Gene column with the first algorithm's name.
        intr_df1 = h1[h1.Gene.isin(shared_genes)][h1.columns[1:-1]]
        rdict = dict((x, "{}_{}".format(x, scores[0])) for x in h1.columns[2:-1])
        intr_df1 = intr_df1.rename(index=str, columns=rdict)
        # Second table: same renaming with the second algorithm's name, built
        # from its own columns.
        intr_df2 = h2[h2.Gene.isin(shared_genes)]
        rdict = dict((x, "{}_{}".format(x, scores[1])) for x in h2.columns[2:-1])
        intr_df2 = intr_df2.rename(index=str, columns=rdict)
        intr_df1 = intr_df1.join(intr_df2.set_index('Gene'), on='Gene', )
        intr_df1.to_excel(writer,
                          sheet_name="{}_{}_{}_{}_intersected_hits".format(gt, tp, scores[0], scores[1]),
                          index=False)
    all_stats = pd.DataFrame(all_stats)
    all_stats.to_excel(writer, sheet_name='Summary Stats', index=False)
    cutoff_df.to_excel(writer, sheet_name='FDR thresholds', index=False)