## Preparing mass spec data for the open-cell website demo

In [None]:
import os
import re
import sys
import glob
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
import matplotlib.colors as mpl_colors

In [None]:
# first ('original') demo dataset: a tab-separated text file
d1 = pd.read_csv('../data/mass-spec/2019-08-02_volcano_plots_all.txt', sep='\t')

# second demo dataset: a regular comma-separated file (read_csv's default separator)
filepath = '../data/mass-spec/MS_052319.csv'
d2 = pd.read_csv(filepath)

In [None]:
# HACK: 'ATP2B1' and 'POLR1C' appear in both datasets, so their p-value and
# enrichment columns are removed from the first dataset by hand
d1 = d1.drop(columns=[
    'LFQi 1_ATP2B1: -Log(P-value)',
    'LFQi 1_ATP2B1: Difference',
    'LFQi 1_POLR1C: -Log(P-value)',
    'LFQi 1_POLR1C: Difference',
])

In [None]:
def cleanup(d):
    '''
    Normalize the column names of a dataframe in place

    Every column name is lowercased and its spaces are replaced by underscores,
    so that e.g. 'Gene names' becomes 'gene_names' and
    'LFQi 1_X: Difference' becomes 'lfqi_1_x:_difference'.

    d : the dataframe to modify (all column names must be strings)

    Returns None; the dataframe is modified in place.
    '''
    # note: no explicit mapping for 'Gene names'/'Protein names' is needed,
    # because the generic normalization already yields 'gene_names'/'protein_names'
    d.rename(columns=lambda col: col.lower().replace(' ', '_'), inplace=True)
    
# normalize the column names of both datasets (in place)
for frame in (d1, d2):
    cleanup(frame)

In [None]:
# outer-join the two datasets on the gene_names column, keeping rows found in only one of them
# (a bit sketchy: two rows only match if their full gene_names strings are identical)
d = d1.merge(d2, on='gene_names', how='outer')

In [None]:
# sanity check: (rows, columns) of each dataset that went into the merge
d1.shape, d2.shape

In [None]:
# sanity check: total number of input rows vs. number of rows after the outer merge
# (the merged count is smaller whenever rows of d1 and d2 were matched on gene_names)
d1.shape[0] + d2.shape[0], d.shape[0]

In [None]:
# manually merge the protein_names column: the merge above produced one copy
# per dataset ('protein_names_x' from d1 and 'protein_names_y' from d2), so take
# the d1 value and fall back to the d2 value where the d1 value is missing
d['protein_names'] = d['protein_names_x'].fillna(d['protein_names_y'])
d = d.drop(['protein_names_x', 'protein_names_y'], axis=1)

In [None]:
# minimal per-row metadata ('protein_names' contains longer descriptive names)
meta_columns = ['gene_names', 'protein_names']
metadata = d[meta_columns].copy()

# gene_name is the first entry of the semicolon-separated gene_names list of each row;
# it is inserted as the first column, ahead of gene_names and protein_names
# NOTE(review): a missing gene_names value ends up as the literal string 'nan' - confirm this is intended
first_names = metadata['gene_names'].map(lambda names: str(names).split(';')[0])
metadata.insert(0, 'gene_name', first_names)

In [None]:
# column name patterns (common between the two MS datasets)
# these are written against the normalized column names produced by cleanup()
# (lowercase, spaces replaced by underscores), e.g. 'lfqi_1_atp2b1:_-log(p-value)';
# the single capture group is the run of word characters between the last
# underscore of the prefix and the colon, which is used below as the target name
pvalue_pattern = r'^lfq.+_(\w+):_-log\(p-value\)$'
enrichment_pattern = r'^lfq.+_(\w+):_difference$'

In [None]:
# split the merged table into one dataframe of p-values and one of enrichments,
# each with its columns renamed to the target name captured by the pattern

# evaluate each pattern once per column; a non-empty result means the column matched
pvalue_matches = [(col, re.findall(pvalue_pattern, col)) for col in d.columns]
enrichment_matches = [(col, re.findall(enrichment_pattern, col)) for col in d.columns]

pvalue_columns = [col for col, found in pvalue_matches if found]
enrichment_columns = [col for col, found in enrichment_matches if found]

pvalues = d[pvalue_columns].rename(
    columns={col: found[0] for col, found in pvalue_matches if found})

enrichments = d[enrichment_columns].rename(
    columns={col: found[0] for col, found in enrichment_matches if found})

In [None]:
# display the target names extracted from the enrichment column names
enrichments.columns

## Calculating FDR curve parameters
This approach is an attempt to exactly implement the algorithm described in Hein 2015.

In [None]:
def calc_fdr(x0, c, all_en=None, all_pval=None):
    '''
    Count the hits above the FDR curve for a given x0 and c

    The FDR curve is a function only of enrichment and is given by
    c / (abs(enrichment) - x0)

    A hit is above the curve when abs(enrichment) > x0 and its p-value
    (as stored in the p-value table) is greater than the curve at that enrichment.

    The FDR is the number of negatively-enriched hits above the curve relative
    to the total number of hits above the curve, i.e. num_neg / (num_neg + num_pos);
    computing that ratio is left to the caller.

    x0 : offset of the curve along the enrichment axis
    c : scale of the curve
    all_en, all_pval : optional array-likes of identical shape with the enrichments
        and p-values to use; when omitted, the values of the module-level
        `enrichments` and `pvalues` dataframes are used

    Returns the tuple (num_neg, num_pos)
    '''

    if all_en is None:
        all_en = enrichments.values
    if all_pval is None:
        all_pval = pvalues.values

    all_en = np.asarray(all_en)
    all_pval = np.asarray(all_pval)

    # comparisons against NaN are False, so missing values are never counted
    is_neg = all_en < (-x0)
    is_pos = all_en > x0

    # the same curve is applied to both sides, mirrored around zero enrichment
    num_neg = (all_pval[is_neg] > (c / (-all_en[is_neg] - x0))).sum()
    num_pos = (all_pval[is_pos] > (c / (all_en[is_pos] - x0))).sum()

    return num_neg, num_pos

In [None]:
# naive grid search over x0 and c: for every (x0, c) pair, record the FDR
# and the number of positively-enriched hits above the FDR curve
x0s, cs = np.meshgrid(np.arange(1.5, 5, .03), np.arange(1, 6, .03))

# the FDR stays NaN wherever no hit at all lies above the curve (0 / 0 is undefined)
fdrs = np.full(x0s.shape, np.nan)
nums_true = np.zeros_like(x0s)
for i, j in np.ndindex(*x0s.shape):
    num_neg, num_pos = calc_fdr(x0s[i, j], cs[i, j])
    num_hits = num_neg + num_pos
    if num_hits > 0:
        fdrs[i, j] = num_neg / num_hits
    nums_true[i, j] = num_pos

In [None]:
# select the grid points whose FDR lies strictly within +/- wiggle
# of the target threshold (usually 0.01 or 0.05)
thresh = 0.05
wiggle = .001

lower_bound, upper_bound = thresh - wiggle, thresh + wiggle
mask = (fdrs > lower_bound) & (fdrs < upper_bound)
plt.imshow(mask)

In [None]:
# among the grid points selected by the mask, pick the (x0, c) pair
# with the greatest number of positive hits
candidate_hits = nums_true[mask]
ind = candidate_hits.argmax()
x0s[mask][ind], cs[mask][ind]

In [None]:
# plot both branches of the FDR curve on top of all (enrichment, p-value) points
x0, c = 1.62, 4.25
ax = plt.gca()

# right-hand branch, for enrichments above x0
x = np.arange(x0, 15, .1)
ax.plot(x, c/(x - x0), color='gray')

# left-hand branch, the mirror image for enrichments below -x0
x = np.arange(-15, -x0, .1)
ax.plot(x, c/(-x - x0), color='gray')

plt.scatter(enrichments, pvalues, alpha=.1)
ax.set_ylim([0, 30])

In [None]:
# report the hit counts on each side of the curve and the resulting FDR
num_neg, num_pos = calc_fdr(x0, c)
fdr = num_neg / (num_neg + num_pos)
print('%d, %d, %0.4f' % (num_neg, num_pos, fdr))

## Construct JSON data for demo

In [None]:
# enrichments and p-values for all targets: one entry per target, each holding
# one hit per metadata row with the values formatted as two-decimal strings
# (rows are paired by position, so metadata, pvalues and enrichments must share
# the same row order; gene_id is the metadata index label of the row)
gene_ids = metadata.index.tolist()

ms_data = []
for target_name in pvalues.columns:

    target_pvalues = pvalues[target_name].values
    target_enrichments = enrichments[target_name].values

    hits = [
        {
            'gene_id': gene_id,
            'pvalue': '%0.2f' % pvalue,
            'enrichment': '%0.2f' % enrichment,
        }
        for gene_id, pvalue, enrichment in zip(gene_ids, target_pvalues, target_enrichments)
    ]

    ms_data.append({'target_name': target_name.upper(), 'hits': hits})

In [None]:
# display the metadata rows whose gene_name equals one of the (upper-cased) target names,
# sorted by gene_name
metadata.loc[metadata.gene_name.isin([row['target_name'] for row in ms_data])].sort_values(by='gene_name')

In [None]:
# per-target hits (p-values and enrichments) for the demo
with open('../src/demo/data/20190816_ms-data.json', 'w') as ms_data_file:
    ms_data_file.write(json.dumps(ms_data))

# per-row metadata, keyed by the dataframe index
with open('../src/demo/data/20190816_ms-metadata.json', 'w') as metadata_file:
    metadata.to_json(metadata_file, orient='index')