In [None]:
import copy
import glob
import json
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
import matplotlib.colors as mpl_colors

from scipy import interpolate
import FlowCytometryTools as fct

In [None]:
# global matplotlib font settings, applied to every figure created below
font = {
    # 'normal' is not a font family: matplotlib cannot find it, emits a findfont
    # warning and falls back to its default font, so request the generic
    # 'sans-serif' family explicitly instead
    'family': 'sans-serif',
    'weight': 'normal',
    'size': 16,
}

mpl.rc('font', **font)

In [None]:
%load_ext autoreload
%autoreload 2
%aimport -fct -np -pd -plt

In [None]:
# add the parent directory to the import path so that `pipeline_process` can be imported
sys.path.append('../')
from pipeline_process import common
from pipeline_process.facs import constants, manager, processor, unmixer

# shorthand for the FITC constant; used below to index the loaded sample data
FITC = constants.FITC
# NOTE(review): hardcoded, user-specific absolute path — must be changed to run elsewhere
BOX_ROOT = '/Users/keith.cheveralls/Box-cache/'

In [None]:
# initialize a FACS manager from BOX_ROOT; it is used below to look up the
# sample and control directory paths for each plate
m = manager.FACSManager(BOX_ROOT)

In [None]:
# build a processor for plate P0001 from the paths returned by the manager
p = processor.FACSProcessor(*m.get_sample_and_control_dirpaths('P0001'), verbose=False)

In [None]:
# process a single sample (well A2) with plots enabled, then label the current axes
plt.figure(figsize=(12,8))
# the third return value gets its own name so that it does not shadow the
# `unmixer` module imported from pipeline_process.facs
stats, distros, sample_unmixer = p.process_sample('A2', show_plots=True)
ax = plt.gca()
ax.set_xlabel('Intensity (hlog-transformed)')
ax.set_ylabel('Normalized frequency')

In [None]:
# process another sample (well E7) with the same processor, with plots enabled
plt.figure(figsize=(8,6))
s = p.process_sample('E7', show_plots=True)

### Aside: a few samples have different gain settings

In [None]:
# initialize a processor for one plate (P0005; this rebinds `p`) and load sample D10
p = processor.FACSProcessor(*m.get_sample_and_control_dirpaths('P0005'), verbose=False)
d = p.load_sample('D10')

In [None]:
# histogram of the FITC values of the loaded sample, using bins of width 100
fitc_bins = np.arange(-1000, 10000, 100)
_ = plt.hist(d.data[FITC], bins=fitc_bins)

### Process all plates

In [None]:
# load all plates and process all samples, collecting one stats record and one
# distribution record per well
all_plate_ids = ['P%04d' % num for num in range(1, 20)]
stats, dists = [], []
for plate_id in all_plate_ids:
    print('Loading plate %s' % plate_id)
    p = processor.FACSProcessor(*m.get_sample_and_control_dirpaths(plate_id), verbose=False)

    for well_id in p.well_ids:
        # the third return value is not used here; it is bound to a throwaway name
        # so that it does not shadow the imported `unmixer` module
        stat, dist, _unmixer = p.process_sample(well_id, show_plots=False)

        # tag both records with the well and plate they came from
        stat['well_id'] = well_id
        dist['well_id'] = well_id
        stat['plate_id'] = plate_id
        dist['plate_id'] = plate_id

        stats.append(stat)
        dists.append(dist)

### Cache results and histograms

In [None]:
# cache stats: write the collected `stats` records (one appended per processed well)
# to a CSV file, without the DataFrame index
pd.DataFrame(data=stats).to_csv('../results/2019-07-16_all-facs-results.csv', index=False)

In [None]:
# cache dists
def to_jsonable(dists):
    '''
    Coerce numpy values in a list of dicts to JSON-serializable Python objects.

    Arrays are converted with `ndarray.tolist()`, which (unlike `list(array)`)
    also converts the elements to native Python scalars and handles nested
    (multi-dimensional) arrays; numpy scalars are converted with `.item()`.
    All other values are left untouched.

    The rows are modified in place, so pass a copy if the original
    arrays are still needed.

    Parameters
    ----------
    dists : list of dicts whose values may be numpy arrays or numpy scalars

    Returns
    -------
    The same `dists` list, with the numpy values replaced.
    '''
    for row in dists:
        # only existing keys are reassigned, so the dict does not change size
        # while it is being iterated over
        for key, val in row.items():
            if isinstance(val, np.ndarray):
                row[key] = val.tolist()
            elif isinstance(val, np.generic):
                row[key] = val.item()
    return dists

# serialize a deep copy of the dists object, because to_jsonable modifies its
# argument in place and the in-memory `dists` should keep their numpy arrays
with open('../results/2019-07-16_all-dists.json', 'w') as json_file:
    json.dump(to_jsonable(copy.deepcopy(dists)), json_file)

In [None]:
# list the fields of one distribution record
# (`dist` is the last record left over from the plate-processing loop)
dist.keys()

In [None]:
# testing serialization: plot one record after truncating x to integers and
# truncating the fitted reference y-values, scaled by 1e6, to integers
dist = dists[11]
x_as_ints = [int(val) for val in dist['x']]
y_scaled_as_ints = [int(val*1e6) for val in dist['y_ref_fitted']]
plt.plot(x_as_ints, y_scaled_as_ints)

In [None]:
# load the cached results CSV and list the distinct plate IDs it contains
d = pd.read_csv('../results/2019-07-16_all-facs-results.csv')
d.plate_id.unique()

### Some plots

In [None]:
# rebuild the stats table from the in-memory records, discarding every row
# that has at least one missing value, and show its dimensions
d = pd.DataFrame(data=stats).dropna(how='any', axis=0)
d.shape

In [None]:
# scatter plot of `area` (x) vs `rel_median_log` (y), drawn with mostly transparent markers
_ = plt.scatter(d.area, d.rel_median_log, alpha=.1)

In [None]:
# 2D histogram of area (x) vs log median (y), with a power-law color scale
area_bins = np.arange(0, .4, .01)
rel_median_log_bins = np.arange(0, 1.5, .03)
_ = plt.hist2d(
    d.area, d.rel_median_log,
    bins=(area_bins, rel_median_log_bins),
    norm=mpl_colors.PowerNorm(.5))

In [None]:
# 2D histogram of the raw mean (x) vs the raw std (y), with a power-law color scale
raw_mean_bins = np.arange(3000, 7000, 90)
raw_std_bins = np.arange(0, 1500, 30)
_ = plt.hist2d(
    d.raw_mean, d.raw_std,
    bins=(raw_mean_bins, raw_std_bins),
    norm=mpl_colors.PowerNorm(.7))

In [None]:
# 2D histogram of `rel_mean_hlog` (x) vs `rel_percentile99_hlog` (y)
_ = plt.hist2d(
    d.rel_mean_hlog,
    d.rel_percentile99_hlog,
    bins=(np.arange(1000, 6000, 90), np.arange(2000, 8000, 60)),
    norm=mpl_colors.PowerNorm(.7))

### Compare control distributions from different plates

In [None]:
# plot the reference distributions of the plates listed in `plate_nums_weird_controls`
plate_nums_weird_controls = [1, 6, 10, 11, 19]
for plate_num in plate_nums_weird_controls:
    # build the zero-padded plate ID string that every other call to
    # get_sample_and_control_dirpaths in this notebook passes, instead of a bare integer
    plate_id = 'P%04d' % plate_num
    print('Loading plate %s' % plate_id)
    p = processor.FACSProcessor(*m.get_sample_and_control_dirpaths(plate_id))
    plt.plot(p.x_ref, p.y_ref, label=plate_id)
plt.legend()

### All results (w target names) for manu

In [56]:
# the cached FACS results CSV
facs_results_path = (
    '/Users/keith.cheveralls/projects/opencell-process-off-git/results/2019-07-16_all-facs-results.csv')
d = pd.read_csv(facs_results_path)

# the pipeline QC sheet, of which only the plate ID, well ID and protein columns are kept
pipeline_qc_path = (
    '/Users/keith.cheveralls/Downloads/CumulativePipelineQC - CumulativePipelineQC_20190529.csv')
dn = pd.read_csv(pipeline_qc_path)[['plate_id', 'well_id', 'protein']]

In [57]:
# rewrite the plate IDs in the zero-padded 'P%04d' form used for the FACS results:
# drop the first character of each ID and parse the remainder as an integer
dn['plate_id'] = ['P%04d' % int(plate_id[1:]) for plate_id in dn.plate_id]

In [58]:
# left-join the protein names onto the FACS results by (plate_id, well_id);
# result rows without a match in `dn` are kept
d = pd.merge(d, dn, left_on=('plate_id', 'well_id'), right_on=('plate_id', 'well_id'), how='left')

In [59]:
# keep only the identifier columns and the two summary columns, then give the
# summary columns GFP-specific names
d = d[['plate_id', 'well_id', 'protein', 'area', 'rel_median_log']]
d = d.rename(columns={'area': 'gfp_area', 'rel_median_log': 'gfp_rel_median_log_intensity'})
# NOTE(review): `qc` is not imported anywhere in the visible notebook, so this line
# raises NameError on a fresh kernel — confirm which module provides `pad_well_id`
d['well_id'] = [qc.pad_well_id(well_id) for well_id in d.well_id]

In [None]:
# display the final table
d

In [61]:
# write the final table, ordered by plate and then well, formatting floats
# with two decimal places and omitting the index
final_results_path = (
    '/Users/keith.cheveralls/projects/opencell-process-off-git/results/2019-07-16_all-facs-results_.csv')
d_sorted = d.sort_values(by=['plate_id', 'well_id'])
d_sorted.to_csv(final_results_path, index=False, float_format='%0.2f')