In [None]:
import glob  # filename and pathname pattern-matching utility
import os    # operating system utility

import flowgatenist as flow
#from flowgatenist import gaussian_mixture as nist_gmm
import flowgatenist.batch_process as batch_p

import matplotlib.pyplot as plt
from matplotlib import colors
#from matplotlib.backends.backend_pdf import PdfPages

import numpy as np
import pandas as pd
#from scipy import special
#from scipy import misc

#import pystan
import pickle

import seaborn as sns
sns.set()

%load_ext autoreload
%autoreload 2

%matplotlib inline

Indicate the directory where the data is stored:

In [None]:
# Record the current working directory at the time this cell runs; the bare
# name on the last line echoes it as the cell output.
notebook_dir = os.getcwd()
notebook_dir

In [None]:
# The working directory for the rest of the notebook is the parent of
# notebook_dir. os.path.dirname is used instead of slicing at the last "\\":
# on macOS/Linux paths contain no backslash, so rfind() returns -1 and the
# slice would silently drop the final character of the path.
main_directory = os.path.dirname(notebook_dir)
os.chdir(main_directory)
main_directory

Then check the bead data file to make sure it has ~10,000 counts.

In [None]:
# Find the bead data file in the current working directory.
# glob returns matches in arbitrary order, so the matches are sorted to make
# the selection reproducible when more than one file fits the pattern, and an
# explicit error replaces the bare IndexError raised when nothing matches.
bead_pattern = '*bead*H12*.fcs_pkl'
bead_matches = sorted(glob.glob(bead_pattern))
if not bead_matches:
    raise FileNotFoundError(
        "No bead file matching {!r} found in {!r}".format(bead_pattern, os.getcwd())
    )
bead_file = bead_matches[0]
bead_file

In [None]:
# Load the pickled bead data object. The context manager guarantees the file
# handle is closed; a bare open() inside pickle.load() leaves it open.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by a trusted pipeline.
with open(bead_file, 'rb') as bead_fh:
    bead_data = pickle.load(bead_fh)

# Shape of the event table; compare the row count with the ~10,000 counts
# expected for the bead sample.
bead_data.flow_frame.shape

Then plot the bead data as YL1-A vs. BL1-A to check the number of bead clusters to use for fitting

In [None]:
sns.set()

plt.rcParams["figure.figsize"] = [12, 6]
fig, axs = plt.subplots(1, 2)

# Pull the columns once from the bead event table.
df = bead_data.flow_frame
x = df['BL1-A']
y = df['YL1-A']
t = df['Time']

# Custom bin edges. They are not passed to hist2d below (which uses bins=200);
# they are kept as notebook variables in case other cells refer to them.
# The earlier np.linspace(x.min(), x.max(), 200) assignment was dead code:
# it was overwritten immediately by this one.
x_bins2 = np.linspace(x.min(), 1000000, 200)
y_bins2 = np.linspace(0, 400000, 200)

# Left panel: 2D histogram of YL1-A vs. BL1-A on a log colour scale, used to
# count how many bead clusters are on scale.
axs[0].hist2d(x, y, bins=200, norm=colors.LogNorm(), rasterized=True)
axs[0].set_xlabel('BL1-A')
axs[0].set_ylabel('YL1-A')

# Right panel: YL1-A signal against acquisition time.
axs[1].plot(t, y, 'o', alpha=0.1, ms=5)
axs[1].set_xlabel('Time')
axs[1].set_ylabel('YL1-A');

It looks like 7 of the bead populations are on scale, so set num_bead_populations to 7 when calling the fit_bead_data method.

By default, the parameters bead_init, singlet_init, and bead_population_init are set to large values to reduce the probability of a bad fit resulting from the random GMM initializations. This causes the method to run slowly. For quick testing, with human oversight, they can be reduced to smaller values.

With 6 or 7 bead populations on scale, it is difficult to get a good result from a random initialization, particularly with outlier data present. So, initialize the fit with the means and covariances from a previous fit (or with manually supplied values; look at the third plot, with orange x's, to see how good or bad the manual initialization is). These values are defined below as fixed_means and fixed_covars and are passed to fit_bead_data as pop_init_means and pop_init_cov.

In [None]:
# Initial bead-population means, one entry per population (7 in total).
# The first-coordinate values are rescaled by 28/45.
# NOTE(review): the origin of the 28/45 factor is not stated here -- confirm.
fixed_b = np.array([
    4.06498017e+02,
    2.99833770e+03,
    7.67144245e+03,
    2.15945949e+04,
    5.31619404e+04,
    1.51693808e+05,
    4.53615788e+05,
]) * 28 / 45

fixed_y = np.array([
    388.24753729,
    978.54102811,
    2245.85074931,
    6011.33521244,
    14705.68308142,
    42165.89008255,
    127685.18467742,
])

# Pair the two coordinate vectors into a (7, 2) array: one [b, y] row per
# population.
fixed_means = np.column_stack((fixed_b, fixed_y))

# Initial covariance values, one row per population (7 rows, 2 columns).
# NOTE(review): the meaning of the two columns is defined by fit_bead_data --
# confirm against its documentation.
fixed_covars = np.array([
    [410030.80843786, 46141.67545161],
    [584518.72158146, 59154.99142766],
    [987421.29489873, 95094.67456259],
    [2104919.01831125, 209859.40250991],
    [4816284.87397532, 470348.2314993],
    [13939125.3609858, 1334323.55322485],
    [43867870.75180154, 3541850.85089593],
])

In [None]:
# Echo fixed_b (values after the 28/45 rescale) for inspection.
fixed_b

In [None]:
# Echo fixed_y for inspection.
fixed_y

In [None]:
# Keyword arguments for the bead fit, grouped by purpose.
bead_fit_settings = dict(
    # Input file and location.
    bead_file=bead_file,
    data_directory=main_directory,
    show_plots=True,
    # Number of bead populations judged to be on scale from the scatter plot.
    num_bead_populations=7,
    num_singlet_clusters=3,
    # Counts of random GMM initializations; smaller values run faster but
    # raise the chance of a bad fit.
    bead_init=10,
    singlet_init=5,
    bead_population_init=100,
    # Start the population fit from the supplied means and covariances.
    pop_init_means=fixed_means,
    pop_init_cov=fixed_covars,
    # Outlier handling and thresholds.
    # NOTE(review): exact semantics and per-channel order of the thresholds
    # are defined by fit_bead_data -- confirm against its documentation.
    outlier_quantile=0.1,
    upper_threshold=[700000, 200000],
    lower_threshold=[-100000, -100000],
)

batch_p.fit_bead_data(**bead_fit_settings)

Then apply the bead calibration to all the data files in the main_directory

In [None]:
# Apply the bead calibration for the BL1-A channel.
batch_p.batch_apply_bead_cal(
    bead_file=bead_file,
    data_directory=main_directory,
    fl_channel='BL1-A',
)

In [None]:
# Apply the bead calibration for the YL1-A channel.
batch_p.batch_apply_bead_cal(
    bead_file=bead_file,
    data_directory=main_directory,
    fl_channel='YL1-A',
)

Background fit:

In [None]:
# Run the Stan background fit on the 'BL1-A-MEF' channel and keep the fit
# object for the cells that follow.
stan_back_fit_b = batch_p.batch_stan_background_fit(
    data_directory=main_directory,
    fl_channel='BL1-A-MEF',
    fit_max=250,
    hist_bins=100,
    show_plots=True,
)

# Show the fit summary as the cell output.
stan_back_fit_b.summary()

In [None]:
# Extract the values of the 'mu' variable from the background fit.
# NOTE(review): presumably these are the posterior draws of 'mu' -- confirm
# against the type returned by batch_stan_background_fit.
stan_back_fit_samples = stan_back_fit_b.stan_variable('mu')
# Collapse the extracted values to a single number by taking their mean.
back_mu = np.mean(stan_back_fit_samples)
back_mu

In [None]:
# Echo the data directory used by the cells above.
main_directory