In [1]:
import collections
import bisect
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy.signal as signal

from sklearn.neighbors import KernelDensity
from sklearn.mixture import GaussianMixture

In [2]:
#plotting things

#%matplotlib qt5  -- optional IPython magic: renders figures in an interactive Qt window instead of inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from cycler import cycler


#All of Anandh's customized seaborn/matplotlib settings

sns.set_context("talk", font_scale=1.5, rc={"lines.linewidth": 1.5})
sns.set_style("ticks")
sns.set_style({"xtick.direction": "in","ytick.direction": "in"})

#%config InlineBackend.figure_f.ormats=['svg']

mpl.rc('axes', prop_cycle=(cycler('color', ['r', 'k', 'b','g','y','m','c']) ))

mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42

#mpl.rc('text', usetex=False)
#mpl.rc('text.latex', preamble=r'\usepackage{helvet}
#\renewcommand\familydefault{\sfdefault}\usepackage{sansmath}\sansmath')

    #If you want to use a different font
# mpl.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica'], 
#                  'serif': ['Helvetica']})

tw = 1.5
sns.set_style({"xtick.major.size": 3, "ytick.major.size": 3,
               "xtick.minor.size": 2, "ytick.minor.size": 2,
               'axes.labelsize': 16, 'axes.titlesize': 16,
               'xtick.major.width': tw, 'xtick.minor.width': tw,
               'ytick.major.width': tw, 'ytick.minor.width': tw})

mpl.rc('xtick', labelsize=14) 
mpl.rc('ytick', labelsize=14)
mpl.rc('axes', linewidth=1.5)
mpl.rc('legend', fontsize=14)
mpl.rc('figure', figsize=(9,8))

In [3]:
def get_log_values(file_directory, channel='GFP/FITC-A'):
    '''
    Read one flow-data CSV and append the log10 of a single channel.

    Parameters
    ----------
    file_directory : str or path-like
        Path to one CSV file; passed straight to ``pd.read_csv``.
    channel : str, optional
        Name of the column to transform. Defaults to 'GFP/FITC-A'.

    Returns
    -------
    pandas.DataFrame
        The file contents plus one column named ``'log10-' + channel``.
        No rows are dropped: measurements without a real logarithm stay in
        the frame (log10 of 0 is -inf, log10 of a negative number is NaN),
        so callers must filter non-finite values before fitting.

    Raises
    ------
    KeyError
        If ``channel`` is not a column of the file.
    '''
    flow_data = pd.read_csv(file_directory)

    # Non-positive measurements are expected in flow data; silence numpy's
    # divide/invalid warnings here instead of flooding the notebook output.
    with np.errstate(divide='ignore', invalid='ignore'):
        log_values = np.log10(flow_data[channel])

    # Assign the column under its final name directly. Going through a
    # temporary dummy column would clobber a real column of that name.
    flow_data['log10-' + channel] = log_values

    return flow_data

In [4]:
def get_peak_locations_from_KDE_fit(data, bandwidth=0.25):
    ''' Performs a KDE fit and then uses scipy.signal.find_peaks_cwt to get peaks.

        The KDE bandwidth parameter is critical, and 0.25 has worked well in the past.
        If it feels like you are missing many peak calls, decrease the bandwidth. If it
        feels like you are having too many peak calls, increase the bandwidth.

        Don't change the bandwidth without good reason, it took a while to decide on 0.25.

        Parameters
        ----------
        data : 1-D array-like (numpy array, list or pandas Series)
            Values to estimate the density of (log10 fluorescence in this notebook).
            NaN and +/-inf entries are ignored.
        bandwidth : float, optional
            Gaussian kernel bandwidth passed to ``KernelDensity``. Defaults to 0.25.

        Returns
        -------
        list of float
            x positions of the detected density peaks, in increasing grid order.
            The density is only evaluated on a fixed grid from 0 to 6, so peaks
            outside that range cannot be reported. May be empty.

        Raises
        ------
        ValueError
            If ``data`` contains no finite values.
    '''
    # Convert to a plain float array first: indexing a pandas Series with
    # [:, None] is not supported by current pandas, and the KDE cannot be
    # fitted to NaN/inf values.
    values = np.asarray(data, dtype=float).ravel()
    values = values[np.isfinite(values)]
    if values.size == 0:
        raise ValueError('get_peak_locations_from_KDE_fit: no finite values to fit')

    kde = KernelDensity(bandwidth=bandwidth, kernel='gaussian')
    kde.fit(values[:, None])

    # Evaluate the fitted density on a fixed grid (score_samples returns log-density).
    x_range = np.linspace(0, 6, 1200)
    kde_estimates = np.exp(kde.score_samples(x_range[:, None]))

    # Use the SciPy function to get the KDE peaks; the wavelet widths are
    # expressed in grid points of x_range.
    peaks = signal.find_peaks_cwt(kde_estimates, np.arange(30, 200), min_snr=1)

    # Map grid indices back to x positions. A comprehension also copes with an
    # empty result, whatever dtype the empty index array has.
    return [x_range[int(peak)] for peak in peaks]

In [5]:
def fit_GMM_KDE(data, peaks):
    """Fit a Gaussian mixture model seeded with peak locations from a KDE.

    One Gaussian component is fitted per entry in ``peaks``, and each entry is
    used as the initial mean of its component. If ``peaks`` is empty, a single
    component is fitted with the default initialisation instead of failing.

    Parameters
    ----------
    data : 1-D array-like
        Measurement values to fit (log10 fluorescence in this notebook).
        Must not contain NaN or infinite values.
    peaks : sequence of float
        Initial component means, e.g. the output of
        ``get_peak_locations_from_KDE_fit``.

    Returns
    -------
    opt_gmm : sklearn.mixture.GaussianMixture
        The fitted model.
    df : pandas.DataFrame
        One row per measurement with columns 'fluor value' (the input value)
        and 'which_gaussian' (index of the component it was assigned to).
        Use this to pull out the measurements that fall into the desired
        Gaussian for gating.

    Notes
    -----
    The component index is only a label. Use ``opt_gmm.means_`` to find out
    which component is the brightest rather than relying on the index order.
    Downstream gating in this notebook treats every cell outside the ON
    component as off, which is only sensible for uni/bimodal data.
    """
    # Accept lists/Series as well as arrays; sklearn wants (n_samples, 1).
    data = np.asarray(data)
    data = data.reshape(len(data), 1)

    peaks = np.asarray(peaks)
    peaks = peaks.reshape(len(peaks), 1)

    if len(peaks) == 0:
        # No peak was detected: n_components=0 is invalid, so treat the data
        # as a single population.
        opt_gmm = GaussianMixture(n_components=1).fit(data)
    else:
        opt_gmm = GaussianMixture(n_components=len(peaks), means_init=peaks).fit(data)

    # predict() already returns a flat array of component indices.
    labels = opt_gmm.predict(data)

    df = pd.DataFrame({'fluor value': np.ravel(data), 'which_gaussian': labels})

    return opt_gmm, df


In [6]:
def autogate_GMM(dir_path_exp, channel, save_dir=None):
    """Gate every CSV matching a glob pattern on the brightest GMM component.

    For each file: the log10 of ``channel`` is computed, peaks are located on
    a KDE of the finite log values, a Gaussian mixture seeded with those peaks
    is fitted, and the component with the highest fitted mean is taken as the
    ON population. A column ``channel + '_GMMgate'`` is added holding 1.0 for
    rows whose log value lies between the smallest and largest value assigned
    to the ON component, and 0.0 for every other row (including rows whose
    log value is NaN or -inf). All other columns are written back unchanged.

    Parameters
    ----------
    dir_path_exp : str
        Glob pattern selecting the CSV files to process.
    channel : str
        Name of the column to gate on.
    save_dir : str, optional
        Directory for the gated files (created if missing); each output keeps
        its input file name. When None (default) every input file is
        overwritten in place.

    Returns
    -------
    None
    """
    log_col = 'log10-' + channel
    gate_col = channel + '_GMMgate'

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

    # pick up all files to use (sorted so the processing order is reproducible)
    for well in sorted(glob.glob(dir_path_exp)):
        # appends a new column with the log10 transformation of the desired channel
        flow_log = get_log_values(well, channel)

        # Values used for fitting only. flow_log keeps every row, but the
        # fitting code cannot handle NaN (negative input) or -inf (zero input).
        # Hand over a plain numpy array rather than a pandas Series.
        log_values = flow_log[log_col].to_numpy(dtype=float)
        fit_values = log_values[np.isfinite(log_values)]

        pks = get_peak_locations_from_KDE_fit(fit_values)

        # fits the model and builds the per-measurement component assignment
        model, result = fit_GMM_KDE(fit_values, pks)

        # to check the fit of the GMM
#         fig, ax = plt.subplots(figsize=(5,5))
#         x = np.linspace(min(fit_values), max(fit_values), 1000).reshape(1000, 1)
#         ax.plot(x, np.exp(model.score_samples(x)))
#         sns.kdeplot(fit_values, ax=ax)

        # The ON population is the component with the highest fitted mean.
        # Component indices are not guaranteed to stay ordered by mean after
        # fitting, so look at model.means_; only consider components that
        # actually received at least one measurement.
        present = np.unique(result['which_gaussian'].to_numpy())
        component_means = np.ravel(model.means_)
        on_label = present[np.argmax(component_means[present])]
        on_values = result.loc[result['which_gaussian'] == on_label, 'fluor value']

        # set the min and max of the values in the ON gate
        mi = on_values.min()
        ma = on_values.max()

        # rows (of all the data) that are inside the ON gate; NaN/-inf compare False
        in_gate = (flow_log[log_col] >= mi) & (flow_log[log_col] <= ma)

        # 1.0 inside the gate, 0.0 elsewhere. Only the gate column is written:
        # NaNs in the original measurement columns are left untouched.
        flow_log[gate_col] = in_gate.astype(float)

        # drop the temporary log10 column before saving
        flow_log = flow_log.drop(log_col, axis=1)

        # save the file with the gating information appended
        out_path = well if save_dir is None else os.path.join(save_dir, os.path.basename(well))
        flow_log.to_csv(out_path, index=False)

    return None

In [7]:
# Column names of the flow-data channels, used as the `channel` argument of
# the gating functions. Each string must match a CSV column header exactly.
# NOTE(review): the variable names presumably say which stain/reporter is read
# out in each detector channel (e.g. SYTO in mKate/APC-A) -- confirm.

syto_channel = 'mKate/APC-A'
yfp_channel = 'GFP/FITC-A'
bfp_channel = 'CFP/VioBlue-A'

In [11]:
# Time point to process; selects the sub-folder of the experiment directory.
tpt = '0hr'

# Glob pattern matching every CSV of the chosen time point.
dir_path_exp = '../../Local Data/20181009 top 4 A B cell vars A=B flow samples/' + tpt + '/*.csv'
# NOTE(review): save_dir is hard-coded to the 23hr folder while tpt is '0hr',
# and it is not used by any cell visible here -- confirm whether it should
# follow tpt or be removed.
save_dir = '../../Local Data/20181009 top 4 A B cell vars A=B flow samples/23hr/GMM_autogate'

In [12]:
# Gate every file of the selected time point on the SYTO channel.
# WARNING: autogate_GMM writes its result back to the same CSV paths it read,
# so this cell modifies the source data files.
autogate_GMM(dir_path_exp, syto_channel)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.redu

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.redu

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.redu

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
