In [1]:
%matplotlib inline
%load_ext rpy2.ipython

import os
import glob
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re

sns.set_context('notebook', font_scale=1.5)

In [2]:
def bilinear_interpolate(x, y, bins=None):
    """Returns interpolated density values on points (x, y).

    Ref: http://en.wikipedia.org/wiki/Bilinear_interpolation.
    """
    if bins is None:
        bins = int(np.sqrt(len(x)))

    z, unused_xedge, unused_yedge = np.histogram2d(
        y, x, bins=[
            bins, bins], range=[
            (np.min(y), np.max(y)), (np.min(x), np.max(x))])
    xfrac, xint = np.modf((x - np.min(x)) /
                             (np.max(x) - np.min(x)) * (bins - 1))
    yfrac, yint = np.modf((y - np.min(y)) /
                             (np.max(y) - np.min(y)) * (bins - 1))

    xint = xint.astype('i')
    yint = yint.astype('i')

    z1 = np.zeros(np.array(z.shape) + 1)
    z1[:-1, :-1] = z

    # values at corners of square for interpolation
    q11 = z1[yint, xint]
    q12 = z1[yint, xint + 1]
    q21 = z1[yint + 1, xint]
    q22 = z1[yint + 1, xint + 1]

    return q11 * (1 - xfrac) * (1 - yfrac) + q21 * (1 - xfrac) * (yfrac) + \
        q12 * (xfrac) * (1 - yfrac) + q22 * (xfrac) * (yfrac)
    
def plot_fcm(df, n, dims, ncols=None, lab=None, 
             panel_size=3, grid_count=None,
             sc_min=0, sc_max=262144,
             f_min=-0.1, f_max=1.1):
    ndims = len(dims)

    if ncols is None:
        ncols = ndims
    nrows = (ndims + ncols - 1)//ncols

    fig, axes = plt.subplots(nrows, ncols, 
                             figsize=(panel_size*ncols, panel_size*nrows))
    axes = axes.ravel()
    for k, dim in enumerate(dims):
        ax = axes[k]
        c1, c2 = dim
        data = df.sample(n)
        x = data[c1]
        y = data[c2]
        z = bilinear_interpolate(x, y)
        if 'FSC' in c1 or 'SSC' in c1:
            ax.set_xlim(sc_min, sc_max)
        else:
            ax.set_xlim(f_min, f_max)
        if 'FSC' in c2 or 'SSC' in c2:
             ax.set_ylim(sc_min, sc_max)
        else:
            ax.set_ylim(f_min, f_max)       
        ax.scatter(x, y, s=1, c=z, edgecolors='none')
        if lab:
            plt.text(0.05, 0.9, lab,
                     va='center', ha='left',
                     transform=ax.transAxes)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_xlabel(c1.split()[0])
        ax.set_ylabel(c2.split()[0])
        if grid_count:
            xlim = ax.get_xlim()
            ylim = ax.get_ylim()
            ax.set_xticks(np.linspace(min(xlim), max(xlim), grid_count))
            ax.set_yticks(np.linspace(min(ylim), max(ylim), grid_count))
            ax.grid(True)
    plt.tight_layout()
    return fig

In [3]:
base = '/data/flow/EQAPOL/ep11'
processed = os.path.join(base, 'processed')

In [4]:
dims = [
    ('FSC-A', 'SSC-A'),
    ('CD3 APC-H7 FLR-A', 'Aqua Amine FLR-A'),
    ('CD4 PE-Cy7 FLR-A', 'CD8 PerCP-Cy55 FLR-A'),
    ('CD3 APC-H7 FLR-A', 'IFNg APC FLR-A'),
    ('CD3 APC-H7 FLR-A', 'TNFa FITC FLR-A'),
    ('CD3 APC-H7 FLR-A', 'IL2 BV421 FLR-A'),
    ('CD3 APC-H7 FLR-A', 'CD107a PE FLR-A'),    
]

In [5]:
labs = '001 004 007 010 012 031 043 048 082 003 006 008 011 013 036 044 049 101'.split()

In [6]:
samples = 'DEN084ZH G6901X7M GEN070XT'.split()

In [7]:
stims = 'Unstim CMVpp65 CEF'.split()

In [8]:
process = 'clean_gated_comp_xform'

In [9]:
pat = re.compile(r'/.*fcs')
with open(os.path.join(base, 'errors.txt')) as f:
    excludes = (pat.findall(f.read()))

In [10]:
excludes

['/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/008/008_G6901X7M_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/010/010_G6901X7M_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/012/012_G6901X7M_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/013/013_G6901X7M_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/036/036_G6901X7M_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/043/043_G6901X7M_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/048/048_G6901X7M_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/049/049_DEN084ZH_Unstim_unstained_clean.fcs',
 '/home/swhite/vbox_share/cliburn_projects/eqapol_ep11/101/101_A6901WYC_Unstim_unstained_clean.fcs']

In [11]:
from collections import defaultdict

In [12]:
np.random.seed(123)
n = 10000

collection = {}
for stim in stims:
    for sample in samples:
        used_labs = defaultdict(list)
        dfs = []
        for lab in labs:
            filename = f'{lab}_{sample}_{stim}_{process}.csv'
            f = os.path.join(processed, 
                             lab, 
                             filename)
            if f in excludes:
                continue

            try:
                df = pd.read_csv(f)
                df.rename({'# FSC-A': 'FSC-A'}, axis=1, inplace=True)
                dfs.append(df.sample(n))
                used_labs[(sample, stim)].append(lab)
            except Exception as e:
                pass
        try:
            collection[(sample, stim)] = pd.concat(dfs, sort=False)
        except Exception as e:
            print(e)

In [13]:
collection.keys()

dict_keys([('DEN084ZH', 'Unstim'), ('G6901X7M', 'Unstim'), ('GEN070XT', 'Unstim'), ('DEN084ZH', 'CMVpp65'), ('G6901X7M', 'CMVpp65'), ('GEN070XT', 'CMVpp65'), ('DEN084ZH', 'CEF'), ('G6901X7M', 'CEF'), ('GEN070XT', 'CEF')])

In [14]:
used_labs.keys()

dict_keys([('GEN070XT', 'CEF')])

In [15]:
len(collection[('GEN070XT', 'CMVpp65')])

180000

In [16]:
len(used_labs[('GEN070XT', 'CMVpp65')])

0

In [17]:
np.unique(used_labs[('GEN070XT', 'CMVpp65')])

array([], dtype=float64)

In [18]:
list(enumerate(df.columns))

[(0, 'FSC-A'),
 (1, 'FSC-H'),
 (2, 'FSC-W'),
 (3, 'SSC-A'),
 (4, 'SSC-H'),
 (5, 'SSC-W'),
 (6, 'TNFa FITC FLR-A'),
 (7, 'CD8 PerCP-Cy55 FLR-A'),
 (8, 'IL2 BV421 FLR-A'),
 (9, 'Aqua Amine FLR-A'),
 (10, 'IFNg APC FLR-A'),
 (11, 'CD3 APC-H7 FLR-A'),
 (12, 'CD107a PE FLR-A'),
 (13, 'CD4 PE-Cy7 FLR-A'),
 (14, 'Time')]

In [19]:
index = [0,3,9,11,13,7,6,8,10,12]

In [20]:
headers = df.columns[index]

In [21]:
headers

Index(['FSC-A', 'SSC-A', 'Aqua Amine FLR-A', 'CD3 APC-H7 FLR-A',
       'CD4 PE-Cy7 FLR-A', 'CD8 PerCP-Cy55 FLR-A', 'TNFa FITC FLR-A',
       'IL2 BV421 FLR-A', 'IFNg APC FLR-A', 'CD107a PE FLR-A'],
      dtype='object')

In [22]:
Y = collection[('GEN070XT', 'CMVpp65')].iloc[:, index]

In [23]:
Y.shape

(180000, 10)

In [24]:
Y.min(axis=0)

FSC-A                   26762.339844
SSC-A                     -45.119999
CD4 PE-Cy7 FLR-A           -0.501029
CD8 PerCP-Cy55 FLR-A       -0.217883
Aqua Amine FLR-A           -0.163236
CD3 APC-H7 FLR-A           -0.519089
IFNg APC FLR-A             -0.480555
CD107a PE FLR-A            -0.442045
TNFa FITC FLR-A            -0.269606
IL2 BV421 FLR-A            -0.189237
dtype: float64

In [25]:
Y.max(axis=0)

FSC-A                   262143.000000
SSC-A                   262143.000000
CD4 PE-Cy7 FLR-A             0.981121
CD8 PerCP-Cy55 FLR-A         0.998515
Aqua Amine FLR-A             0.999577
CD3 APC-H7 FLR-A             0.999259
IFNg APC FLR-A               0.997917
CD107a PE FLR-A              1.000002
TNFa FITC FLR-A              0.998816
IL2 BV421 FLR-A              0.999354
dtype: float64

In [26]:
np.repeat([1,2,3], 5)

array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])

In [27]:
C = np.repeat(np.arange(1, len(used_labs[('GEN070XT', 'CMVpp65')])+1), n)

In [28]:
len(C)

0

In [29]:
target_dir = '_'.join(('GEN070XT', 'CMVpp65'))

In [30]:
target_dir

'GEN070XT_CMVpp65'

In [31]:
if not os.path.exists(os.path.join(target_dir, 'figs')):
    os.makedirs(os.path.join(target_dir, 'figs'))

In [32]:
np.savetxt(os.path.join(target_dir, 'markers.txt'), headers, fmt='%s')

In [33]:
np.savetxt(os.path.join(target_dir,  'Y.csv'), Y.values)

In [34]:
np.savetxt(os.path.join(target_dir, 'C.csv'), C)

In [35]:
samples = ['GEN070XT']
stims = ['CMVpp65']

for lab in labs:
    for sample in samples:
        for stim in stims:
            filename = f'{lab}_{sample}_{stim}_{process}.csv'
            f = os.path.join(processed, 
                             lab, 
                             filename)
            if f in excludes:
                continue

            try:
                df = pd.read_csv(f)
                df.rename({'# FSC-A': 'FSC-A'}, axis=1, inplace=True)
                n = 10000
                fig = plot_fcm(df, n, dims, lab=lab, grid_count=10)
                fig.savefig(f'{target_dir}/figs/{lab}_{sample}_{stim}_{process}.png')
                plt.close(fig)
            except Exception as e:
                print(e)