In [None]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
import matplotlib.colors as mpl_colors

In [None]:
# Global matplotlib font settings applied to every figure drawn after this cell.
# 'normal' is not a font family that matplotlib recognises (it is a valid weight,
# not a family), so it triggers "findfont: Font family ['normal'] not found" and
# falls back to the default; name the generic 'sans-serif' family explicitly instead.
font = {
    'family': 'sans-serif',
    'weight': 'normal',
    'size': 16,
}

mpl.rc('font', **font)

In [None]:
# IPython autoreload extension in mode 2: imported modules are reloaded
# automatically before each cell is executed
%load_ext autoreload
%autoreload 2

In [None]:
# make the parent directory importable so that the local `pipeline_process` package resolves
# (guarded so that re-running this cell does not append a duplicate entry)
if '../' not in sys.path:
    sys.path.append('../')
from pipeline_process import common
from pipeline_process.facs import constants, manager

FITC = constants.FITC

# root of the local Box cache directory; set the BOX_ROOT environment variable to run
# this notebook on another machine (the default is the original hard-coded location)
BOX_ROOT = os.environ.get('BOX_ROOT', '/Users/keith.cheveralls/Box-cache/')

### Load and concatenate all of Nathan's FACSQC CSVs

In [None]:
# every FACSQC CSV found under BOX_ROOT, sorted because glob returns matches in arbitrary order
# (note: the next cell rebinds `filenames` to a hand-curated dict keyed by plate number)
filenames = sorted(glob.glob(os.path.join(BOX_ROOT, 'LibraryDatabase', 'FACSQC_CSVs', '*.csv')))

In [None]:
# FACSQC CSV path for each plate number (1-19), with repeated exports removed by hand:
# plates 1-4 use their '_redo' file, every other plate uses its un-suffixed file.
# Paths are built from BOX_ROOT so they follow the same root as the platemaps.
facsqc_dir = os.path.join(BOX_ROOT, 'LibraryDatabase', 'FACSQC_CSVs')
redo_plate_nums = {1, 2, 3, 4}

filenames = {
    plate_num: os.path.join(
        facsqc_dir,
        'TextLog_mNGplate%d%s.csv' % (plate_num, '_redo' if plate_num in redo_plate_nums else ''),
    )
    for plate_num in range(1, 20)
}

In [None]:
# preview the first rows of one FACSQC CSV (plate 5), selected by its plate number
# rather than by its position in the dict
pd.read_csv(filenames[5]).head()

In [None]:
def load_and_merge(plate_num):
    '''
    Build one dataframe for a single plate by joining its platemap to its FACSQC CSV

    plate_num : plate number; used to format the platemap filename and to look up
        the FACSQC CSV path in the module-level `filenames`

    Returns the result of pd.merge (default join type) on the platemap's 'target_name'
    column and the FACSQC table's 'well' column
    '''
    # the platemap lives under BOX_ROOT/LibraryDatabase/platemaps
    platemap_name = 'mNGplate%d_Ref.csv' % plate_num
    platemap_table = common.file_utils.read_and_validate_platemap(
        os.path.join(BOX_ROOT, 'LibraryDatabase', 'platemaps', platemap_name))

    # the FACSQC table for the same plate
    facs_table = pd.read_csv(filenames[plate_num])

    # despite its name, the 'well' column of the FACSQC CSVs holds the gene name,
    # which is why it is matched against the platemap's 'target_name'
    return pd.merge(
        left=platemap_table, right=facs_table, left_on='target_name', right_on='well')

In [None]:
# concatenate the merged platemap-FACSQC dataframes of every plate listed in `filenames`
# (iterating over its keys instead of a hard-coded range keeps the two in sync);
# each plate's rows are tagged with a zero-padded plate_id such as 'P0001'
plate_frames = [
    load_and_merge(plate_num).assign(plate_id='P%04d' % plate_num)
    for plate_num in sorted(filenames)
]

# ignore_index gives the combined frame a unique 0..n-1 index instead of repeating
# each plate's row labels; the rename returns a new frame rather than mutating in place.
# 'area' becomes 'nathan_area', presumably to keep it distinct from the 'area' column
# of the other results table when the two are merged later
d = (
    pd.concat(plate_frames, ignore_index=True)
    .rename(columns={'area': 'nathan_area'})
)

In [None]:
# write the combined table to disk, leaving out the dataframe index
d.to_csv(path_or_buf='../results/2019-07-16_all-nathan-FACSQC.csv', index=False)

### Compare my results to Nathan's FACSQC results

In [None]:
# dk: the 'all-facs-results' table; dn: the combined Nathan FACSQC table
dk, dn = (
    pd.read_csv('../results/2019-07-16_all-facs-results.csv'),
    pd.read_csv('../results/2019-07-16_all-nathan-FACSQC.csv'),
)

In [None]:
# show the column names of both tables (dk first, then dn) to see which columns they share
dk.columns, dn.columns

In [None]:
# join the two result tables on the (plate_id, well_id) pair, which has the same
# column names on both sides (default join type)
dm = dn.merge(dk, on=['plate_id', 'well_id'])

In [None]:
# GFP-positive area: Nathan's value on x against mine on y, one point per row of dm
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(dm.nathan_area, dm.area, alpha=.1)
ax.set_aspect('equal')
ax.set_xlabel('GFP-positive area (Nathan)')
ax.set_ylabel('GFP-positive area (Keith)')

In [None]:
# log intensity: Nathan's value is the mode, mine is the median;
# marker size is 100 times the `area` column
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(dm.log_intensity, dm.rel_median_log, alpha=.2, s=(100*dm.area))
ax.set_aspect('equal')

# manual toggle (currently off): restrict both axes to the range 0 to 1
clip_to_unit_square = False
if clip_to_unit_square:
    ax.set_xlim((0, 1))
    ax.set_ylim((0, 1))

ax.set_xlabel('Log mode intensity (Nathan)')
ax.set_ylabel('Log median intensity (Keith)')

In [None]:
# rows where the two area measurements differ: `area` is above .01 and
# |nathan_area / area| is below .7, listed from largest to smallest `area`
# NOTE(review): this only flags rows where Nathan's area is the smaller one - confirm intended
area_mismatch = (dm.area > .01) & (np.abs(dm.nathan_area/dm.area) < .7)
area_columns = [
    'well_id', 'plate_id', 'nathan_area', 'area',
    'rel_percentile99_log', 'rel_median_log', 'log_intensity',
]
dm.loc[area_mismatch, area_columns].sort_values(by='area', ascending=False)

In [None]:
# rows where Nathan's log intensity is high (above 1) while my median is low (below .7)
high_nathan_low_median = dm.log_intensity.gt(1) & dm.rel_median_log.lt(.7)
median_columns = [
    'well_id', 'plate_id', 'nathan_area', 'area',
    'rel_percentile99_log', 'rel_median_log', 'log_intensity',
]
dm.loc[high_nathan_low_median, median_columns]

In [None]:
# rows where Nathan's log intensity is low (below .5) while my 99th percentile is high (above 1.5)
low_nathan_high_p99 = dm.log_intensity.lt(.5) & dm.rel_percentile99_log.gt(1.5)
p99_columns = [
    'well_id', 'plate_id', 'nathan_area', 'area', 'rel_percentile99_log', 'log_intensity',
]
dm.loc[low_nathan_high_p99, p99_columns]