In [1]:
# TODO: remove unused functions from library / combine/separate libraries into better modules
# 

In [21]:
import os
import zipfile
import shutil
import arrow
import pandas as pd
import IPython.html.widgets as widgets

from toolz import \
    partition,\
    thread_last,\
    thread_first
    
from utils import \
    snd,\
    exists_at_path,\
    get_layout_data,\
    add_dict_to_dataframe,\
    add_col
    
from IPython.display import \
    clear_output

from raw import \
    get_plate_data

In [3]:
# String -> String
def rename_column(col):
    """ Clean up a raw column name.

        Everything from the first '(' onward is discarded (this drops the
        large parenthetic suffix), a leading 'Cell:' prefix is removed,
        trailing forward slashes are stripped, and finally surrounding
        spaces are trimmed.

        col: raw column name (string).
        Returns the cleaned column name (string). """
    prefix = 'Cell:'
    name = col.split('(')[0]
    if name.startswith(prefix):
        # Slice the prefix off instead of calling lstrip(prefix): lstrip
        # treats its argument as a *set of characters*, so it would also
        # eat any leading 'C', 'e', 'l' or ':' of the real column name
        # (e.g. 'Cell:Cell count' would become ' count').
        name = name[len(prefix):]
    return name.rstrip('/').strip(' ')

In [4]:
# Import settings passed to get_plate_data together with each plate file path.
# NOTE(review): how each entry is interpreted is decided inside raw.get_plate_data,
# which is not visible here -- the remarks below are assumptions to confirm.
plate_config = {
    # Plate files are tab-delimited.
    'delimiter': '\t',
    # presumably the number of header lines to skip at the top of the file
    'skiprows': 4,
    # presumably columns to drop: one literal name, and one pattern that
    # looks like a regex for names ending in '.<digits>' -- TODO confirm
    'dropcols': [
        'Laser focus score',
        '\.[0-9]*\Z',
    ],
    # Each entry is [new column name, [column names], [column names]];
    # presumably numerator and denominator columns -- TODO confirm.
    'normcols': [
        ['Normalized_ColocSpot_area_sum (coloc)',
         ['ColocSpots_area_sum'],
         ['FITC-TxRed_coloc_area_sum']],
        ['Normalized_ColocSpot_area_sum (all)',
         ['ColocSpots_area_sum'],
         ['FITC-TxRed_all_area_sum']],

        ['Normalized coloc spots (by FITC & TxRed)',
         ['# of Coloc Spots'],
         ['# of FITC spots', '# of TxRed spots']],
        ['Normalized coloc spots (by FITC)',
         ['# of Coloc Spots'],
         ['# of FITC spots']],
        ['Normalized coloc spots (by TxRed)',
         ['# of Coloc Spots'],
         ['# of TxRed spots']],

        ['Normalized coloc spots (by FITC in coloc)',
         ['# of Coloc Spots'],
         ['# of FITC in ColocSpots']],
        ['Normalized coloc spots (by TxRed in coloc)',
         ['# of Coloc Spots'],
         ['# of TxRed in ColocSpots']],
        ['Normalized coloc spots (by FITC-TxRed in coloc)',
         ['# of Coloc Spots'],
         ['# of FITC-TxRed in ColocSpots']],
    ],
    # Function used to clean up column names.
    'colrename': rename_column,
}

In [5]:
# Locations used by the import: the uploaded archive, and the temporary
# working directory its contents are extracted into.
# (This cell only defines the paths; nothing is extracted here.)
zipfile_path = '/notebooks/add-data/data.zip'
extract_path = '/notebooks/tmp/extracted-data/'

In [6]:
def generate_message(tests):
    """ Build the message listing every failed test.

        tests: flat sequence alternating result and message, i.e.
            [result_1, message_1, result_2, message_2, ...].
        Returns the messages whose result equals False, joined by
        newlines; the string is empty when no result is False. """
    # Group the flat sequence into (result, message) pairs.
    pairs = partition(2, tests)
    failed_messages = [snd(pair) for pair in pairs if pair[0] == False]
    return '\n'.join(failed_messages)

In [7]:
def check(_):
    """ Extract the zip and check that it has the expected contents.

        Any previous extraction is removed, the archive at zipfile_path is
        unpacked into extract_path, and the result is checked for
        metadata.csv and for Plates/ and Layouts/ folders that are not
        empty. The cell output is then cleared and the outcome printed:
        either every failed check under an error banner, or
        "Ready to upload!".

        _: ignored; lets this function be registered as a click callback.
        Returns None (the outcome is printed, not returned). """

    # Clear out existing files. On the very first run the directory does
    # not exist yet, and rmtree would raise on a missing path.
    if os.path.isdir(extract_path):
        shutil.rmtree(extract_path)

    with zipfile.ZipFile(zipfile_path, "r") as z:
        z.extractall(extract_path)

    # Check files for correctness
    exists = exists_at_path(extract_path) # curried function

    def nonempty(entity):
        """ True unless `entity` is an existing but empty folder. """
        folder = os.path.join(extract_path, entity)
        # All tests below are evaluated up front, so this must not raise
        # when the folder is absent. A missing folder is already reported
        # by the matching `exists` test, so it counts as passing here.
        return not os.path.isdir(folder) or len(os.listdir(folder)) > 0

    initial_tests = \
        [exists('metadata.csv'), "File missing: metadata.csv",
         exists('Plates/'), "Folder missing: Plates",
         exists('Layouts/'), "Folder missing: Layouts",
         nonempty('Plates/'), "It looks like you haven't got any plates in your Plates folder.",
         nonempty('Layouts/'), "It looks like you haven't got any layouts in your Layouts folder."]

    err = generate_message(initial_tests)
    clear_output()
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    if err != '':
        print("### ERROR ###")
        print(err)
    else:
        print("Ready to upload!")

In [8]:
# Button labelled "Check data"; clicking it calls check().
stage_button = widgets.Button(description = "Check data")
stage_button.on_click(check)

In [9]:
# Show the button as this cell's output.
stage_button

In [10]:
# Read metadata.csv from the extraction directory.
# NOTE: that directory is only populated by check(), so the "Check data"
# button must have been clicked before this cell is run.
metadata_path = os.path.join(extract_path,'metadata.csv')
metadata = pd.read_csv(metadata_path)

In [11]:
# Display the metadata table.
metadata

Unnamed: 0,Plate File,Layout File,Assay,Image Collection Date,Investigator,Magnification,Image Analysis Recipe,Experiment Name
0,APB HS JS 60X 07.22.2015 ML216 MMC HU TS.txt,layout.csv,APB,7/22/2015,HS JS,60,APB HS JS 60X 07.22.2015 ML216 MMC HU TS,APB/ssC Drug Response
1,ssC HS JS 07.20.2015 ML216 MMC HU TS .txt,layout.csv,ssC,7/22/2015,HS JS,60,ssC HS JS 60X 07.22.2015 ML216 MMC HU TS,APB/ssC Drug Response


In [12]:
# Work through the first row of the metadata table by hand.
plate = metadata.iloc[0]

# The plate and layout files named in that row live in the 'Plates' and
# 'Layouts' sub-folders of the extraction directory.
plate_path = os.path.join(extract_path, 'Plates', plate['Plate File'])
layout_path = os.path.join(extract_path, 'Layouts', plate['Layout File'])

# Timestamp for this upload, taken from arrow's current time.
upload_timestamp = arrow.now().timestamp

In [13]:
# Import the first plate's file using the settings in plate_config.
plate_data = get_plate_data(plate_path,plate_config)

In [14]:
# Import the layout file that belongs to the first plate.
layout_data = get_layout_data(layout_path)

In [16]:
# Join plate and layout data on 'Well Name', then pass the joined table
# and the plate's metadata (as a dict) to add_dict_to_dataframe.
all_data = add_dict_to_dataframe(
    pd.merge(plate_data, layout_data, on='Well Name'),
    dict(plate))

In [None]:
# Series -> DataFrame
def gather_plate_data(plate_metadata):
    """ Given Series containing filepaths for plate and layout,
        import these files, join them, and add the series itself
        to create a master table for all the info about the plate.

        plate_metadata: Series for one plate; must provide the
            'Plate File' and 'Layout File' entries.
        Returns the plate and layout data merged on 'Well Name' after it
        has been passed through add_dict_to_dataframe (with the contents
        of plate_metadata) and add_col ('Upload Timestamp').
        NOTE(review): presumably a DataFrame with those values added as
        columns -- confirm against utils. """

    def get_path(directory, column):
        """ Path of the file named in `column`, inside `directory`
            of the extraction directory. """
        return os.path.join(
            extract_path,
            directory,
            plate_metadata[column])

    plate_path = get_path('Plates','Plate File')
    layout_path = get_path('Layouts','Layout File')

    plate_data = get_plate_data(plate_path,plate_config)
    layout_data = get_layout_data(layout_path)

    # Use the argument, not the notebook-level `plate`, so that every call
    # attaches the metadata of the plate it was actually given.
    # NOTE(review): `timestamp` is read as an attribute here; newer arrow
    # releases expose it as a method -- confirm the installed version.
    all_data = thread_first(
        pd.merge(plate_data,layout_data,on = 'Well Name'),
        (add_dict_to_dataframe,dict(plate_metadata)),
        (add_col,'Upload Timestamp',arrow.now().timestamp))

    return all_data