In [30]:
# TODO: remove unused functions from the library, and combine or split
# the libraries into better-organized modules.

In [31]:
import os
import zipfile
import shutil
import arrow
import hashlib
import pandas as pd
import IPython.html.widgets as widgets

from toolz import \
    partition,\
    thread_last,\
    thread_first
    
from utils import \
    snd,\
    exists_at_path,\
    get_layout_data,\
    add_dict_to_dataframe,\
    add_col,\
    maprows,\
    format_num
    
from IPython.display import \
    clear_output

from raw import \
    get_plate_data

In [32]:
# String -> String
def rename_column(col):
    """ Clean up a raw column name.

        Everything from the first '(' onwards is dropped (the large
        parenthetic suffix), a leading 'Cell:' prefix is removed, then
        trailing forward slashes and finally surrounding spaces are stripped.

        col: the raw column name.
        Returns the cleaned column name. """
    prefix = 'Cell:'
    name = col.split('(')[0]
    if name.startswith(prefix):
        # Slice the prefix off. str.lstrip(prefix) would strip any run of the
        # characters 'C', 'e', 'l', ':' and so eat into names such as
        # 'Cell:Cell Area'.
        name = name[len(prefix):]
    return name.rstrip('/').strip(' ')

In [33]:
# Import options handed to get_plate_data together with each plate file path.
plate_import_config = dict(
    delimiter = '\t',
    skiprows = 4,
    # Second entry looks like a regex for a trailing '.<digits>' suffix
    # (presumably interpreted as a pattern by get_plate_data -- TODO confirm).
    # Written as a raw string so the backslashes are not treated as
    # (invalid) string escapes; the resulting value is unchanged.
    dropcols = ['Laser focus score',
                r'\.[0-9]*\Z'],
    colrename = rename_column)

In [34]:
# Paths used by check(): the zip archive to read, the working directory it is
# extracted into, and the directory where the combined csv is written.
# Both directories are deleted and recreated on every check.
zipfile_path = '/notebooks/add-data/data.zip'
extract_path = '/notebooks/tmp/extracted-data/'
temp_save_path = '/notebooks/tmp/imported-data/'

In [35]:
# Series -> String
def generate_cell_sid(cell_data):
    """ Build the string id for a single cell.

        The 'Plate ID', 'Well Name', 'Site ID' and 'Cell ID' entries of
        cell_data are converted to strings and joined with single spaces;
        the id is 'CELL_' followed by the MD5 hex digest of that text
        (UTF-8 encoded). """

    key_columns = ['Plate ID', 'Well Name', 'Site ID', 'Cell ID']

    # Text that identifies the cell, e.g. "<plate> <well> <site> <cell>"
    key_values = cell_data[key_columns].tolist()
    key_text = ' '.join(str(value) for value in key_values)

    digest = hashlib.md5(key_text.encode('utf-8')).hexdigest()
    return 'CELL_' + digest

In [36]:
# Series -> DataFrame
def gather_plate_data(plate_metadata):
    """ Build the master table for one plate.

        plate_metadata: Series whose 'Plate File' and 'Layout File' entries
            name files inside the 'Plates' and 'Layouts' folders of
            extract_path (presumably one row of metadata.csv -- see check).

        The plate file is imported with get_plate_data using
        plate_import_config, every row gets a 'Cell SID', and the layout
        (from get_layout_data) is merged in on 'Well Name'. The metadata
        entries and the current 'Upload Timestamp' are then passed to
        add_dict_to_dataframe and add_col respectively.

        Returns the resulting DataFrame. """

    # String -> String -> String
    def get_path(directory, column):
        return os.path.join(extract_path,
                            directory,
                            plate_metadata[column])

    # Helpers are called directly: the `apply` builtin used previously is
    # deprecated and no longer exists in Python 3.
    plate_data = get_plate_data(get_path('Plates', 'Plate File'),
                                plate_import_config)

    # Add string ID for use as primary key
    plate_data['Cell SID'] = plate_data.apply(generate_cell_sid,
                                              axis = 1)

    layout_data = get_layout_data(get_path('Layouts', 'Layout File'))

    # NOTE(review): pd.merge defaults to an inner join, so cells whose
    # 'Well Name' is absent from the layout are dropped -- confirm intended.
    merged = pd.merge(plate_data, layout_data, on = 'Well Name')
    upload_timestamp = arrow.now().timestamp

    with_metadata = add_dict_to_dataframe(merged, dict(plate_metadata))
    return add_col(with_metadata, 'Upload Timestamp', upload_timestamp)

In [37]:
def generate_message(tests):
    """ Collect the messages of every failed test.

        tests is a flat sequence that alternates result, message,
        result, message, ... It is split into pairs, the pairs whose
        result equals False are kept, and snd of each kept pair
        (presumably the message) is joined with newlines.

        Returns the joined string, which is '' when nothing failed. """
    failed_pairs = [pair
                    for pair in partition(2, tests)
                    if pair[0] == False]
    return '\n'.join(map(snd, failed_pairs))

In [38]:
def check(_):
    """ Extract zip and prepare for import into main dataset.
        If data can be imported, then create a new csv in a temp directory.

        The extraction directory is wiped and refilled from zipfile_path,
        the extracted layout is validated, and on success the per-plate
        tables are combined and written to 'new_data.csv' in temp_save_path.

        Any errors, or a confirmation with the number of cells, are printed;
        nothing is returned. The single argument is ignored (the function is
        registered as a button click callback). """

    if os.path.exists(extract_path):
        shutil.rmtree(extract_path) # clear out existing files

    with zipfile.ZipFile(zipfile_path, "r") as z:
        z.extractall(extract_path)

    # Check files for correctness
    exists = exists_at_path(extract_path) # curried function

    # String -> Bool
    def nonempty(entity):
        # The list of tests below is built eagerly, so this runs even when
        # the folder is absent. Answer False in that case instead of letting
        # os.listdir raise and hide the "Folder missing" message.
        folder = os.path.join(extract_path, entity)
        return os.path.isdir(folder) and len(os.listdir(folder)) > 0

    initial_tests = \
        [exists('metadata.csv'), "File missing: metadata.csv",
         exists('Plates/'), "Folder missing: Plates",
         exists('Layouts/'), "Folder missing: Layouts",
         nonempty('Plates/'), "It looks like you haven't got any plates in your Plates folder.",
         nonempty('Layouts/'), "It looks like you haven't got any layouts in your Layouts folder."]

    err = generate_message(initial_tests)
    clear_output()

    if err != '':
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("### ERROR ###")
        print(err)
    else:
        metadata_path = os.path.join(extract_path,'metadata.csv')
        metadata = pd.read_csv(metadata_path)
        all_data = thread_last(
            metadata,
            (maprows,gather_plate_data),
            pd.concat)

        # Start from an empty save directory so stale output never lingers
        if os.path.exists(temp_save_path):
            shutil.rmtree(temp_save_path)
        os.makedirs(temp_save_path)
        all_data.to_csv(os.path.join(temp_save_path,'new_data.csv'),
                        index=False)
        print("Ready to upload {} cells!".format(format_num(len(all_data))))

In [39]:
# Button labelled "Check data"; clicking it runs check(), which receives the
# clicked widget as its (ignored) argument.
stage_button = widgets.Button(description = "Check data")
stage_button.on_click(check)

In [40]:
# Show the button as this cell's output.
stage_button

Ready to upload 139,070 cells!
