In [100]:
# TODO: remove unused functions from library / combine/separate libraries into better modules
# TODO: put zip file somewhere safe after it's been used.
# TODO: create regular backups to google docs or S3
# TODO: profile code to reduce upload times (strongly suspect df.apply() statements are bad. As in SID creation)
# TODO: add plate ID - one for each row in the provided metadata files. Should just be a randomly generated uuid.

In [101]:
import os
import shutil
import zipfile
import hashlib
from collections import OrderedDict

import arrow
from numpy.random import random
import pandas as pd
import IPython.html.widgets as widgets
from IPython.display import clear_output

from toolz import partition, partitionby, thread_last, thread_first
from utils import (snd, exists_at_path, get_layout_data, add_dict_to_dataframe,
                   add_col, maprows, format_num, from_file, format_timestamp, 
                   parse_label_group, string_only_contains, generate_sid)
    
from raw import get_plate_data
from conf import PATH

In [102]:
# String -> String
def rename_column(col):
    """ Normalize a raw column header.

        Keeps only the text before the first '(' (dropping any parenthetic
        suffix), removes a leading 'Cell:' prefix if present, then strips
        trailing forward slashes and surrounding spaces.

        'Cell:' is removed as a whole prefix. str.lstrip("Cell:") would
        instead strip any leading run of the characters 'C', 'e', 'l', ':'
        and so also eat the start of the real name
        (e.g. 'Cell:Cell Area' -> 'Area'). """
    prefix = 'Cell:'
    name = col.split('(')[0]
    if name.startswith(prefix):
        name = name[len(prefix):]
    # Order matters: trailing slashes are removed before surrounding spaces.
    return name.rstrip('/').strip(' ')

In [103]:
# Each entry is a 3-item list: a label string followed by two lists of
# column names.
# NOTE(review): presumably [new column name, numerator columns, denominator
# columns] for the normalization done in raw.get_plate_data -- confirm there.
normalization_config = [
    # Colocalized spot area
    ['Normalized_ColocSpot_area_sum (coloc)',
     ['ColocSpots_area_sum'],
     ['FITC-TxRed_coloc_area_sum']],
    ['Normalized_ColocSpot_area_sum (all)',
     ['ColocSpots_area_sum'],
     ['FITC-TxRed_all_area_sum']],

    # Coloc spot counts against overall spot counts
    ['Normalized coloc spots (by FITC & TxRed)',
     ['# of Coloc Spots'],
     ['# of FITC spots', '# of TxRed spots']],
    ['Normalized coloc spots (by FITC)',
     ['# of Coloc Spots'],
     ['# of FITC spots']],
    ['Normalized coloc spots (by TxRed)',
     ['# of Coloc Spots'],
     ['# of TxRed spots']],

    # Coloc spot counts against spot counts inside coloc spots
    ['Normalized coloc spots (by FITC in coloc)',
     ['# of Coloc Spots'],
     ['# of FITC in ColocSpots']],
    ['Normalized coloc spots (by TxRed in coloc)',
     ['# of Coloc Spots'],
     ['# of TxRed in ColocSpots']],
    ['Normalized coloc spots (by FITC-TxRed in coloc)',
     ['# of Coloc Spots'],
     ['# of FITC-TxRed in ColocSpots']],
]

In [104]:
# Settings passed to get_plate_data for every plate file (see gather_plate_data).
# NOTE(review): the meaning of each key is defined by raw.get_plate_data,
# which is not visible here -- confirm against that module.
plate_import_config = dict(
    delimiter = '\t',
    skiprows = 4,
    # The second entry is written like a regular expression (a dot followed by
    # digits at the very end of the string). It is a raw string so the
    # backslashes are not treated as (invalid) string escapes; the value is
    # unchanged.
    dropcols = ['Laser focus score',
                r'\.[0-9]*\Z'],
    colrename = rename_column,
    normcols = normalization_config)

In [133]:
# Working locations, all rooted at PATH.
# (The trailing comments keep the paths that were noted next to each line.)
zipfile_path = os.path.join(PATH, 'raw', 'data.zip')  # '/notebooks/add-data/data.zip'
extract_path = os.path.join(PATH, 'data')             # '/notebooks/tmp/extracted-data/'
temp_save_path = os.path.join(PATH, 'data')           # '/notebooks/tmp/imported-data/'
db_path = os.path.join(PATH, 'db', 'db.csv')          # '/notebooks/moldev-data/db/db.csv'
# NOTE(review): extract_path and temp_save_path are the same directory, so
# anything that clears temp_save_path also removes the extracted upload.

# Make sure every top-level working folder exists.
folders = ['raw', 'data', 'db', 'tmp']
for folder in folders:
    folder_path = os.path.join(PATH, folder)
    if os.path.exists(folder_path):
        continue
    print('Creating "%s".' % folder_path)
    os.mkdir(folder_path)
        


In [106]:
# String -> String
def computeMD5hash(string):
    """ Return the hexadecimal MD5 digest of string, encoded as UTF-8. """
    return hashlib.md5(string.encode('utf-8')).hexdigest()

In [107]:
# Series -> String
def generate_cell_sid(cell_data):
    """ Build the string id for one cell.

        The 'Plate ID', 'Well Name', 'Site ID' and 'Cell ID' entries of
        cell_data are converted to strings, joined with single spaces,
        MD5-hashed, and prefixed with 'CELL_'. """
    key_columns = ['Plate ID', 'Well Name', 'Site ID', 'Cell ID']
    key_parts = [str(value) for value in cell_data[key_columns].tolist()]
    return 'CELL_' + computeMD5hash(' '.join(key_parts))

In [108]:
# Series -> DataFrame
def gather_plate_data(plate_metadata):
    """ Build the master table for a single plate.

        plate_metadata must provide 'Plate File' and 'Layout File' entries
        naming files inside the 'Plates' and 'Layouts' folders of
        extract_path (presumably one row of metadata.csv -- see check()).

        The plate file is imported with get_plate_data and every row gets a
        'Cell SID' key; the layout file is imported with get_layout_data and
        given a 'Condition' column; the two tables are merged on 'Well Name';
        finally the metadata entries are attached with add_dict_to_dataframe.
        Returns the resulting table. """

    # String -> String -> String
    def get_path(directory,column):
        """ Path of the file named in the given metadata column, inside the
            given folder of the extracted upload.

            (i.e. go to folder X and get file found in column Y of metadata file.)"""
        return os.path.join(extract_path, directory, plate_metadata[column])

    # Plain calls instead of thread_last(..., (apply, get_path), ...):
    # the `apply` builtin does not exist in Python 3.
    plate_data = get_plate_data(get_path('Plates','Plate File'),
                                plate_import_config)

    # Add string ID for use as primary key
    plate_data['Cell SID'] = plate_data.apply(generate_cell_sid,
                                              axis = 1)

    # TODO (disabled): plate_data['Plate SID'] = "Plate_{}".format(generate_sid())
    layout_data = get_layout_data(get_path('Layouts','Layout File'))

    # Series -> String
    def concatStrings(series):
        """ Concatenate values in all but first column. """
        return ' '.join([str(x) for x in series.values[1:]])

    layout_data['Condition'] = layout_data.apply(concatStrings, axis = 1)

    return add_dict_to_dataframe(
        pd.merge(plate_data,layout_data,on = 'Well Name'),
        dict(plate_metadata))

In [109]:
# DataFrame -> DataFrame -> [Timestamp]
def find_uploads_with_duplicate_cells(db_dataframe,new_dataframe):
    """ Find which earlier uploads already contain cells from new_dataframe.

        db_dataframe is the central store and new_dataframe the data about to
        be added; both need a 'Cell SID' column, and db_dataframe also needs
        'Upload Timestamp'.

        Returns the distinct 'Upload Timestamp' values of the rows in
        db_dataframe whose 'Cell SID' also occurs in new_dataframe
        (empty when no cell is duplicated). """
    already_stored = db_dataframe['Cell SID'].isin(new_dataframe['Cell SID'])
    return db_dataframe.loc[already_stored, 'Upload Timestamp'].unique()

In [110]:
def generate_message(tests):
    """ Collect the messages of all failed tests.

        tests is a flat sequence alternating outcome, message, outcome,
        message, ... (a trailing unpaired item is ignored). Returns the
        messages whose outcome equals False, joined by newlines; the result
        is '' when nothing failed. """
    flat = iter(tests)
    failed_messages = []
    # zip over the same iterator twice walks the items in consecutive pairs.
    for outcome, message in zip(flat, flat):
        if outcome == False:
            failed_messages.append(message)
    return '\n'.join(failed_messages)

In [111]:
def check(_):
    """ Extract the uploaded zip and stage its contents for import.

        Registered as a button click handler; the argument is ignored.

        Steps:
          1. Replace extract_path with the contents of zipfile_path.
          2. Check that metadata.csv, Plates/ and Layouts/ are present and
             that both folders are non-empty. If not, print the problems
             and stop.
          3. Build one combined table from metadata.csv via
             gather_plate_data and add an 'Upload Timestamp' column.
          4. Report whether any of its cells are already in the db file
             (a missing db file means nothing has been uploaded yet).
          5. Write the table to new_data.csv under temp_save_path.

        All feedback is printed; the function returns None. """

    if os.path.exists(extract_path):
        shutil.rmtree(extract_path) # clear out existing files

    with zipfile.ZipFile(zipfile_path, "r") as z:
        z.extractall(extract_path)

    # Check files for correctness
    exists = exists_at_path(extract_path) # curried function

    def nonempty(entity):
        """ True unless the folder exists and contains nothing. """
        # The test list below is built eagerly, so this runs even when the
        # folder is missing. Listing a missing folder would raise before any
        # message is printed; treat it as OK here and let the matching
        # "Folder missing" test report it.
        entity_path = os.path.join(extract_path, entity)
        return not os.path.isdir(entity_path) or len(os.listdir(entity_path)) > 0

    initial_tests = \
        [exists('metadata.csv'), "File missing: metadata.csv",
         exists('Plates/'), "Folder missing: Plates",
         exists('Layouts/'), "Folder missing: Layouts",
         nonempty('Plates/'), "It looks like you haven't got any plates in your Plates folder.",
         nonempty('Layouts/'), "It looks like you haven't got any layouts in your Layouts folder."]

    err = generate_message(initial_tests)
    clear_output()

    if err != '':
        print("### ERROR ###")
        print(err)
        return

    # Read metadata, dropping rows and columns that are entirely empty
    metadata_path = os.path.join(extract_path,'metadata.csv')
    metadata = pd.read_csv(metadata_path).dropna(how='all',axis=0).dropna(how='all',axis=1)

    # Get all data: one table per metadata row, concatenated
    all_data = pd.concat(maprows(gather_plate_data, metadata))

    # NOTE(review): relies on `timestamp` being a property of the arrow
    # object; confirm this holds for the installed arrow version.
    all_data['Upload Timestamp'] = arrow.now().timestamp

    # Check for duplicated cells
    try:
        if os.path.exists(db_path):
            db_dataframe = pd.read_csv(db_path)
            duplicate_timestamps = find_uploads_with_duplicate_cells(db_dataframe,all_data)
        else:
            # No db file yet: this is the first upload, nothing can clash.
            duplicate_timestamps = []

        if len(duplicate_timestamps) > 0:
            for ts in duplicate_timestamps:
                uploaded_at = arrow.get(ts).to('US/Pacific').format('MMMM DD, YYYY, h:mm a')
                time_ago = arrow.get(ts).humanize()
                print("It looks like you already uploaded some of this data on {} ({})".format(uploaded_at,time_ago))
            print("If you'd like to overwrite this data, you'll need to remove the data for these dates first.")
        else:
            print("Ready to upload {} cells!".format(format_num(len(all_data))))
    except Exception as e:
        # Best effort: a problem with the duplicate check is reported but
        # does not prevent staging; add_new_data checks duplicates again.
        print(e)

    # Save data to temporary location
    # NOTE: if temp_save_path is the same directory as extract_path, this
    # also discards the extracted files.
    if os.path.exists(temp_save_path):
        shutil.rmtree(temp_save_path)
    os.makedirs(temp_save_path)
    all_data.to_csv(os.path.join(temp_save_path,'new_data.csv'),
                    index=False)
        
        

In [112]:
# Button labelled "Check data"; clicking it calls check().
stage_button = widgets.Button(description = "Check data")
stage_button.on_click(check)

# Prepare and check data
This step takes around 3 minutes.

In [113]:
# Show the "Check data" button as this cell's output.
stage_button

It looks like you already uploaded some of this data on April 23, 2016, 5:56 pm (an hour ago)
If you'd like to overwrite this data, you'll need to remove the data for these dates first.


In [119]:
def add_new_data(_):
    """ Append the staged data (new_data.csv under temp_save_path) to the db file.

        Registered as a button click handler; the argument is ignored.

        Nothing is written when:
          - an existing db file cannot be read (writing would otherwise
            replace it with the new data only), or
          - any 'Cell SID' would occur more than once in the combined table.

        A missing db file is treated as an empty db (first upload).
        All feedback is printed; the function returns None. """
    if os.path.exists(db_path):
        try:
            db_data = [pd.read_csv(db_path)]
        except Exception as e:
            clear_output()
            print("Could not read the existing data, so nothing was saved.")
            print(e)
            return
    else:
        db_data = []

    new_data = pd.read_csv(os.path.join(temp_save_path,'new_data.csv'))
    all_data = pd.concat(db_data+[new_data])
    contains_duplicated_cells = all_data.duplicated('Cell SID').any()

    clear_output()

    if contains_duplicated_cells:
        print("It looks like the data's already been added.")
    else:
        all_data.to_csv(db_path,index = False)
        print("Just saved data!")

# Save data
Once the data's been checked for correctness (all files are present, and none of the data has already been uploaded), save it!

In [120]:
# Button labelled "Save data"; clicking it calls add_new_data().
save_button = widgets.Button(description = "Save data",background_color='Green',color = 'white')
save_button.on_click(add_new_data)
# Bare expression so the button is shown as the cell's output.
save_button

It looks like the data's already been added.


# Delete past uploads
If you've made a mistake and need to delete something you've uploaded, this is the place to do it. Just select the upload that you'd like to remove and click `Delete`.

In [121]:
# Snapshot of the db taken when this cell runs; it is only used to list the
# upload timestamps offered in the dropdown.
db_data = pd.read_csv(db_path)
timestamps = db_data['Upload Timestamp'].unique()

# Dropdown entries, newest upload first: formatted timestamp -> raw timestamp.
delete_options = OrderedDict(
    (format_timestamp(ts), ts)
    for ts in sorted(list(timestamps), reverse=True))

delete_dropdown = widgets.Dropdown(options = delete_options)

def delete_handler(_):
    """ Remove data uploaded at selected timestamp.

        Registered as a button click handler; the argument is ignored.
        The db file is re-read on every click. Reusing the snapshot taken
        when the cell ran would drop uploads saved since then and would
        bring back uploads deleted by an earlier click. """
    timestamp = delete_dropdown.value
    current_data = pd.read_csv(db_path)
    trimmed_data = current_data[current_data['Upload Timestamp'] != timestamp]
    trimmed_data.to_csv(db_path,index=False)
    clear_output()
    print("Just deleted data.")

delete_button = widgets.Button(description = 'Delete',background_color='Red',color = 'white')
delete_button.on_click(delete_handler)
# Bare expression so the button and dropdown are shown side by side.
widgets.HBox(children = [delete_button,delete_dropdown])

Just deleted data.


In [None]:
# testpath = '/notebooks/tmp/extracted-data/Plates/APB HS JS (60X) 08.06.2015 siRNA VE821.txt'
# test = get_plate_data(testpath,plate_import_config)

In [None]:
# layouttest = get_layout_data('/notebooks/tmp/extracted-data/Layouts/layout.csv')

In [None]:
# test['Well Name'].unique()
# test2 = pd.read_csv('/notebooks/tmp/imported-data/new_data.csv')

In [127]:
# String -> [String]
# def split_on_newlines(string):
#     """ Given a string which may contain \r, \n, or both, 
#         split on newlines so neither character is present in output. """
    
#     r = '\r' in string
#     n = '\n' in string
    
#     if r and n: 
#         return string.replace('\r','').split('\n')
#     elif r:
#         return string.split('\r')
#     else:
#         return string.split('\n')

In [124]:
# l2 = thread_last(
#      '/notebooks/tmp/extracted-data/Layouts/layout.csv',
#      from_file,
#      lambda string: string.replace('\r','').split('\n'),
#      (map,lambda line: line.rstrip(',')),
#      (partitionby, lambda line: string_only_contains(line,',')),
#      (filter,lambda group: not string_only_contains(group[0],',')),
#      (map,lambda strings: str.join('\n',strings)),
#      (map,parse_label_group),
#      (reduce,lambda left,right: pd.merge(left,right,on='Well Name')))

TypeError: 'tuple' object is not callable

In [129]:
# # String -> Boolean
# def string_is_empty(string):
#     """ Return True if string is empty. """
#     return string == ''

In [132]:
# l2 = thread_last(
#     os.path.join(PATH, 'data', 'Layouts', 'layout.csv'),
#     from_file,
#     split_on_newlines,
#     (map,lambda line: line.rstrip(',')),
#     (partitionby, string_is_empty),
#     (filter,lambda group: not string_is_empty(group[0])),
#     (map,lambda strings: str.join('\n',strings)),
#     (map,parse_label_group),
#     (reduce,lambda left,right: pd.merge(left,right,on='Well Name')))

IOError: [Errno 2] No such file or directory: '/home/cabal/gits/assay-explorer/data/Layouts/layout.csv'

In [None]:
# for x in l2['Units (concentration)'].unique():
#     print x

In [None]:
# list(l2)