In [18]:
import os
import re
import shutil
import zipfile
from functools import reduce

import pandas as pd
import toolz as tz

try:  # Python 2
    from StringIO import StringIO
except ImportError:  # Python 3
    from io import StringIO

# Top Level Functions

In [44]:
def check_files(path):
    """ ----------------
        String -> String
        ----------------
        Given path to uploaded files, check necessary
        files are present. Return error string ('' if no errors.)

        Each failed check contributes one line to the result. A folder
        that is missing is reported only as missing; the "no plates" /
        "no layouts" messages are produced only for a folder that exists
        but is empty.
    """
    to_path = lambda f: os.path.join(path, f)
    exists = lambda f: os.path.exists(to_path(f))
    # Guarding on isdir keeps os.listdir from raising when the folder is
    # absent; every check below is evaluated eagerly while building the list.
    is_empty_dir = lambda f: os.path.isdir(to_path(f)) and len(os.listdir(to_path(f))) == 0

    # (check passed?, message to report when it did not)
    checks = [
        (exists('metadata.csv'), "File missing: metadata.csv"),
        (exists('Plates/'), "Folder missing: Plates"),
        (exists('Layouts/'), "Folder missing: Layouts"),
        (not is_empty_dir('Plates/'), "It looks like you haven't got any plates in your Plates folder."),
        (not is_empty_dir('Layouts/'), "It looks like you haven't got any layouts in your Layouts folder."),
    ]

    return '\n'.join(message for passed, message in checks if not passed)

def unzip(zipped_path, unzipped_path):
    """ ------------------------------------------
        String -> String -> SideEffect(FileSystem)
        ------------------------------------------
        Given path to zipped file, unzip and write to
        unzipped path.

        If unzipped_path already exists it is deleted recursively first,
        so the result contains only the archive's contents.
    """
    # Clear out existing files (requires `shutil`, imported at the top of the file).
    if os.path.exists(unzipped_path):
        shutil.rmtree(unzipped_path)

    # Unzip; extractall creates unzipped_path if it does not exist.
    with zipfile.ZipFile(zipped_path, "r") as z:
        z.extractall(unzipped_path)
        
def get_plate_data(path):
    """ -------------------
        String -> DataFrame
        -------------------
        Get plate data, drop empty columns, drop selected columns,
        rename columns.

        The file at `path` is read as tab-delimited text after skipping
        the first 4 rows; carriage returns are removed before parsing.
    """
    # Plain values: a trailing comma after any of these would silently
    # turn it into a one-element tuple.
    delimiter = '\t'
    skiprows = 4
    # Regex patterns (used with re.search) naming columns to discard.
    dropcols = ['Laser focus score',
                r'\.[0-9]*\Z']

    def rename_column(col):
        """ Rename column col: drop everything from the first '(' onwards,
            remove a leading 'Cell:' prefix, then trailing '/' and
            surrounding spaces. """
        name = col.split('(')[0]
        if name.startswith('Cell:'):
            # Slice the prefix off; str.lstrip("Cell:") would strip any run
            # of the characters C, e, l and ':' and eat into the real name.
            name = name[len('Cell:'):]
        return name.rstrip('/').strip(' ')

    # thread_first only understands callables and (func, extra_args...) tuples,
    # so steps that need keyword arguments are wrapped in lambdas.
    return tz.thread_first(
        path,
        from_file,
        lambda text: text.replace('\r', ''),
        StringIO,
        lambda buf: pd.read_csv(buf, delimiter=delimiter, skiprows=skiprows),
        lambda frame: frame.dropna(axis=1, how='all'),
        (drop_matching_columns, dropcols),
        lambda frame: frame.rename(columns=rename_column))

def gather_plate_data(plate_metadata, data_dir=None):
    """ -------------------
        Series -> DataFrame
        -------------------
        Given paths to plate and layout files,
        combine their contents into one dataframe.

        plate_metadata : mapping/Series with at least the keys
                         'Plate File' and 'Layout File'.
        data_dir       : folder containing the 'Plates' and 'Layouts'
                         sub-folders. When omitted, the module-level
                         variable `unzipped_path` is used, which must
                         then be defined before this function is called.

        Returns the plate data merged with the layout data on 'Well Name',
        plus a 'Condition' column and one constant column per metadata entry.
    """
    root = unzipped_path if data_dir is None else data_dir

    plate_path = os.path.join(root, 'Plates', plate_metadata['Plate File'])
    plate_data = get_plate_data(plate_path)

    # The layout must be read from the layout file, not the plate file.
    layout_path = os.path.join(root, 'Layouts', plate_metadata['Layout File'])
    layout_data = get_layout_data(layout_path)

    # Condition = space-joined values of every layout column except the first
    # (the first column of get_layout_data's result is 'Well Name').
    generate_condition_string = lambda series: ' '.join([str(x) for x in series.values[1:]])
    layout_data['Condition'] = layout_data.apply(generate_condition_string, axis=1)

    merged = pd.merge(plate_data, layout_data, on='Well Name')
    # Add all plate metadata to the final dataframe.
    return add_dict_to_dataframe(merged, dict(plate_metadata))

def get_layout_data(path):
    """ -------------------
        String -> DataFrame
        -------------------
        Read a layout file and return a dataframe with a 'Well Name'
        column plus one column per parameter found in the file.

        Expected format, one section per parameter:
                Parameter Name, 1, 2 ...
                A, Value, Value ...
                B, Value, Value ...
        Notes: carriage returns appear in csv output from windows (or
               google docs) and can confuse pandas `read_csv`, so the text
               is split into lines by `split_on_newlines` first.
               Trailing commas are stripped from every line, lines are
               grouped by whether they are empty (sections should be
               separated by a blank line), and groups that start with an
               empty line are discarded.
    """
    stripped_lines = [line.rstrip(',')
                      for line in split_on_newlines(from_file(path))]

    sections = [
        section
        for section in tz.partitionby(lambda s: s == '', stripped_lines)
        if not string_is_empty(section[0]) and section[0] != ''
    ]

    # One tidy ['Well Name', parameter] frame per section.
    label_frames = [
        parse_label_group(pd.read_csv(StringIO('\n'.join(section))))
        for section in sections
    ]

    def merge_on_well(left, right):
        return pd.merge(left, right, on='Well Name')

    return reduce(merge_on_well, label_frames)

def get_metadata(f):
    """ -------------------
        String -> DataFrame
        -------------------
        Read the csv at `f` with pandas and pass the result through
        `drop_empty`.
    """
    raw_metadata = pd.read_csv(f)
    return drop_empty(raw_metadata)

def parse_label_group(df):
    """ -------------------------------------------
        DataFrame -> DataFrame['Well Name', Parameter]
        -------------------------------------------
        Takes the dataframe for one layout section and returns a tidy
        dataframe with two columns: 'Well Name' and the section's
        parameter (named after the first column header of `df`).
    """
    row_letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    label_name = df.columns[0]

    # Everything except the first column; columns are relabelled 1..N
    # through `stringify(num, 2)`.
    cell_values = df.values[:, 1:]
    column_labels = [stringify(num, 2)
                     for num in range(1, cell_values.shape[1] + 1)]
    wide = pd.DataFrame(cell_values, columns=column_labels)

    # Rows are lettered by position: A for the first row, B for the second, ...
    wide['Row'] = pd.Series(row_letters[:len(wide)])

    # Long format: one record per (row letter, column label) pair.
    tidy = pd.melt(wide, id_vars=['Row'])
    tidy['Well Name'] = tidy['Row'] + tidy['variable']
    tidy = tidy.drop(['Row', 'variable'], axis=1)
    tidy = tidy.rename(columns={'value': label_name})
    return tidy[['Well Name', label_name]]

# Utils

In [43]:
def split_on_newlines(string):
    """ ---------------------
        String -> List String
        ---------------------
        Given a string which may contain carriage returns (CR), line
        feeds (LF), or both, split on newlines so neither character is
        present in output.

        CR+LF pairs, lone CRs and lone LFs each count as one line break,
        including when they are mixed within the same string. An empty
        string yields [''] and a trailing line break yields a trailing ''.
    """
    # Collapse CR+LF first so a pair is not counted as two breaks, then
    # treat any remaining lone CR as a break as well.
    normalised = string.replace('\r\n', '\n').replace('\r', '\n')
    return normalised.split('\n')

def add_dict_to_dataframe(dataframe, my_dict):
    """ ----------------------------------
        DataFrame -> Dict a b -> DataFrame
        ----------------------------------
        Return dataframe with new column for each key-value pair.
        Values are repeated for all rows in a given column.

        The input dataframe is left untouched; a copy is extended and
        returned.
    """
    d = dataframe.copy()
    # .items() exists on both Python 2 and Python 3 (iteritems is Python 2 only).
    for k, v in my_dict.items():
        d[k] = v
    return d

def matches_any_pattern(s,patterns):
    """ -------------------------------
        String -> List Regex -> Boolean
        -------------------------------
        Return True if any of the patterns matches string s.

        Patterns are tried in order with re.search (a match anywhere in
        s counts) and evaluation stops at the first hit. An empty list
        of patterns gives False.
    """
    # Requires `re`, imported at the top of the file.
    return any(re.search(pattern, s) is not None for pattern in patterns)

def drop_matching_columns(df,patterns):
    """ ----------------------
        DataFrame -> DataFrame
        ----------------------
        Return df without the columns whose name matches at least one
        of the given patterns (as decided by `matches_any_pattern`).
    """
    unwanted = []
    for column_name in df.columns:
        if matches_any_pattern(column_name, patterns):
            unwanted.append(column_name)
    return df.drop(unwanted, axis=1)

def drop_empty(df):
    """ ----------------------
        DataFrame -> DataFrame
        ----------------------
        Drop rows that are entirely missing, then columns that are
        entirely missing.
    """
    without_empty_rows = df.dropna(how='all', axis=0)
    return without_empty_rows.dropna(how='all', axis=1)

def from_file(filename):
    """ ----------------
        String -> String
        ----------------
        Return contents of selected file.

        The file is opened in the default text mode and is closed even
        if reading fails.
    """
    with open(filename) as f:
        return f.read()

def add_col(dataframe, colname, values):
    """ ---------------------------------------------------------
        DataFrame -> String -> (a | [a] | Series[a]) -> DataFrame
        ---------------------------------------------------------
        Return a copy of dataframe with column `colname` set to the
        given values (a scalar is repeated for every row).

        The input dataframe is not modified, matching
        `add_dict_to_dataframe`.
    """
    # Work on a copy so callers that keep a reference to the input do not
    # see it change underneath them.
    extended = dataframe.copy()
    extended[colname] = values
    return extended



In [22]:
# Uploaded archive and the folder it is extracted into.
data_path = '/home/ubuntu/assay-explorer/data/07-20-2015-2016-04-23.zip'
unzip_path = '/home/ubuntu/assay-explorer/data/unzipped/'
# Root of the extracted dataset (holds metadata.csv, Plates/ and Layouts/).
# gather_plate_data reads this module-level name, so it must be defined here.
unzipped_path = os.path.join(unzip_path, '07-20-2015/')
meta_path = os.path.join(unzipped_path, 'metadata.csv')

In [13]:
# Extract the uploaded archive. unzip() deletes unzip_path first if it already exists.
unzip(data_path, unzip_path)

In [26]:
# Validate the extracted upload; an empty string means no problems were found.
check_files('/home/ubuntu/assay-explorer/data/unzipped/07-20-2015/')

''

In [33]:
# Load metadata.csv, with all-empty rows and columns dropped by get_metadata.
meta = get_metadata(meta_path)

In [None]:
# Gather the plate + layout data for every row of the metadata table and
# stack the per-plate frames into one dataframe. Uses `meta` (the metadata
# loaded above) and iterates rows with pandas directly.
all_data = pd.concat(
    [gather_plate_data(plate_row) for _, plate_row in meta.iterrows()])