# Descriptions of uploader classes and methods
'''
1) Determine FileType: xlsx, mex, h5ad, h5_from_10x
2) Based on filetype, create a Class object that is specific to the FileType
    - different filetypes require different processing and handling
    - class - xlsx
        - method - open and read excel file into an AnnData object
            - 1st option: use scanpy's read_excel(file)
                - Can't do. scanpy uses AnnData lib to read the excel file. AnnData's read.py is limited
                  to reading 1 sheet, which becomes '.X' of the AnnData object
                - Links:
                    scanpy - https://scanpy.readthedocs.io/en/latest/api/scanpy.api.read_excel.html#scanpy.api.read_excel
                    AnnData - https://github.com/theislab/anndata/blob/master/anndata/readwrite/read.py#L40
            - 2nd option: use pandas.read_excel to import all sheets
            - 3rd option: use python module (xlrd?) to parse out info
        - method? - validation checks
            - check which sheets are present.
                - required - 'expression' and 'observation'
                - optional - 'genes'
            - validate # observations in 'observation' sheet equals # observations in 'expression' sheet
            - validate observation names in 'observation' sheet equal those in 'expression' sheet
            - If 'genes' sheet present, validate # genes in 'genes' sheet equals # genes in 'expression' sheet
            - If 'genes' sheet present, validate gene names in 'genes' sheet equal those in 'expression' sheet
            - check 'expression' contains no strings in matrix
        - method - validate gene symbols / ensembl ids with gEAR MySQL
            - new Global function
        - method - calculate averages, standard_deviations, p-values from replicates
            - ? TODO ?: Add FDR and standard_errors ?
            - use new Global functions:
                - calc_average()
                - calc_std_dev()
                - calc_pval()
        - method - calculate coloring values
            - gene based color & abs color
            - tissue based color & abs color
            - dataset based color & abs color
    - class - mex
    - class - h5ad
    - class - h5_from_10x
3) Handle unstructured metadata (xlsx, txt, or json?)
'''

# In [3]:


#reference: http://python-3-patterns-idioms-test.readthedocs.io/en/latest/Factory.html#preventing-direct-creation

class FileType:
    """Common base class for the per-format uploader classes."""

    # Class-level list shared by every subclass; starts out empty.
    filetypes = list()
    
def upload_dataset(filetype):
    """Factory: return a new uploader object for the given ``filetype``.

    The per-format classes are nested inside this function so that they
    cannot be instantiated directly from module scope; callers always go
    through the factory, e.g.::

        dataset = upload_dataset('excel')

    which creates an Excel object that can then be processed for uploading.

    Args:
        filetype: one of "excel", "mex", "h5ad" or "hdf5".

    Returns:
        A fresh instance of the nested class matching ``filetype``
        (Excel, Mex, H5ad or H510x respectively).

    Raises:
        AssertionError: if ``filetype`` is not one of the recognized values.
    """
    class Excel(FileType):
        """Uploader for Excel workbooks (method bodies are still stubs)."""

        def _read_file(self):
            """Validate the dataset; reading the file is not implemented yet.

            3 options for reading:
              1. request scanpy writers to add multiple sheets to
                 scanpy.read_excel
              2. use pandas.read_excel to import all sheets
                 (3 separate DataFrames I think)
              3. suboptimal - use python module to parse out info (xlrd?)

            NOTE: The end format produced by _read_file() will determine a
                  lot of how this works and how reusable it is by the other
                  FileType objects (Mex, H5ad, H510x).

            Intended to return an AnnData object or pandas DataFrame;
            currently returns None.
            """
            validate_dataset(self)

        def _add_calculated_values(self):
            """Invoke the statistical helper functions (all stubs for now)."""
            calc_average()
            calc_pval()
            calc_std_dev()
            calc_std_err()  # TODO?
            calc_fdr()  # TODO?

        def _add_color_values(self):
            """Invoke the raw and absolute coloring helpers (all stubs)."""
            get_color_gene()
            get_color_tissue()
            get_color_dataset()
            get_color_abs_gene()
            get_color_abs_tissue()
            get_color_abs_dataset()

    class Mex(FileType):
        """Uploader for the "mex" file type (not implemented yet)."""

    class H5ad(FileType):
        """Uploader for the "h5ad" file type (not implemented yet)."""

    class H510x(FileType):
        """Uploader for the "hdf5" file type (not implemented yet)."""

    uploaders = {
        "excel": Excel,
        "mex": Mex,
        "h5ad": H5ad,
        "hdf5": H510x,
    }
    try:
        uploader_cls = uploaders[filetype]
    except (KeyError, TypeError):
        # Raised explicitly rather than via `assert` so the check is not
        # stripped under `python -O` (which would make the factory return
        # None silently). AssertionError is kept as the exception type so
        # existing callers that catch it continue to work. TypeError covers
        # unhashable arguments; .format() keeps the message from failing on
        # non-string input.
        raise AssertionError(
            "Do not recognize file type given: {}".format(filetype)
        ) from None
    return uploader_cls()



# ----- helper functions ----- #
# How these are performed depends on whether UploadFileTypeObject is AnnData vs pandas DataFrames

def validate_dataset(UploadFileTypeObject):
    """Placeholder for the dataset validation checks (currently a no-op).

    Planned checks:
      1. 'expression' and 'observation' sheets present? Is optional sheet
         'genes' present?
      2. # rows in 'observation' == # columns in 'expression'
      3. names in 'observation' == names in 'expression'
      4. # rows in 'genes' == # rows in 'expression' (if 'genes' present)
      5. names in 'genes' == names in 'expression' (if 'genes' present)
      6. 'expression' sheet lacks string values

    The intent is to raise an error if any of the above tests fail; nothing
    is checked or raised yet.
    """
    return None
        
def validate_genes(UploadFileTypeObject):
    """Placeholder for gene validation against gEAR MySQL (currently a no-op).

    Planned steps, to run after the data file is loaded as an object:
      1. Connect to gEAR MySQL and create a cache of genes (denoting primary
         and secondary symbols).
      2. Use the cached genes to generate a list of genes that:
           A. are not in the database
           B. are duplicated secondary gene symbols (the primary is already
              present)
      3. Return the list to the user of what genes were skipped.

    Optional: this function could also remove the not-found/skipped genes.
    """
    return None

def calc_average():
    """Placeholder for the average calculation; not implemented yet."""
    return None

def calc_pval():
    """Placeholder for the p-value calculation; not implemented yet."""
    return None

def calc_std_dev():
    """Placeholder for the standard-deviation calculation; not implemented yet."""
    return None

def calc_std_err():
    """Placeholder for the standard-error calculation; not implemented yet.

    TODO: Are we doing this? It was requested by Seth a ways back.
    """
    return None
def calc_fdr():
    """Placeholder for the FDR (corrected p-value) calculation; not implemented yet.

    TODO: Are we doing this?
    """
    return None

def get_color_gene():
    """Placeholder for the gene-based color value; not implemented yet."""
    return None
def get_color_tissue():
    """Placeholder for the tissue-based color value; not implemented yet."""
    return None
def get_color_dataset():
    """Placeholder for the dataset-based color value; not implemented yet."""
    return None

def get_color_abs_gene():
    """Placeholder for the gene-based absolute color value; not implemented yet."""
    return None
def get_color_abs_tissue():
    """Placeholder for the tissue-based absolute color value; not implemented yet."""
    return None
def get_color_abs_dataset():
    """Placeholder for the dataset-based absolute color value; not implemented yet."""
    return None

# Create an excel object via the factory; the Excel class is local to
# upload_dataset(), so the factory is the only way to obtain an instance.
x = upload_dataset('excel')
# Bare expression: presumably the last line of a notebook cell, where it
# displays the object's repr. In a plain script it has no effect.
x

# Out: <__main__.upload_dataset.<locals>.Excel at 0x7fb0d61d6f98>