# Descriptions of uploader classes and methods
'''
1) Determine FileType: xlsx, mex, h5ad, h5_from_10x
2) Based on filetype, create a Class object that is specific to the FileType
    - different filetypes require different processing and handling
    - class - xlsx
        - method - open and read excel file into an AnnData object
            - 1st option: use scanpy's read_excel(file)
                - Can't do. scanpy uses AnnData lib to read the excel file. AnnData's read.py is limited
                  to reading 1 sheet, which becomes '.X' of the AnnData object
                - Links:
                    scanpy - https://scanpy.readthedocs.io/en/latest/api/scanpy.api.read_excel.html#scanpy.api.read_excel
                    AnnData - https://github.com/theislab/anndata/blob/master/anndata/readwrite/read.py#L40
            - 2nd option: use pandas.read_excel to import all sheets
            - 3rd option: use python module (xlrd?) to parse out info
        - method? - validation checks
            - check which sheets are present.
                - required - 'expression' and 'observation'
                - optional - 'genes'
            - validate # observations in 'observation' sheet equals # observations in 'expression' sheet
            - validate observation names in 'observation' sheet equal those in 'expression' sheet
            - If 'genes' sheet present, validate # genes in 'genes' sheet equals # genes in 'expression' sheet
            - If 'genes' sheet present, validate gene names in 'genes' sheet equal those in 'expression' sheet
            - check 'expression' contains no strings in matrix
        - method - validate gene symbols / ensembl ids with gEAR MySQL
            - new Global function
        - method - calculate averages, standard_deviations, p-values from replicates
            - ? TODO ?: Add FDR and standard_errors ?
            -use new Global functions:
                - calc_average()
                - calc_std_dev()
                - calc_pval()
        - method - calculate coloring values
            - gene based color & abs color
            - tissue based color & abs color
            - dataset based color & abs color
    - class - mex
    - class - h5ad
    - class - h5_from_10x
3) Handle unstructured metadata (xlsx, txt, or json?)
'''

In [1]:


#reference: http://python-3-patterns-idioms-test.readthedocs.io/en/latest/Factory.html#preventing-direct-creation

class FileType(object):
    filetypes = []
    
def upload_dataset(filetype):
    #This factory nests the dataset filetype classes. Preventing them from being directly called
    # dataset = upload_dataset('excel')
    #   creates an Excel class object which now can be processed for uploading
    class Excel(FileType):
        def _read_file(self):
            # 3 options: 
            #   1. request scanpy writers to add multiple sheets to scanpy.read_excel
            #   2. use pandas.read_excel to import all sheets (3 separate DataFrames I think)
            #   3. suboptimal - use python module to parse out info (xlrd?)

            # NOTE: Depending on the end format of _read_file(), will determine a lot 
            #       of how this work and reusable by other FileType objects (Mex, H5ad, H510x) 
            validate_dataset(self)
            # RETURNS AnnData object or pandas dataframe
            pass

        def _add_calculated_values(self):
            #calculate statistical values
            calc_average()
            calc_pval()
            calc_std_dev()
            calc_std_err() #TODO?
            calc_fdr() #TODO?
            pass

        def _add_color_values(self):
            #calculate raw and absolute coloring
            get_color_gene()
            get_color_tissue()
            get_color_dataset()
            get_color_abs_gene()
            get_color_abs_tissue()
            get_color_abs_dataset()

        pass

    class Mex(FileType):
        pass
    class H5ad(FileType):
        pass
    class H510x(FileType):
        pass
    if filetype == "excel": return Excel()
    if filetype == "mex": return Mex()
    if filetype == "h5ad": return H5ad()
    if filetype == "hdf5": return H510x()
    assert 0, "Do not recognize file type given: " + filetype



# ----- helper functions ----- #
# How these are performed depends on whether UploadFileTypeObject is AnnData vs pandas DataFrames

def validate_dataset(UploadFileTypeObject):
    # Runs validation checks:
    #   1. 'expression' and 'observation' sheets present? Is optional sheet 'genes' present?
    #   2. # rows in 'observation' == # columns in 'expression'
    #   3. names in 'observation' == names in 'expression'
    #   4. # rows in 'genes' == # rows in 'expression' (If 'genes' present)
    #   5. name in 'genes' == names in 'expression' (If 'genes' present)
    #   6. 'expression' sheet lacks string values
    # Raise Error if any above tests fail
    pass
        
def validate_genes(UploadFileTypeObject):
    # After the data file is loaded as a object:
    #   1. Connect to gEAR MySQL and create a cache of genes (denoting primary and secondary symbols)
    #   2. Use cached genes to generate a list of genes that:
    #       A. Are not in the database
    #       B. Are duplicated secondary gene symbols. The primary is already present.
    #   3. Return list to user of what genes were skipped 
        
    # optional: This function could also remove those not found/skipped genes? 
    pass

def calc_average():
    pass

def calc_pval():
    pass

def calc_std_dev():
    pass

def calc_std_err():
    #TODO: Are we doing this?
    # was requested by Seth a ways back
    pass
def calc_fdr():
    #TODO: Are we doing this?
    # corrected pvalue
    pass

def get_color_gene():
    pass
def get_color_tissue():
    pass
def get_color_dataset():
    pass

def get_color_abs_gene():
    pass
def get_color_abs_tissue():
    pass    
def get_color_abs_dataset():
    pass

# Create an excel object
x = upload_dataset('excel')
x

<__main__.upload_dataset.<locals>.Excel at 0x7f7410eff518>

In [2]:
# import scanpy.api as sc
# file = '/home/dolley/gear/tests/base_template.xlsx'
# # sheet = 'expression'
# expression = sc.read_excel(file, 'expression')
# observations = sc.read_excel(file, 'observations')
# genes = sc.read_excel(file, 'genes')




In [3]:
# expression.T

In [4]:
# expression.var

In [22]:
import numpy as np
import pandas as pd
import scanpy.api as sc
filepath = '/home/dolley/gear/tests/base_template.xlsx'
try:
    df = pd.read_excel(filepath, sheet_name='expression', index_col=0).transpose()
except :
    raise Exception("No expression sheet found. Expected spreadsheet sheet named 'expression'.")

try:
    X = df.values[:, 0:].astype(float)
except ValueError:
    raise Exception("Encountered unexpected value type. Expected float type in expression matrix.")

# Get counts of genes and observations from expression
number_genes_from_exp, number_obs_from_exp = X.shape

In [24]:
# Get the observations
# filepath = '/home/dolley/gear/tests/2_absent_obs_sheet.xlsx'
try:
    obs_df = pd.read_excel(filepath, sheet_name='observations', index_col=0)
except ValueError:
    raise Exception("No observations sheet found. Expected spreadsheet sheet named 'observations'.")

# Verify number observations equal those found in expression sheet
number_obs, number_cond = obs_df.shape
print(type(number_obs), number_obs)
print(type(number_obs_from_exp), number_obs_from_exp)
# if number_obs != number_obs_from_exp:
#     raise Exception("Observation count discrepancy. Row count({0}) in 'observations' sheet must match column count of 'expression' sheet({1}).".format(number_obs, number_obs_from_exp))

<class 'int'> 18
<class 'int'> 100


In [64]:
print('df index:\n', df.index)
print('obs_df index:\n', obs_df.index)
if not obs_df.index.equals(df.index):
    print('Not a match :(')
else:
    print('Yay! It match!')

df index:
 Index(['obs-ctrl-1-0h', 'obs-ctrl-2-0h', 'obs-ctrl-3-0h', 'obs-trt-1-0h',
       'obs-trt-2-0h', 'obs-trt-3-0h', 'obs-ctrl-1-24h', 'obs-ctrl-2-24h',
       'obs-ctrl-3-24h', 'obs-trt-1-24h', 'obs-trt-2-24h', 'obs-trt-3-24h',
       'obs-ctrl-1-48h', 'obs-ctrl-2-48h', 'obs-ctrl-3-48h', 'obs-trt-1-48h',
       'obs-trt-2-48h', 'obs-trt-3-48h'],
      dtype='object')
obs_df index:
 Index(['obs-trt-3-48h', 'obs-ctrl-2-0h', 'obs-ctrl-3-0h', 'obs-trt-1-0h',
       'obs-trt-2-0h', 'obs-trt-3-0h', 'obs-ctrl-1-24h', 'obs-ctrl-2-24h',
       'obs-ctrl-3-24h', 'obs-trt-1-24h', 'obs-trt-2-24h', 'obs-trt-3-24h',
       'obs-ctrl-1-48h', 'obs-ctrl-2-48h', 'obs-ctrl-3-48h', 'obs-trt-1-48h',
       'obs-trt-2-48h', 'obs-ctrl-1-0h'],
      dtype='object')
Not a match :(


In [7]:
# Get the genes (if present)
try:
    genes_df = pd.read_excel(filepath, sheet_name='genes', index_col=0)
except :
    raise Exception("No genes sheet found. Expected genes sheet named 'genes'.")

number_genes, number_conditions = genes_df.shape
# print(genes_df.head())
# print(genes_df.shape)
print(number_genes)

100


In [71]:
print('df columns:\n', df.columns.values)
print('genes_df index:\n', genes_df.index)
if not genes_df.index.equals(df.columns):
    print('Not a match :(')
else:
    print('Yay! It match!')

df index:
 ['ENSMUSG00000000567' 'ENSMUSG00000000568' 'ENSMUSG00000000569'
 'ENSMUSG00000000570' 'ENSMUSG00000000571' 'ENSMUSG00000000572'
 'ENSMUSG00000000573' 'ENSMUSG00000000574' 'ENSMUSG00000000575'
 'ENSMUSG00000000576' 'ENSMUSG00000000577' 'ENSMUSG00000000578'
 'ENSMUSG00000000579' 'ENSMUSG00000000580' 'ENSMUSG00000000581'
 'ENSMUSG00000000582' 'ENSMUSG00000000583' 'ENSMUSG00000000584'
 'ENSMUSG00000000585' 'ENSMUSG00000000586' 'ENSMUSG00000000587'
 'ENSMUSG00000000588' 'ENSMUSG00000000589' 'ENSMUSG00000000590'
 'ENSMUSG00000000591' 'ENSMUSG00000000592' 'ENSMUSG00000000593'
 'ENSMUSG00000000594' 'ENSMUSG00000000595' 'ENSMUSG00000000596'
 'ENSMUSG00000000597' 'ENSMUSG00000000598' 'ENSMUSG00000000599'
 'ENSMUSG00000000600' 'ENSMUSG00000000601' 'ENSMUSG00000000602'
 'ENSMUSG00000000603' 'ENSMUSG00000000604' 'ENSMUSG00000000605'
 'ENSMUSG00000000606' 'ENSMUSG00000000607' 'ENSMUSG00000000608'
 'ENSMUSG00000000609' 'ENSMUSG00000000610' 'ENSMUSG00000000611'
 'ENSMUSG00000000612' 'ENSMUS

In [8]:
import anndata

adata = anndata.AnnData(X=X, obs=obs_df, var=genes_df)
# adata = anndata.AnnData(X=X, obs=genes_df, var=obs_df)

# adata = anndata.AnnData(X=X.T, obs=obs_df, var=genes_df)
# adata = anndata.AnnData(X=X.T, obs=genes_df, var=obs_df)

adata

AnnData object with n_obs × n_vars = 18 × 100 
    obs_keys = ['cell_type', 'condition', 'replicate', 'time_point', 'time_unit']
    var_keys = ['gene_symbol']

In [9]:
# Is this formatted like in gEAR?
adata.var.head()

Unnamed: 0,gene_symbol
ENSMUSG00000000567,Gnai3
ENSMUSG00000000568,Pbsn
ENSMUSG00000000569,Cdc45l
ENSMUSG00000000570,H19
ENSMUSG00000000571,Scml2


In [10]:
### CHECK a gEAR h5ad to validate format
dataset = '/home/dolley/gear/www/datasets/17429ab6-d5f5-4f0a-8a1c-ce7d523a96f7.h5ad'

gear_adata = sc.read_h5ad(dataset)
gear_adata.var.head()


Xkr4
Gm1992
Gm37381
Rp1
Rp1


In [11]:
print("Expression:\t" + str(adata.X[1,0]) + "\t.var:  " + str(adata.var.iloc[0,0]) + "\t.obs:\n" + str(adata.obs.iloc[1]))

Expression:	92.0	.var:  Gnai3	.obs:
cell_type     utricle
condition     control
replicate           2
time_point          0
time_unit        hour
Name: obs-ctrl-2-0h, dtype: object


In [12]:
adata.var.head()

Unnamed: 0,gene_symbol
ENSMUSG00000000567,Gnai3
ENSMUSG00000000568,Pbsn
ENSMUSG00000000569,Cdc45l
ENSMUSG00000000570,H19
ENSMUSG00000000571,Scml2


# Test ExcelUploader
* Excel class is renamed to ExcelUploader()
* upload_dataset.py is renamed to datasetuploader.py
    * upload_dataset method is now nested under new class DatasetUploader

In [13]:
# import os, sys
# sys.path.append('/home/dolley/gear/lib')
# import gear.datasetuploader as datasetuploader
# # dataset = datasetuploader.DatasetUploader.upload_dataset('excel')
# filetype = datasetuploader.FileType()
# filetype

In [14]:
# dataset = datasetuploader.DatasetUploader.upload_dataset('excel')

In [15]:
import os, sys
sys.path.append('/home/dolley/gear/lib')
import gear.datasetuploader as datasetuploader

# test_file = '/home/dolley/gear/tests/base_template.xlsx'
test_file = '/home/dolley/gear/tests/3_obs_count_mismatch_exp.xlsx'
dataset = datasetuploader.DatasetUploader.upload_dataset('excel')
adata = dataset._read_file(test_file)
adata

Exception: Observation count discrepancy. Row count (16) in 'observations' sheet must match column count of 'expression' sheet(18).

In [None]:
adata.var.head()