In [1]:
import numpy as np
import os, sys

# Root of the gEAR checkout. Set the GEAR_ROOT environment variable to run this
# notebook on another machine; the default keeps the path originally used here.
GEAR_ROOT = os.environ.get('GEAR_ROOT', '/home/dolley/gear')

# The gear package lives under <root>/lib, so that directory has to be on
# sys.path before the import below.
sys.path.append(os.path.join(GEAR_ROOT, 'lib'))
import gear.datasetuploader as datasetuploader

# Load the Excel test template through the uploader and preview the gene table.
test_file = os.path.join(GEAR_ROOT, 'tests', 'base_template.xlsx')
dataset = datasetuploader.DatasetUploader.upload_dataset('excel')
# NOTE(review): _read_file is a private method of the uploader; presumably it
# is what populates dataset.adata -- confirm there is no public equivalent.
dataset._read_file(test_file)
dataset.adata.var.head()

  from ._conv import register_converters as _register_converters


Unnamed: 0_level_0,gene_symbol
genes,Unnamed: 1_level_1
ENSMUSG00000000567,Gnai3
ENSMUSG00000000568,Pbsn
ENSMUSG00000000569,Cdc45l
ENSMUSG00000000570,H19
ENSMUSG00000000571,Scml2


In [2]:
# Short aliases for the loaded AnnData object and the three parts used below.
# X is iterated row-wise per condition replicate and column-wise per gene in
# the coloring functions that follow.
adata = dataset.adata
X, obs, var = adata.X, adata.obs, adata.var

#GET GENE SCOPE COLORING
def get_color(expression, step_size, stat_min):
    """
    Map one expression value to a color index between 0 and 255.

    Input:
        expression = value to color. None and NaN both mean "no value".
        step_size = width of one color step, in expression units.
        stat_min = expression value that maps to color index 0.

    Output:
        int color index in [0, 255]:
            0 for a missing value (None or NaN),
            127 (the midpoint) when step_size is 0, i.e. there is no range,
            otherwise (expression - stat_min) / step_size truncated to an
            int and clamped into [0, 255].
    """
    # NaN has to be caught here: int(nan) raises ValueError further down.
    if expression is None or np.isnan(expression):
        return 0

    if step_size == 0:
        return int(255/2)

    steps = (expression - stat_min) / step_size

    # Clamp before converting so that +/-inf cannot raise OverflowError.
    # Values below the minimum or above the 255th step (e.g. outliers past a
    # trimmed maximum) saturate at the ends of the scale.
    return int(min(max(steps, 0), 255))

# GET ABSOLUTE COLORING.
def get_abs_color(expression, abs_step_size):
    """
    Map one expression value to a color index on an absolute scale from zero.

    Input:
        expression = value to color. None and NaN both mean "no value".
        abs_step_size = width of one color step, in expression units.
            Must be non-zero: it is used as a divisor.

    Output:
        int color index in [0, 254]: 0 for a missing value, otherwise
        expression / abs_step_size truncated to an int and clamped.

    NOTE(review): the upper bound here is 254 while get_color() uses 255;
    kept as-is -- confirm whether the difference is intentional.
    """
    # NaN has to be caught here: int(nan) raises ValueError further down.
    if expression is None or np.isnan(expression):
        return 0

    abs_steps = expression / abs_step_size

    # Clamp before converting so that +/-inf cannot raise OverflowError.
    return int(min(max(abs_steps, 0), 254))

def get_gene_coloring(adata=None, color_mode=None, verbose=True):
    """
    Compute a color index for every value of adata.X, scaled gene by gene.

    Every column of adata.X is handled as one gene. The gene's color range is
    taken from its non-NaN values after dropping the values above the 95%
    cutoff, so a few extreme values do not stretch the scale.

    Input:
        adata = AnnData object (only adata.X is read),
        color_mode = how values are scaled.
            'raw': from the trimmed minimum to the trimmed maximum
                   (see get_color)
            'absolute': from zero to the trimmed maximum (see get_abs_color)
        verbose = print the per-gene diagnostics (default True, as before).

    Output:
        numpy array laid out like X (one row per row of X, one column per
        gene) holding the color index of each value.

    Raises:
        Exception if adata or color_mode is None.
        ValueError if color_mode is neither 'raw' nor 'absolute'.
    """

    if adata is None:
        raise Exception("Error: Argument 'adata' is None. Please include an AnnData object.")
    if color_mode is None:
        raise Exception("Error: Argument 'mode' is None. Please choose 'raw' or 'absolute'.")
    if color_mode not in ('raw', 'absolute'):
        # An unknown mode used to fall through both branches and silently
        # return empty color rows.
        raise ValueError("Error: Unknown color_mode '{0}'. Please choose 'raw' or 'absolute'.".format(color_mode))

    def log(label, value):
        # Per-gene diagnostics, printed only when requested.
        if verbose:
            print(label, value)

    X = adata.X

    #transpose X to iterate gene by gene
    colors_by_gene = list()
    for gene in X.T:
        log('ROW: ', gene)
        log('SUM: ', np.nansum(gene))

        # np.sort places NaN last, so NaN must be removed first; otherwise the
        # 95% trim below would discard NaN instead of the highest values.
        row_sorted = np.sort(gene[~np.isnan(gene)])
        log('SORTED: ', row_sorted)

        if len(row_sorted) == 0:
            # No usable values: an empty range anchored at zero.
            stat_min = 0
            stat_max = 0
        else:
            # Find the 95% cutoff and remove values past it. Always keep at
            # least one value so the statistics below never see an empty array.
            cutoff_idx = max(int(len(row_sorted) * 0.95), 1)
            log('CUTOFF: ', cutoff_idx)
            row_trimmed = row_sorted[:cutoff_idx]
            log('TRIMMED: ', row_trimmed)

            # Get min, max, and mean based on cutoff
            stat_min = np.min(row_trimmed)
            stat_max = np.max(row_trimmed)
            log('MIN: ', stat_min)
            log('MAX: ', stat_max)
            log('MEAN: ', round(np.mean(row_trimmed), 4))

        # The colors are rebuilt for EVERY gene. Previously a gene whose values
        # summed to zero skipped this step and either raised NameError (first
        # gene) or reused the previous gene's colors.
        if color_mode == 'raw':
            # Get the step size of the values based on 255 colors.
            step_size = (stat_max - stat_min) / 255
            log('STEP_SIZE: ', step_size)

            row_colors = [get_color(expression, step_size, stat_min)
                          for expression in gene]
        else:
            abs_step_size = stat_max / 255

            # Avoid dividing by zero when the trimmed maximum is 0.
            if abs_step_size == 0:
                abs_step_size = 0.01

            row_colors = [get_abs_color(expression, abs_step_size)
                          for expression in gene]

        log('GENE COLORS: ', row_colors)
        colors_by_gene.append(row_colors)

    # colors_by_gene holds one row per gene; transpose back to the layout of X.
    Xcolors = np.array(colors_by_gene)
    return Xcolors.T

In [3]:
# Gene-scoped RAW coloring: keep the result both as a notebook variable and as
# an attribute on adata, then display it as the cell output.
# NOTE(review): this is a plain Python attribute on the AnnData object;
# presumably it is not written out when adata is saved -- confirm.
XColGeneRaw = adata.XColGeneRaw = get_gene_coloring(adata, color_mode='raw')
adata.XColGeneRaw

ROW:  [ 72.  92.  51.  93.   1.  46.   0.  33.  46.  75.  56.  28.  90. 100.
   7.  25.  40.  81.]
SUM:  936.0
SORTED:  [  0.   1.   7.  25.  28.  33.  40.  46.  46.  51.  56.  72.  75.  81.
  90.  92.  93. 100.]
CUTOFF:  17
TRIMMED:  [ 0.  1.  7. 25. 28. 33. 40. 46. 46. 51. 56. 72. 75. 81. 90. 92. 93.]
MIN:  0.0
MAX:  93.0
MEAN:  49.1765
STEP_SIZE:  0.36470588235294116
GENE COLORS:  [197, 252, 139, 255, 2, 126, 0, 90, 126, 205, 153, 76, 246, 255, 19, 68, 109, 222]
ROW:  [79. 15. 66. 74. 76. 81. 74. 22. 30.  8. 59. 60. 46. 61. 77.  7. 82. 14.]
SUM:  931.0
SORTED:  [ 7.  8. 14. 15. 22. 30. 46. 59. 60. 61. 66. 74. 74. 76. 77. 79. 81. 82.]
CUTOFF:  17
TRIMMED:  [ 7.  8. 14. 15. 22. 30. 46. 59. 60. 61. 66. 74. 74. 76. 77. 79. 81.]
MIN:  7.0
MAX:  81.0
MEAN:  49.9412
STEP_SIZE:  0.2901960784313726
GENE COLORS:  [248, 27, 203, 230, 237, 254, 230, 51, 79, 3, 179, 182, 134, 186, 241, 0, 255, 24]
ROW:  [40. 15. 94. 75. 40. 49. 81. 73. 45.  9. 14. 45. 67.  3. 40. 60. 49. 43.]
SUM:  842.0
SORTED:

TRIMMED:  [ 5. 14. 24. 26. 28. 32. 32. 46. 50. 51. 53. 54. 61. 66. 77. 85. 91.]
MIN:  5.0
MAX:  91.0
MEAN:  46.7647
STEP_SIZE:  0.33725490196078434
GENE COLORS:  [145, 136, 237, 26, 166, 121, 68, 142, 180, 0, 133, 80, 255, 80, 213, 62, 56, 254]
ROW:  [85. 84. 81. 78. 80.  0. 29. 14.  4. 97.  7. 21. 38. 25. 72. 19. 90. 37.]
SUM:  861.0
SORTED:  [ 0.  4.  7. 14. 19. 21. 25. 29. 37. 38. 72. 78. 80. 81. 84. 85. 90. 97.]
CUTOFF:  17
TRIMMED:  [ 0.  4.  7. 14. 19. 21. 25. 29. 37. 38. 72. 78. 80. 81. 84. 85. 90.]
MIN:  0.0
MAX:  90.0
MEAN:  44.9412
STEP_SIZE:  0.35294117647058826
GENE COLORS:  [240, 237, 229, 220, 226, 0, 82, 39, 11, 255, 19, 59, 107, 70, 204, 53, 254, 104]
ROW:  [ 94.  49.  16.  86.  11.  59.  10.  13.  96.  46.  55.  22.  99.  68.
  77. 100.  49. 100.]
SUM:  1050.0
SORTED:  [ 10.  11.  13.  16.  22.  46.  49.  49.  55.  59.  68.  77.  86.  94.
  96.  99. 100. 100.]
CUTOFF:  17
TRIMMED:  [ 10.  11.  13.  16.  22.  46.  49.  49.  55.  59.  68.  77.  86.  94.
  96.  99. 100.]


array([[197, 248, 120, ..., 255,  14, 232],
       [252,  27,  39, ..., 236, 124,   8],
       [139, 203, 255, ..., 110, 195, 159],
       ...,
       [ 68,   0, 186, ..., 229, 218, 204],
       [109, 255, 150, ..., 239, 181, 221],
       [222,  24, 130, ...,   0,  79,  47]])

In [4]:
# Gene-scoped ABSOLUTE coloring: keep the result both as a notebook variable
# and as an attribute on adata, then display it as the cell output.
# NOTE(review): this is a plain Python attribute on the AnnData object;
# presumably it is not written out when adata is saved -- confirm.
XColGeneAbs = adata.XColGeneAbs = get_gene_coloring(adata, color_mode='absolute')
adata.XColGeneAbs

ROW:  [ 72.  92.  51.  93.   1.  46.   0.  33.  46.  75.  56.  28.  90. 100.
   7.  25.  40.  81.]
SUM:  936.0
SORTED:  [  0.   1.   7.  25.  28.  33.  40.  46.  46.  51.  56.  72.  75.  81.
  90.  92.  93. 100.]
CUTOFF:  17
TRIMMED:  [ 0.  1.  7. 25. 28. 33. 40. 46. 46. 51. 56. 72. 75. 81. 90. 92. 93.]
MIN:  0.0
MAX:  93.0
MEAN:  49.1765
GENE COLORS:  [197, 252, 139, 254, 2, 126, 0, 90, 126, 205, 153, 76, 246, 254, 19, 68, 109, 222]
ROW:  [79. 15. 66. 74. 76. 81. 74. 22. 30.  8. 59. 60. 46. 61. 77.  7. 82. 14.]
SUM:  931.0
SORTED:  [ 7.  8. 14. 15. 22. 30. 46. 59. 60. 61. 66. 74. 74. 76. 77. 79. 81. 82.]
CUTOFF:  17
TRIMMED:  [ 7.  8. 14. 15. 22. 30. 46. 59. 60. 61. 66. 74. 74. 76. 77. 79. 81.]
MIN:  7.0
MAX:  81.0
MEAN:  49.9412
GENE COLORS:  [248, 47, 207, 232, 239, 254, 232, 69, 94, 25, 185, 188, 144, 192, 242, 22, 254, 44]
ROW:  [40. 15. 94. 75. 40. 49. 81. 73. 45.  9. 14. 45. 67.  3. 40. 60. 49. 43.]
SUM:  842.0
SORTED:  [ 3.  9. 14. 15. 40. 40. 40. 43. 45. 45. 49. 49. 60. 67. 73

TRIMMED:  [ 1.  2.  5.  5. 18. 26. 28. 29. 33. 38. 55. 61. 63. 79. 83. 84. 85.]
MIN:  1.0
MAX:  85.0
MEAN:  40.8824
GENE COLORS:  [165, 78, 15, 183, 254, 54, 87, 237, 6, 84, 189, 99, 252, 3, 249, 114, 15, 254]
ROW:  [97. 87. 45. 35. 30.  2. 88. 22. 40. 82. 61. 41. 34. 67. 30. 90. 18. 49.]
SUM:  918.0
SORTED:  [ 2. 18. 22. 30. 30. 34. 35. 40. 41. 45. 49. 61. 67. 82. 87. 88. 90. 97.]
CUTOFF:  17
TRIMMED:  [ 2. 18. 22. 30. 30. 34. 35. 40. 41. 45. 49. 61. 67. 82. 87. 88. 90.]
MIN:  2.0
MAX:  90.0
MEAN:  48.2941
GENE COLORS:  [254, 246, 127, 99, 85, 5, 249, 62, 113, 232, 172, 116, 96, 189, 85, 254, 51, 138]
ROW:  [21. 25. 58. 58. 34. 46. 15. 30. 48. 85. 96. 28. 43. 87. 28. 85. 35. 88.]
SUM:  910.0
SORTED:  [15. 21. 25. 28. 28. 30. 34. 35. 43. 46. 48. 58. 58. 85. 85. 87. 88. 96.]
CUTOFF:  17
TRIMMED:  [15. 21. 25. 28. 28. 30. 34. 35. 43. 46. 48. 58. 58. 85. 85. 87. 88.]
MIN:  15.0
MAX:  88.0
MEAN:  47.8824
GENE COLORS:  [60, 72, 168, 168, 98, 133, 43, 86, 139, 246, 254, 81, 124, 252, 81, 246

array([[197, 248, 125, ..., 254,  29, 232],
       [252,  47,  47, ..., 237, 132,  11],
       [139, 207, 254, ..., 120, 199, 160],
       ...,
       [ 68,  22, 188, ..., 231, 220, 205],
       [109, 254, 154, ..., 240, 185, 221],
       [222,  44, 135, ...,  17,  90,  49]])

In [5]:
# Tissue level coloring
# Re-read the observation table from adata for the tissue-level work.
obs = adata.obs

# Get index positions of each tissue type
# TODO: this step was never written in this cell; nothing follows the note.

In [87]:
#Prints 18 rows. 1 for each condition replicate
# for i, row in enumerate(X):
#     print(row)


def get_tissue_coloring(adata=None, color_mode=None, verbose=True):
    """
    Compute a color index for every value of adata.X, scaled tissue by tissue.

    Rows of X are grouped into tissues (conditions) using the replicate counts
    returned by gear.datasetstats.get_replicate_counts(adata.obs). All values
    of one tissue, across its replicates and all genes, share one color range.
    The range is taken from the tissue's non-NaN values after dropping the
    values above the 95% cutoff.

    NOTE(review): this assumes the replicates of one tissue are consecutive
    rows of X and that the counts come back in the same order as those rows
    -- confirm against get_replicate_counts.

    Input:
        adata = AnnData object (adata.X and adata.obs are read),
        color_mode = calculate raw coloring or absolute coloring.
            Options: 'raw' or 'absolute'
        verbose = print a few sample values per tissue (default True, as
            before).

    Output:
        numpy array similar to X that contains tissue based coloring

    Raises:
        Exception if adata or color_mode is None.
        ValueError if color_mode is neither 'raw' nor 'absolute'.
    """

    if adata is None:
        raise Exception("Error: Argument 'adata' is None. Please include an AnnData object.")
    if color_mode is None:
        raise Exception("Error: Argument 'mode' is None. Please choose 'raw' or 'absolute'.")
    if color_mode not in ('raw', 'absolute'):
        # An unknown mode used to produce empty color rows and then fail later
        # with an unrelated reshape error.
        raise ValueError("Error: Unknown color_mode '{0}'. Please choose 'raw' or 'absolute'.".format(color_mode))

    X = adata.X
    Xcols = X.shape[1]
    obs = adata.obs

    #Get the number of replicates for each condition (tissue == condition)
    # Imported here because the gear package is only importable after the
    # notebook has extended sys.path.
    from gear.datasetstats import get_replicate_counts
    replicate_count = get_replicate_counts(obs)

    # The raw branch used to print its sample values under the label "abs".
    sample_label = "\traw: " if color_mode == 'raw' else "\tabs: "

    colors_by_replicate = list()
    start = 0
    for rep_count in replicate_count:
        #Group replicates by tissue (condition): take this tissue's rows and
        #flatten them so that one set of statistics covers every value.
        end = start + rep_count
        tissue = X[start:end].flatten()
        start = end

        # np.sort places NaN last, so NaN must be removed first; otherwise the
        # 95% trim below would discard NaN instead of the highest values.
        row_sorted = np.sort(tissue[~np.isnan(tissue)])

        if len(row_sorted) == 0:
            # No usable values: an empty range anchored at zero.
            stat_min = 0
            stat_max = 0
        else:
            # Find the 95% cutoff and remove values past it. Always keep at
            # least one value so min/max never see an empty array.
            cutoff_idx = max(int(len(row_sorted) * 0.95), 1)
            row_trimmed = row_sorted[:cutoff_idx]

            # Get min and max based on cutoff
            stat_min = np.min(row_trimmed)
            stat_max = np.max(row_trimmed)

        if color_mode == 'raw':
            # Get the step size of the values based on 255 colors.
            step_size = (stat_max - stat_min) / 255
        else:
            abs_step_size = stat_max / 255

            # Avoid dividing by zero when the trimmed maximum is 0.
            if abs_step_size == 0:
                abs_step_size = 0.01

        # The colors are rebuilt for EVERY tissue. Previously a tissue whose
        # values summed to zero skipped this step and either raised NameError
        # (first tissue) or reused the previous tissue's colors.
        row_colors = list()
        for i, expression in enumerate(tissue):
            if color_mode == 'raw':
                color_idx = get_color(expression, step_size, stat_min)
            else:
                color_idx = get_abs_color(expression, abs_step_size)
            row_colors.append(color_idx)

            # Spot-check a few positions of the flattened tissue.
            if verbose and i in (0, 100, 200):
                print("index: ", i, "\texpression: ", expression, sample_label, color_idx)

        # Undo the flatten: one row per replicate, one column per gene.
        # Each tissue is converted on its own, so tissues with different
        # replicate counts never end up in one ragged array.
        tissue_colors = np.array(row_colors, dtype=int).reshape((rep_count, Xcols))
        colors_by_replicate.append(tissue_colors)

    #stack the tissues on top of each other to produce the exact layout of X
    XColorsTissue = np.vstack(colors_by_replicate)

    return XColorsTissue

In [84]:
# Try out tissue ABSOLUTE coloring...
XColTissAbs = get_tissue_coloring(adata, color_mode='absolute')
adata.XColTissAbs = XColTissAbs

# Index 0 is the FIRST gene column (the labels used to say "Last"): show its
# expression values next to the colors computed for them.
print("First column (gene) of X: ", adata.X[:,0])
print("First column (gene) of XColTissAbs: ", adata.XColTissAbs[:, 0])

# The color matrix has to line up with X value for value.
print("X.shape: ", X.shape)
print("TissueAbs: ", adata.XColTissAbs.shape)
assert adata.XColTissAbs.shape == adata.X.shape, "tissue coloring must match the shape of X"

index:  0 	expression:  72.0 	abs:  191
index:  100 	expression:  92.0 	abs:  244
index:  200 	expression:  51.0 	abs:  135
index:  0 	expression:  93.0 	abs:  252
index:  100 	expression:  1.0 	abs:  2
index:  200 	expression:  46.0 	abs:  124
index:  0 	expression:  0.0 	abs:  0
index:  100 	expression:  33.0 	abs:  87
index:  200 	expression:  46.0 	abs:  122
index:  0 	expression:  75.0 	abs:  199
index:  100 	expression:  56.0 	abs:  148
index:  200 	expression:  28.0 	abs:  74
index:  0 	expression:  90.0 	abs:  239
index:  100 	expression:  100.0 	abs:  254
index:  200 	expression:  7.0 	abs:  18
index:  0 	expression:  25.0 	abs:  70
index:  100 	expression:  40.0 	abs:  113
index:  200 	expression:  81.0 	abs:  229
Last column (gene) of X:  [ 72.  92.  51.  93.   1.  46.   0.  33.  46.  75.  56.  28.  90. 100.
   7.  25.  40.  81.]
Last column (gene) of XColTissAbs:  [191 244 135 252   2 124   0  87 122 199 148  74 239 254  18  70 113 229]
X.shape:  (18, 100)
TissueAbs:  (18, 

In [88]:
# Try out tissue RAW coloring...
XColTissRaw = get_tissue_coloring(adata, color_mode='raw')
adata.XColTissRaw = XColTissRaw

# Index 0 is the FIRST gene column (the labels used to say "Last"): show its
# expression values next to the colors computed for them.
print("First column (gene) of X: ", adata.X[:,0])
print("First column (gene) of XColTissRaw: ", adata.XColTissRaw[:, 0])

# The color matrix has to line up with X value for value.
print("X.shape: ", X.shape)
print("TissueRaw: ", adata.XColTissRaw.shape)
assert adata.XColTissRaw.shape == adata.X.shape, "tissue coloring must match the shape of X"

index:  0 	expression:  72.0 	abs:  191
index:  100 	expression:  92.0 	abs:  244
index:  200 	expression:  51.0 	abs:  135
index:  0 	expression:  93.0 	abs:  252
index:  100 	expression:  1.0 	abs:  2
index:  200 	expression:  46.0 	abs:  124
index:  0 	expression:  0.0 	abs:  0
index:  100 	expression:  33.0 	abs:  87
index:  200 	expression:  46.0 	abs:  122
index:  0 	expression:  75.0 	abs:  199
index:  100 	expression:  56.0 	abs:  148
index:  200 	expression:  28.0 	abs:  74
index:  0 	expression:  90.0 	abs:  239
index:  100 	expression:  100.0 	abs:  255
index:  200 	expression:  7.0 	abs:  18
index:  0 	expression:  25.0 	abs:  70
index:  100 	expression:  40.0 	abs:  113
index:  200 	expression:  81.0 	abs:  229
Last column (gene) of X:  [ 72.  92.  51.  93.   1.  46.   0.  33.  46.  75.  56.  28.  90. 100.
   7.  25.  40.  81.]
Last column (gene) of XColTissRaw:  [191 244 135 252   2 124   0  87 122 199 148  74 239 255  18  70 113 229]
X.shape:  (18, 100)
TissueRaw:  (18, 