### Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
def Percentile(matrix, p):
    """
    Column-wise percentile, ignoring rows that hold no positive value

    Rows in which no entry is greater than zero are dropped before the
    percentile is taken. Zeros inside a row that has at least one positive
    entry are kept and take part in the computation.

    Parameters
    ----------
    matrix : array_like
        Two-dimensional data (nested list, ndarray, DataFrame, ...).
    p : float in range of [0,100]
        Percentile to compute, must be between 0 and 100 inclusive.

    Returns
    -------
    numpy.ndarray
        One percentile value per column of `matrix`.
    """
    # Convert up front so that plain nested lists support the elementwise
    # comparison and the boolean row indexing below.
    values = np.asarray(matrix)
    # True for every row that contains at least one strictly positive entry.
    has_positive = np.any(values > 0, axis=1)
    return np.percentile(values[has_positive], p, axis=0)

In [3]:
def PercentileNormalization(matrix, p):
    """
    Scale a matrix by its percentile

    The scale factor is whatever `Percentile(matrix, p)` returns, and the
    input is divided by it.

    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
    p : float in range of [0,100]
        Percentile used as the scale factor, between 0 and 100 inclusive.

    Returns
    -------
    array_like
        `matrix` divided by the computed percentile.
    """
    scale = Percentile(matrix, p)
    normalized = matrix / scale
    return normalized

In [4]:
def QuartileNormalization(matrix, q):
    """
    Normalize a matrix by one of its quartiles

    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
    q : string from {"lower", "median", "upper"} or quartile number (1, 2 or 3)
        Quartile to normalize by. Names and numbers are interchangeable:
        "lower" = 1 (25th percentile),
        "median" = 2 (50th percentile),
        "upper" = 3 (75th percentile).

    Returns
    -------
    array_like
        Normalized matrix, as returned by PercentileNormalization.
    """
    # Every accepted spelling of a quartile, mapped to its percentile.
    percentile_for = {
        "upper": 75,
        "lower": 25,
        "median": 50,
        3: 75,
        1: 25,
        2: 50,
    }
    is_known = q in percentile_for
    assert is_known, 'Unexpected quartile for normalization: "' + str(q) + '"'
    percentile = percentile_for[q]
    return PercentileNormalization(matrix, percentile)

### Numeric test

In [5]:
# Load the small space-separated test matrix. header=None tells pandas the file
# has no header row, so the columns get integer labels.
matrix = pd.read_table('../test_data/dataset.txt', sep=' ', header=None)

Before:

In [6]:
# Show the raw matrix before normalization.
matrix

Unnamed: 0,0,1,2
0,5,10,1
1,10,20,2
2,1,2,5
3,353,0,2
4,7,7,7
5,8,9,8


After:

In [7]:
# Normalize by the lower quartile ('lower' maps to the 25th percentile
# in QuartileNormalization) and display the result.
normal_matrix = QuartileNormalization(matrix, 'lower')
normal_matrix

Unnamed: 0,0,1,2
0,0.909091,3.076923,0.5
1,1.818182,6.153846,1.0
2,0.181818,0.615385,2.5
3,64.181818,0.0,1.0
4,1.272727,2.153846,3.5
5,1.454545,2.769231,4.0


Same normalization requested with the numeric quartile (`1` instead of `'lower'`):

In [8]:
# The integer 1 maps to the same percentile (25) as 'lower' in
# QuartileNormalization, so this selects the same normalization.
normal_matrix = QuartileNormalization(matrix, 1)
normal_matrix

Unnamed: 0,0,1,2
0,0.909091,3.076923,0.5
1,1.818182,6.153846,1.0
2,0.181818,0.615385,2.5
3,64.181818,0.0,1.0
4,1.272727,2.153846,3.5
5,1.454545,2.769231,4.0


### Performance test

In [9]:
matrix = np.random.randint(0, 1000, size=(3000, 5000))

print("QuartileNormalization - ", end="")
%time normal_matrix = QuartileNormalization(matrix, 'upper')

QuartileNormalization - Wall time: 653 ms


Read matrix from .csv file and normalize it:

In [10]:
# Load the data table; index_col=0 makes the first CSV column the row index.
data = pd.read_csv("../data/srr1784310_subset.csv", index_col=0)
# Preview only the first rows instead of rendering the whole frame.
data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


In [11]:
# Quartile to normalize by: 2 maps to the median (50th percentile)
# in QuartileNormalization.
currQuartile = 2
normal_data = QuartileNormalization(data, currQuartile)
normal_data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,6.125,5.882353,6.0,4.714286,6.714286,6.0,6.666667,3.0,28.5,5.6,...,4.0,1.0,3.0,5.0,8.0,5.0,2.0,3.0,4.0,12.0
uc009vew.1,6.25,5.294118,7.125,4.0,10.285714,8.428571,7.333333,5.714286,25.833333,11.6,...,10.0,2.0,4.0,3.0,1.0,4.0,4.0,6.0,5.0,10.0
Lars2,11.25,12.941176,11.125,16.142857,11.0,11.571429,9.5,13.0,10.333333,14.2,...,11.0,14.0,15.0,10.0,10.0,17.0,18.0,20.0,5.0,24.0
Hsp90ab1,17.875,12.117647,13.0,16.0,15.857143,12.857143,15.666667,17.857143,9.166667,16.8,...,12.0,13.0,11.0,6.0,15.0,3.0,14.0,9.0,10.0,11.0
Ptma,8.375,16.0,10.625,9.428571,13.714286,10.428571,12.333333,11.428571,5.666667,12.8,...,11.0,10.0,5.0,3.0,10.0,7.0,14.0,7.0,16.0,8.0


In [14]:
# Same quartile -> percentile mapping that QuartileNormalization defines locally,
# repeated here so the check can call Percentile directly.
d = {"upper": 75, "lower": 25, "median": 50, 3: 75, 1: 25, 2: 50}

# Sanity check: the column sums of the normalized data should equal the raw
# column sums divided by the percentile computed for the same quartile.
np.isclose(normal_data.sum(axis=0), data.sum(axis=0) / Percentile(data, d[currQuartile])).all()

True