### Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
def PercentileNormalization(matrix, p):
    """
    Percentile normalization

    Divide every column of ``matrix`` by that column's ``p``-th percentile.
    Rows that are zero in every column are left out while the percentiles
    are computed, but they are still present in the returned matrix.

    Parameters
    ----------
    matrix : array_like
        Two-dimensional matrix to normalize. Objects that already expose a
        ``shape`` (NumPy arrays, pandas DataFrames) are used as they are;
        anything else (e.g. nested lists) is converted with ``np.asarray``.
    p : float in range of [0,100]
        Percentile to compute, which must be between 0 and 100 inclusive.

    Returns
    -------
    array_like
        Normalized matrix. Array and DataFrame inputs keep their container
        type; converted inputs come back as a NumPy array.

    Notes
    -----
    A column whose ``p``-th percentile is 0 is divided by zero, so its
    entries come out as ``inf``/``nan`` rather than raising.
    """
    # A plain list compared with 0 yields the scalar False instead of an
    # element-wise mask, so sequence inputs must become arrays first.
    if not hasattr(matrix, "shape"):
        matrix = np.asarray(matrix)

    # Keep only rows with at least one non-zero entry for the percentile.
    has_signal = ~np.all(matrix == 0, axis=1)
    column_scale = np.percentile(matrix[has_signal], p, axis=0)

    # column_scale has one value per column and broadcasts across rows.
    return matrix / column_scale

In [3]:
def QuartileNormalization(matrix, q):
    """
    Quartile normalization

    Translate a quartile name into its percentile and delegate to
    ``PercentileNormalization``.

    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
    q : string from set = {"upper", "lower", "median"}
        Name of the quartile to normalize by:
        "lower" = 25th percentile,
        "median" = 50th percentile,
        "upper" = 75th percentile.

    Returns
    -------
    array_like
        Normalized matrix.

    Raises
    ------
    KeyError
        If ``q`` is not one of the three supported names.
    """
    percentile_by_name = {"lower": 25, "median": 50, "upper": 75}
    percentile = percentile_by_name[q]
    return PercentileNormalization(matrix, percentile)

### Numeric test

In [4]:
# Space-delimited fixture without a header row, so columns get integer labels.
matrix = pd.read_csv(
    '../test_data/dataset.txt',
    sep=' ',
    header=None,
)

Before:

In [5]:
# Display the raw fixture so it can be compared with the normalized result.
matrix

Unnamed: 0,0,1,2
0,5,10,1
1,10,20,2
2,1,2,5
3,353,0,2
4,7,7,7
5,8,9,8


After:

In [6]:
normal_matrix = QuartileNormalization(matrix, 'lower')

# Pin the result instead of relying on eyeballing the output: each fixture
# column divided by its lower quartile (5.5, 3.25 and 2.0), written to the
# six decimals pandas displays.
expected_lower = np.array([
    [0.909091, 3.076923, 0.5],
    [1.818182, 6.153846, 1.0],
    [0.181818, 0.615385, 2.5],
    [64.181818, 0.0, 1.0],
    [1.272727, 2.153846, 3.5],
    [1.454545, 2.769231, 4.0],
])
assert np.allclose(np.asarray(normal_matrix), expected_lower, atol=1e-6), \
    "lower-quartile normalization of the fixture changed"
normal_matrix

Unnamed: 0,0,1,2
0,0.909091,3.076923,0.5
1,1.818182,6.153846,1.0
2,0.181818,0.615385,2.5
3,64.181818,0.0,1.0
4,1.272727,2.153846,3.5
5,1.454545,2.769231,4.0


### Performance test

In [11]:
matrix = np.random.randint(0, 1000, size=(3000, 5000))

print("QuartileNormalization - ", end="")
%time normal_matrix = QuartileNormalization(matrix, 'upper')

QuartileNormalization - Wall time: 671 ms


Read matrix from .csv file and normalize it:

In [8]:
# The first CSV column holds the row labels, so it is used as the index.
# NOTE(review): judging by the preview, rows look like genes/transcripts and
# columns like cell barcodes — confirm against the data source.
data = pd.read_csv("../data/srr1784310_subset.csv", index_col=0)
data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


In [9]:
# Upper-quartile normalization: every column is divided by its own 75th
# percentile, computed over the rows that are not zero in every column.
normal_data = QuartileNormalization(data, 'upper')
normal_data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,3.5,3.333333,3.428571,2.693878,3.615385,3.818182,3.636364,1.866667,14.25,3.111111,...,2.0,0.5,1.5,2.5,4.0,2.5,1.0,1.5,2.0,6.0
uc009vew.1,3.571429,3.0,4.071429,2.285714,5.538462,5.363636,4.0,3.555556,12.916667,6.444444,...,5.0,1.0,2.0,1.5,0.5,2.0,2.0,3.0,2.5,5.0
Lars2,6.428571,7.333333,6.357143,9.22449,5.923077,7.363636,5.181818,8.088889,5.166667,7.888889,...,5.5,7.0,7.5,5.0,5.0,8.5,9.0,10.0,2.5,12.0
Hsp90ab1,10.214286,6.866667,7.428571,9.142857,8.538462,8.181818,8.545455,11.111111,4.583333,9.333333,...,6.0,6.5,5.5,3.0,7.5,1.5,7.0,4.5,5.0,5.5
Ptma,4.785714,9.066667,6.071429,5.387755,7.384615,6.636364,6.727273,7.111111,2.833333,7.111111,...,5.5,5.0,2.5,1.5,5.0,3.5,7.0,3.5,8.0,4.0
