### Initialization

In [1]:
import numpy as np
import pandas as pd

<div style="color:red">
Цитата из статьи:  
Here, the scale factor is calculated from the 75th percentile of the counts for each library after removing transcripts, which are zero in **all** libraries.
</div>

In [2]:
def QuartileNormalization(matrix, q):
    """
    Quantile normalization
    
    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
    q : float in range of [0,100]
        Percentile to compute, which must be between 0 and 100 inclusive.
        
    Returns
    -------
    array_like
        Normalized matrix.
    """
    return matrix / np.percentile(matrix[~np.any(matrix == 0, axis=1)], q, axis=0)

### Numeric test

In [3]:
matrix = pd.read_table('../test_data/dataset.txt', sep=' ', header=None)

Before:

In [4]:
matrix

Unnamed: 0,0,1,2
0,5,10,1
1,10,20,2
2,1,2,5
3,353,0,2
4,7,7,7
5,9,9,9
6,8,9,8
7,1,4,1


After:

In [5]:
normal_matrix = QuartileNormalization(matrix, 75)
normal_matrix

Unnamed: 0,0,1,2
0,0.588235,1.052632,0.133333
1,1.176471,2.105263,0.266667
2,0.117647,0.210526,0.666667
3,41.529412,0.0,0.266667
4,0.823529,0.736842,0.933333
5,1.058824,0.947368,1.2
6,0.941176,0.947368,1.066667
7,0.117647,0.421053,0.133333


### Perfomance test

In [6]:
matrix = np.random.randint(0, 1000, size=(3000, 5000))

print("QuartileNormalization - ", end="")
%time normal_matrix = QuartileNormalization(matrix, 75)

QuartileNormalization - Wall time: 137 ms


Read matrix from .csv file and normalize it:

In [7]:
data = pd.read_csv("../data/srr1784310_subset.csv", index_col=0)
data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


In [8]:
normal_data = QuartileNormalization(data, 75)
normal_data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,0.355072,0.342466,0.390244,0.383721,0.447619,0.466667,0.430108,0.201923,2.758065,0.333333,...,0.333333,0.071429,0.25,1.0,0.666667,0.357143,0.153846,0.230769,0.307692,1.090909
uc009vew.1,0.362319,0.308219,0.463415,0.325581,0.685714,0.655556,0.473118,0.384615,2.5,0.690476,...,0.833333,0.142857,0.333333,0.6,0.083333,0.285714,0.307692,0.461538,0.384615,0.909091
Lars2,0.652174,0.753425,0.723577,1.313953,0.733333,0.9,0.612903,0.875,1.0,0.845238,...,0.916667,1.0,1.25,2.0,0.833333,1.214286,1.384615,1.538462,0.384615,2.181818
Hsp90ab1,1.036232,0.705479,0.845528,1.302326,1.057143,1.0,1.010753,1.201923,0.887097,1.0,...,1.0,0.928571,0.916667,1.2,1.25,0.214286,1.076923,0.692308,0.769231,1.0
Ptma,0.485507,0.931507,0.691057,0.767442,0.914286,0.811111,0.795699,0.769231,0.548387,0.761905,...,0.916667,0.714286,0.416667,0.6,0.833333,0.5,1.076923,0.538462,1.230769,0.727273


<div style="color:red">
Эта строчка является валидацией метода. Её потом надо будет вынести в unit-тесты. Для данного метода проверка суммы ничего не даёт - нам нужно проверять квартили. Кстати, лучше делать это через `assert`.
</div>

In [9]:
np.isclose(normal_data.sum(axis=0), 1).all()

False