### Initialization

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import rankdata

In [2]:
def percentile(matrix, p):
    """
    Estimation of percentile without zeros
    
    Parameters
    ----------
    matrix : array_like
        Matrix to calculate percentile.
    p : float in range of [0,100]
        Percentile to compute, must be between 0 and 100 inclusive.
        
    Returns
    -------
    float
        Сalculated percentile.
    """
    return np.percentile(matrix[np.any(matrix > 0, axis=1)], p, axis=0)

In [3]:
def tmm_normalization(matr, index_ref=None, factor_m=0.3, factor_a=0.05):
    """
    Trimmed mean of M-values normalization
    
    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
        
    Returns
    -------
    array_like
        Normalized matrix.
    """
    matrix = pd.DataFrame(matr)
    np.seterr(divide='ignore')      # for divide on zeros in log2
    
    def log2_tmm(index_vec):
        check_inf = (~np.isinf(matr_a[index_vec])) & (~np.isinf(matr_m[index_vec]))
        curr_vec = matrix[index_vec][check_inf]
        bool_curr_vec = curr_vec > 0
        ref_vec = matrix[index_ref][check_inf]
        bool_ref = ref_vec > 0
        ranks = rankdata(matr_a[index_vec][check_inf], method='ordinal')
        bool_a = (ranks > len(ranks) * factor_a) & (ranks < len(ranks) * (1 - factor_a))
        ranks = rankdata(matr_m[index_vec][check_inf], method='ordinal')
        bool_m = (ranks > len(ranks) * factor_m) & (ranks < len(ranks) * (1 - factor_m))
        bool_result = bool_curr_vec & bool_ref & bool_a & bool_m
        total_curr_vec = np.sum(curr_vec[bool_result])
        total_ref_vec = np.sum(ref_vec[bool_result])
        w_vec = ((total_curr_vec - curr_vec[bool_result]) / (total_curr_vec * curr_vec[bool_result]) + 
                 (total_ref_vec - ref_vec[bool_result]) / (total_ref_vec * ref_vec[bool_result]))
        m_vec = np.log2(curr_vec[bool_result] / total_curr_vec) / np.log2(ref_vec[bool_result] / total_ref_vec)
        low_sum = np.sum(w_vec)
        if low_sum != 0:
            res = np.sum(w_vec * m_vec) / low_sum
        else:
            res = 0;
        return res
        
    f75 = percentile(matrix, 0.75)
    if index_ref is None:
        number_index_ref = np.argmin(abs(f75 - np.mean(f75)))
        index_ref = matrix.columns.values[number_index_ref]
    tmp = matrix / np.sum(matrix, axis=0)
    matr_a = np.log2(tmp.mul(tmp[index_ref], axis=0)) / 2
    matr_m = np.log2(tmp.div(tmp[index_ref], axis=0))
    tmm_factor = 2 ** np.array([log2_tmm(i) for i in matrix])
    result = matrix / tmm_factor
    return matr / tmm_factor

### Numeric test

In [11]:
numeric_matrix = pd.read_table('../test_data/dataset.txt', sep=' ', header=None)

Before:

In [12]:
numeric_matrix

Unnamed: 0,0,1,2
0,5,10,1
1,10,20,2
2,1,2,5
3,353,0,2
4,7,7,7
5,8,9,8


After:

In [14]:
normal_numeric_matrix = tmm_normalization(numeric_matrix, factor_m = 0.1)
normal_numeric_matrix

Unnamed: 0,0,1,2
0,2.117609,4.7188,0.5
1,4.235217,9.4376,1.0
2,0.423522,0.94376,2.5
3,149.503172,0.0,1.0
4,2.964652,3.30316,3.5
5,3.388174,4.24692,4.0


### Perfomance test

In [7]:
matrix = np.random.randint(0, 1000, size=(3000, 5000))

print("TrimmedMeanNormalization - ", end="")
%time normal_matrix = tmm_normalization(matrix)

TrimmedMeanNormalization - Wall time: 1min 4s


Read matrix from .csv file and normalize it:

In [8]:
data = pd.read_csv("../data/srr1784310_subset.csv", index_col=0)
data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


In [9]:
normal_data = tmm_normalization(data)
normal_data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,24.490442,24.981927,24.030404,16.5,23.47919,20.94936,19.975528,10.475724,85.37774,13.977872,...,2.003115,0.501779,1.499282,2.495467,4.01139,2.50401,0.999506,1.502517,2.001383,6.013012
uc009vew.1,24.990247,22.483735,28.536104,14.0,35.968121,29.428863,21.97308,19.95376,77.38918,28.954163,...,5.007787,1.003558,1.999042,1.49728,0.501424,2.003208,1.999012,3.005034,2.501729,5.010844
Lars2,44.982445,54.96024,44.556373,56.5,38.465907,40.402338,28.465127,45.394803,30.955672,35.443889,...,5.508566,7.024904,7.496408,4.990934,5.014238,8.513632,8.995553,10.01678,2.501729,12.026024
Hsp90ab1,71.472107,51.46277,52.065874,56.0,55.450853,44.891487,46.94249,62.355499,27.460677,41.933615,...,6.009344,6.523125,5.497366,2.99456,7.521356,1.502406,6.996541,4.507551,5.003458,5.511928
Ptma,33.486931,67.950842,42.55384,33.0,47.957494,36.411984,36.954726,39.907519,16.975691,31.949421,...,5.508566,5.017788,2.498803,1.49728,5.014238,3.505613,6.996541,3.505873,8.005533,4.008675
