### Initialization

In [68]:
import numpy as np
import pandas as pd

In [3]:
def Percentile(matrix, p):
    """
    Percentile  calculating
    
    Parameters
    ----------
    matrix : array_like
        Matrix for calculate percentile.
    p : float in range of [0,100]
        Percentile to compute, which must be between 0 and 100 inclusive.
        
    Returns
    -------
    float
        Сalculated percentile.
    """
    return np.percentile(matrix[np.any(matrix > 0, axis=1)], p, axis=0)

In [79]:
def TMMNormalization(matr):
    """
    Trimmed mean of M-values normalization
    
    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
        
    Returns
    -------
    array_like
        Normalized matrix.
    """
    
    def innerW(k, r):
        """
        Calculeting a auxiliary factor of normalization
        
        Parameters
        ----------
        k : int
            index current column
        r : int
            index reference column
        
        Returns
        -------
        array_like
            Auxiliary factor of normalization
        """
        f = lambda x: (N[x] - matrix[x]) / (N[x] * matrix[x])
        return f(k) + f(r)
    
    def innerM(k, r):
        """
        Calculeting a auxiliary factor of normalization
        
        Parameters
        ----------
        k : int
            index current column
        r : int
            index reference column
        
        Returns
        -------
        array_like
            Auxiliary factor of normalization
        """
        f = lambda x: np.log2(matrix[x] / N[x])
        return f(k) / f(r)
    
    def innerLog2_TMM(col):
        """
        Calculeting a Log2(TMM_factor)
        
        Parameters
        ----------
        col : int
            index current column (cell)
        
        Returns
        -------
        array_like
            Log2(TMM_factor)
        """
        w = innerW(col, refColumn)
        m = innerM(col, refColumn)
        return np.sum(w * m, axis=0) / np.sum(w, axis=0)
    
    matrix = matr[np.all(matr > 0, axis=1)]
    np.seterr(divide='ignore')
    f75 = Percentile(matr, 0.75)
    refColumn = np.argmin(abs(f75 - np.mean(f75)))
    N = np.sum(matrix, axis=0)
    tmp = matrix / N
    matrA = np.log2(tmp.mul(tmp[refColumn], axis=0)) / 2
    matrM = np.log2(tmp.div(tmp[refColumn], axis=0))
    TMM_factor = 2 ** np.array([innerLog2_TMM(i) for i in matrix])
    return matrix / TMM_factor

In [80]:
def matrixForm_TMMNormalization(matr):
    """
    Trimmed mean of M-values normalization
    
    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
        
    Returns
    -------
    array_like
        Normalized matrix.
    """
    
    np.seterr(divide='ignore')
    matrix = matr[np.all(matr > 0, axis=1)]
    f75 = Percentile(matr, 0.75)
    refColumn = np.argmin(abs(f75 - np.mean(f75)))
    N = np.sum(matrix, axis=0)
    tmp = matrix / N
    matrA = np.log2(tmp.mul(tmp[refColumn], axis=0)) / 2
    matrM = np.log2(tmp.div(tmp[refColumn], axis=0))
    w = (N - matrix) / (N * matrix)
    W = (w.add(w[refColumn], axis=0))
    m = np.log2(matrix / N)
    M = (m.div(m[refColumn], axis=0))
    log2_TMM = np.sum(W * M, axis=0) / np.sum(W, axis=0)
    return matrix / 2 ** log2_TMM

### Numeric test

In [5]:
matrix = pd.read_table('../test_data/dataset.txt', sep=' ', header=None)

Before:

In [77]:
matrix.describe()

Unnamed: 0,0,1,2
count,6.0,6.0,6.0
mean,64.0,8.0,4.166667
std,141.613559,7.071068,2.926887
min,1.0,0.0,1.0
25%,5.5,3.25,2.0
50%,7.5,8.0,3.5
75%,9.5,9.75,6.5
max,353.0,20.0,8.0


After:

In [81]:
normal_matrix = matrixForm_TMMNormalization(matrix)
normal_matrix

Unnamed: 0,0,1,2
0,2.117609,4.870626,0.5
1,4.235217,9.741251,1.0
2,0.423522,0.974125,2.5
4,2.964652,3.409438,3.5
5,3.388174,4.383563,4.0


In [82]:
normal_matrix = TMMNormalization(matrix)
normal_matrix

Unnamed: 0,0,1,2
0,2.117609,4.870626,0.5
1,4.235217,9.741251,1.0
2,0.423522,0.974125,2.5
4,2.964652,3.409438,3.5
5,3.388174,4.383563,4.0


### Perfomance test

In [7]:
matrix = np.random.randint(0, 1000, size=(3000, 5000))

print("TrimmedMeanNormalization - ", end="")
%time normal_matrix = TMMNormalization(matrix, _, _)

TrimmedMeanNormalization - Wall time: 0 ns


Read matrix from .csv file and normalize it:

In [8]:
data = pd.read_csv("../data/srr1784310_subset.csv", index_col=0)
data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


In [9]:
normal_data = TMMNormalization(matrix, _, _)
normal_data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


In [10]:
np.isclose(normal_data.sum(axis=0), _).all()

False