### Initialization

In [117]:
import numpy as np
import pandas as pd
from scipy.stats import rankdata

In [112]:
def percentile(matrix, p):
    """
    Estimation of percentile without zeros
    
    Parameters
    ----------
    matrix : array_like
        Matrix to calculate percentile.
    p : float in range of [0,100]
        Percentile to compute, must be between 0 and 100 inclusive.
        
    Returns
    -------
    float
        Сalculated percentile.
    """
    return np.percentile(matrix[np.any(matrix > 0, axis=1)], p, axis=0)

In [152]:
def tmm_normalization(matrix, refColumn=None, factor_m=0.3, factor_a=0.05):
    """
    Trimmed mean of M-values normalization
    
    Parameters
    ----------
    matrix : array_like
        Matrix to normalize.
        
    Returns
    -------
    array_like
        Normalized matrix.
    """
    
    np.seterr(divide='ignore')      # for divide on zeros in log2
    
    def log2_tmm(index_vec):
        curr_vec = matrix[index_vec][(~np.isinf(matr_a[index_vec])) & (~np.isinf(matr_m[index_vec]))]
        bool_curr_vec = curr_vec > 0
        ref_vec = matrix[index_ref][(~np.isinf(matr_a[index_vec])) & (~np.isinf(matr_m[index_vec]))]
        bool_ref = ref_vec > 0
        ranks = rankdata(matr_a[index_vec][(~np.isinf(matr_a[index_vec])) & (~np.isinf(matr_m[index_vec]))], method='ordinal')
        bool_a = (ranks > len(ranks) * factor_a) & (ranks < len(ranks) * (1 - factor_a))
        ranks = rankdata(matr_m[index_vec][(~np.isinf(matr_a[index_vec])) & (~np.isinf(matr_m[index_vec]))], method='ordinal')
        bool_m = (ranks > len(ranks) * factor_m) & (ranks < len(ranks) * (1 - factor_m))
        bool_result = bool_curr_vec & bool_ref & bool_a & bool_m
        total_curr_vec = np.sum(curr_vec[bool_result])
        total_ref_vec = np.sum(ref_vec[bool_result])
        w_vec = ((total_curr_vec - curr_vec[bool_result]) / (total_curr_vec * curr_vec[bool_result]) + 
                 (total_ref_vec - ref_vec[bool_result]) / (total_ref_vec * ref_vec[bool_result]))
        m_vec = np.log2(curr_vec[bool_result] / total_curr_vec) / np.log2(ref_vec[bool_result] / total_ref_vec)
        try:
            res = np.sum(w_vec * m_vec, axis=0) / np.sum(w_vec, axis=0)
        except ZeroDivisionError:
            res = 0
        finally:
            return res
        
    f75 = percentile(matrix, 0.75)
    if refColumn is None:
        index_ref = np.argmin(abs(f75 - np.mean(f75)))

    tmp = matrix / np.sum(matrix, axis=0)
    matr_a = np.log2(tmp.mul(tmp[index_ref], axis=0)) / 2
    matr_m = np.log2(tmp.div(tmp[index_ref], axis=0))
    tmm_factor = 2 ** np.array([log2_tmm(i) for i in matrix])
    result = matrix / tmm_factor
    return matrix / tmm_factor

### Numeric test

In [114]:
matrix = pd.read_table('../test_data/dataset.txt', sep=' ', header=None)

Before:

In [153]:
matrix

Unnamed: 0,0,1,2
0,5,10,1
1,10,20,2
2,1,2,5
3,353,0,2
4,7,7,7
5,8,9,8


After:

In [154]:
normal_matrix = tmm_normalization(matrix, factor_m = 0.1)
normal_matrix

Unnamed: 0,0,1,2
0,2.117609,4.7188,0.5
1,4.235217,9.4376,1.0
2,0.423522,0.94376,2.5
3,149.503172,0.0,1.0
4,2.964652,3.30316,3.5
5,3.388174,4.24692,4.0


### Perfomance test

In [155]:
matrix = np.random.randint(0, 1000, size=(3000, 5000))

print("TrimmedMeanNormalization - ", end="")
%time normal_matrix = tmm_normalization(matrix)

TrimmedMeanNormalization - 

AttributeError: 'numpy.ndarray' object has no attribute 'mul'

Read matrix from .csv file and normalize it:

In [156]:
data = pd.read_csv("../data/srr1784310_subset.csv", index_col=0)
data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


In [158]:
normal_data = tmm_normalization(matrix)
normal_data.head()

AttributeError: 'numpy.ndarray' object has no attribute 'mul'