<span style="color:red">It's better to use headers:</span>

### Initialization

~~Import.~~ <span style="color:red">You don't need to write obvious comments.</span>

In [6]:
import numpy as np
import pandas as pd

Read input data.

In [7]:
# NOTE: disabled and kept for reference only. The dataset is loaded with
# pd.read_table in the "Numeric test" section instead of this manual reader.
# def readData(fileName):
#     file = open(fileName, "r")
#     matrix = []
#     for line in file:
#         matrix.append(np.array([int(i) for i in line.split()]))
#     file.close()
#     return np.array(matrix)

~~Function for total count normalization.~~ <span style="color:red">Use a [docstring](https://www.python.org/dev/peps/pep-0257/) to document functions. I prefer the [numpy style](http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_numpy.html). In such trivial cases comments aren't necessary, but for more complicated functions you have to write them.</span>

In [8]:
def totalCountNormalization(matrix):
    """
    Divide every entry of a matrix by the total of its column.

    Parameters
    ----------
    matrix : array_like
        Values to normalize. Objects that already provide a ``sum`` method
        (NumPy arrays, pandas DataFrames/Series) are used as-is; anything
        else, e.g. a nested list, is first converted with ``np.asarray``.

    Returns
    -------
    array_like
        ``matrix`` divided by ``matrix.sum(axis=0)``. The input object is
        not modified; for NumPy/pandas inputs the result has the same
        container type as the input.

    Notes
    -----
    Columns whose total is zero are not special-cased: the division by
    zero follows the usual NumPy/pandas rules (NaN or inf entries).
    """
    if not hasattr(matrix, "sum"):
        # Plain sequences (lists, tuples) have no ``.sum(axis=...)``;
        # promote them so the documented ``array_like`` contract holds.
        matrix = np.asarray(matrix)
    column_totals = matrix.sum(axis=0)
    return matrix / column_totals

### Numeric test

<span style="color:red">You don't need a manual read function:</span>

In [47]:
# Space-delimited text file with no header row, so columns get integer labels.
matrix = pd.read_csv(
    '../test_data/dataset.txt',
    sep=' ',
    header=None,
)

In [29]:
# Disabled: the tables are shown by the bare-expression cells that follow,
# which render a DataFrame as a table rather than as plain text.
# print("Before =\n", matrix, end='\n\n')
# normal_matrix = totalCountNormalization(matrix)
# print("After =\n", normal_matrix)

<span style="color:red">"print" doesn't work well for a DataFrame. Also, you don't need to print constant strings, as you have markdown comments:</span>

Before:

In [46]:
matrix

Unnamed: 0,0,1,2
0,5,10,1
1,10,20,2
2,1,2,5
3,353,0,2
4,7,7,7
5,9,9,9
6,8,9,8
7,1,4,1


After:

In [34]:
# The result is bound to its own name; `matrix` keeps the unnormalized values.
normal_matrix = totalCountNormalization(matrix)
normal_matrix

Unnamed: 0,0,1,2
0,0.01269,0.163934,0.028571
1,0.025381,0.327869,0.057143
2,0.002538,0.032787,0.142857
3,0.895939,0.0,0.057143
4,0.017766,0.114754,0.2
5,0.022843,0.147541,0.257143
6,0.020305,0.147541,0.228571
7,0.002538,0.065574,0.028571


### Performance test

In [19]:
matrix = np.random.randint(0, 1000, size=(3000, 5000))

print("totalCountNormalization - ", end="")
%time normal_matrix = totalCountNormalization(matrix)

totalCountNormalization - CPU times: user 118 ms, sys: 18 ms, total: 136 ms
Wall time: 135 ms


In [27]:
matrix

Unnamed: 0,0,1,2
0,5,10,1
1,10,20,2
2,1,2,5
3,353,0,2
4,7,7,7
5,9,9,9
6,8,9,8
7,1,4,1


Read matrix from .csv file and normalize it:

<span style="color:red">Use the *"index_col"* argument so that the first column is read as row labels (the index) instead of as a data column:</span>

In [38]:
# index_col=0 makes the first CSV column the row index rather than a data column.
data = pd.read_csv("../data/srr1784310_subset.csv", index_col=0)
data.head()  # preview the first rows only; the table is very wide

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,49,50,48,33,47,42,40,21,171,28,...,4,1,3,5,8,5,2,3,4,12
uc009vew.1,50,45,57,28,72,59,44,40,155,58,...,10,2,4,3,1,4,4,6,5,10
Lars2,90,110,89,113,77,81,57,91,62,71,...,11,14,15,10,10,17,18,20,5,24
Hsp90ab1,143,103,104,112,111,90,94,125,55,84,...,12,13,11,6,15,3,14,9,10,11
Ptma,67,136,85,66,96,73,74,80,34,64,...,11,10,5,3,10,7,14,7,16,8


<span style="color:red">A DataFrame works similarly to an np.array, so you don't need to convert the type:</span>

In [39]:
# The DataFrame is passed in directly; the result keeps its row and column labels.
normal_data = totalCountNormalization(data)
normal_data.head()

Unnamed: 0,AGCACCTCTAAGCTTCT,GAGACAGATACGCTAGTC,CCAACCGTCGATTGAT,TGATATTGCCTAACAATCC,ATATGCATACTAGGAT,GACTAGACCCAAACGCCT,ACCTTGCCAAACCTCC,GATGACCCTCACCTTGCC,AATATACCTATATGCAT,GATAACCATCCCTCGTCT,...,TGAATGCATGGGGTTAGTG,TGAAGCGTAGGGAACGATT,CCCATCTGTTATCTGT,ATCATGAGGTAGTCTAG,TGATACGTGCTTGACGGAC,AGGTCACAGGCATGGGT,TGAGTTCTGTTGGGAACCT,GAGGTCCCTTCGACTCCT,GATTAGACCGGCTTAC,GTTCAACTGGTTAGTG
uc009vfc.1,0.001454,0.001404,0.001463,0.001231,0.001563,0.001743,0.001677,0.000813,0.007715,0.001302,...,0.001227,0.00028,0.001045,0.0017,0.002315,0.001548,0.000595,0.001002,0.001246,0.003578
uc009vew.1,0.001483,0.001263,0.001737,0.001044,0.002394,0.002448,0.001844,0.001549,0.006993,0.002697,...,0.003067,0.000559,0.001393,0.00102,0.000289,0.001238,0.00119,0.002005,0.001558,0.002982
Lars2,0.00267,0.003088,0.002713,0.004214,0.00256,0.003361,0.002389,0.003523,0.002797,0.003302,...,0.003374,0.003915,0.005225,0.0034,0.002894,0.005262,0.005356,0.006682,0.001558,0.007156
Hsp90ab1,0.004243,0.002892,0.00317,0.004177,0.003691,0.003735,0.00394,0.00484,0.002481,0.003906,...,0.003681,0.003635,0.003831,0.00204,0.00434,0.000929,0.004165,0.003007,0.003115,0.00328
Ptma,0.001988,0.003818,0.002591,0.002461,0.003192,0.003029,0.003102,0.003098,0.001534,0.002976,...,0.003374,0.002796,0.001742,0.00102,0.002894,0.002167,0.004165,0.002339,0.004984,0.002385


<span style="color:red">Validation ("isclose" is a proper way to check equality of float values):</span>

In [43]:
# After normalization every column should total 1; the comparison uses a
# tolerance because the column sums are floating-point values.
np.isclose(normal_data.sum(axis=0), 1).all()

True