In [5]:
import pandas as pd
import cPickle as pkl

In [7]:
pd.options.display.max_rows = 100

In [13]:
all_time_tes_annotated = pkl.load(open("../../parameters/all_time_tes_annotated.df"))
len(all_time_tes_annotated)

3727

We want to calculate a table of $\mu$ and $\sigma$ per functional category and time.

How many genes a functional group should have at minimum:

In [27]:
min_genes = 50

In [28]:
all_time_tes_annotated.head()

Unnamed: 0,0,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300,Function
YBR177C,0.128333,0.133333,,0.043333,0.055833,0.034167,0.031,0.0375,0.091667,0.0775,0.123333,0.065,
YIL140W,0.098333,0.1,0.040833,0.048333,0.048333,0.031667,,,0.078333,,,,
YLR268W,0.135,0.11,0.060333,0.054048,0.055333,0.036458,0.03697,0.034167,0.088333,0.080556,0.12,0.072222,SNARE interactions in vesicular transport
YJL155C,0.115,,0.038333,0.053333,0.043333,0.035,0.025833,0.02,0.078333,0.08,0.1,0.075,Carbohydrate metabolism
YLR197W,0.143333,0.145,0.071322,0.069624,0.075333,0.048251,0.048145,0.050645,0.105,0.108512,0.139792,0.102849,Ribosome biogenesis in eukaryotes


In [29]:
functional_counts = all_time_tes_annotated.Function.value_counts(dropna=False)

We keep only functional groups with at least `min_genes` genes to calculate a better $\mu$ and $\sigma$.
We drop the unlabelled ones.

In [30]:
functional_counts[functional_counts >= min_genes].index.drop(u'')

Index([u'Other enzymes', u'Amino acid metabolism', u'Chromosome-related',
       u'Ribosome', u'Cofactor biosynthesis', u'Lipid and steroid metabolism',
       u'Chaperones and folding catalysts',
       u'Ribosome biogenesis in eukaryotes', u'Glycolysis', u'Spliceosome'],
      dtype='object')

In [32]:
all_time_tes_annotated_selected = all_time_tes_annotated[all_time_tes_annotated
                                                         .Function.isin(functional_counts[functional_counts >= 
                                                                                          min_genes].index.drop(u''))]
len(all_time_tes_annotated_selected)

831

In [20]:
all_time_tes_annotated_selected.groupby(['Function'], as_index=False).mean()

Unnamed: 0,Function,0,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300
0,Amino acid metabolism,0.126517,0.127096,0.05919,0.061137,0.063923,0.040562,0.040712,0.042214,0.091953,0.094946,0.121516,0.087235
1,Basal transcription factors,0.098458,0.098444,0.044166,0.046016,0.046824,0.029518,0.030149,0.030498,0.070825,0.070598,0.091257,0.064471
2,Carbohydrate metabolism,0.115404,0.115119,0.050334,0.0502,0.052946,0.034329,0.03272,0.03288,0.081291,0.079838,0.108852,0.074851
3,Cell cycle,0.085456,0.089636,0.040987,0.04018,0.041526,0.024847,0.026961,0.027069,0.063038,0.060246,0.073247,0.057771
4,Cell wall,0.140799,0.142782,0.073713,0.074639,0.079754,0.051572,0.053026,0.054203,0.109292,0.106402,0.132356,0.100528
5,Chaperones and folding catalysts,0.120317,0.121155,0.063392,0.065402,0.067502,0.043988,0.043895,0.044797,0.094547,0.094374,0.118083,0.089172
6,Chromosome-related,0.085173,0.086855,0.037941,0.039821,0.040434,0.02547,0.025144,0.026581,0.061771,0.06198,0.082989,0.058491
7,Cofactor biosynthesis,0.09797,0.097949,0.045801,0.047823,0.047251,0.029897,0.030791,0.031248,0.072486,0.070352,0.098655,0.067822
8,Cytoskeleton proteins,0.108229,0.104094,0.042882,0.043528,0.046285,0.028941,0.0284,0.029317,0.070224,0.066502,0.099848,0.072902
9,DNA replication complex,0.09799,0.101932,0.045131,0.049223,0.048989,0.030853,0.029691,0.032518,0.071469,0.071329,0.095606,0.066644


Strangely `std()` does not work.

In [26]:
all_time_tes_annotated_selected.groupby(['Function'], as_index=False).mean().rank(ascending=False)

Unnamed: 0,Function,0,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300
0,39.0,7.0,8.0,11.0,11.0,9.0,10.0,11.0,8.0,9.0,5.0,7.0,7.0
1,38.0,26.0,27.0,28.0,28.0,28.0,28.0,27.0,28.0,28.0,27.0,29.0,29.0
2,37.0,12.0,15.0,22.0,24.0,21.0,19.0,24.0,25.0,20.0,20.0,17.0,20.0
3,36.0,33.0,33.0,32.0,34.0,33.0,37.0,33.0,33.0,34.0,37.0,38.0,35.0
4,35.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0
5,34.0,10.0,11.0,4.0,5.0,4.0,5.0,5.0,6.0,7.0,6.0,10.0,6.0
6,33.0,34.0,35.0,36.0,35.0,37.0,34.0,35.0,35.0,36.0,34.0,34.0,34.0
7,32.0,28.0,28.0,26.0,27.0,27.0,27.0,26.0,27.0,26.0,28.0,26.0,27.0
8,31.0,20.0,23.0,30.0,30.0,29.0,30.0,31.0,31.0,29.0,30.0,25.0,22.0
9,30.0,27.0,24.0,27.0,25.0,26.0,26.0,29.0,26.0,27.0,26.0,28.0,28.0


In [33]:
all_time_tes_annotated_selected.groupby(['Function'], as_index=False).mean().rank(ascending=False).plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f077d1146d0>

https://stats.stackexchange.com/questions/29560/can-i-compare-ordinal-rankings-and-if-so-how