# Average and rank analysis by functional group

In [1]:
import pandas as pd
import cPickle as pkl
import matplotlib
# Select the pgf backend; this happens before pyplot is imported below.
matplotlib.use("pgf")
# Font settings that are merged into matplotlib's global rcParams.
pgf_with_rc_fonts = {
    "font.family": "serif",
    "font.serif": [u'Adobe Caslon Pro'],                   # serif font used for figure text
    "font.sans-serif": ["DejaVu Sans"], # use a specific sans-serif font
}
matplotlib.rcParams.update(pgf_with_rc_fonts)
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
%matplotlib inline

import seaborn as sns

In [2]:
# Let pandas print up to 100 rows before it truncates a table.
pd.set_option("display.max_rows", 100)

Protein synthesis rate per cell (not per transcript):

In [3]:
# Load the pickled, annotated table of per-gene values over time.
# The file is opened in binary mode inside a context manager so that the handle
# is closed again as soon as the pickle has been read.
# NOTE(review): unpickling executes arbitrary code -- only load files from a trusted source.
with open("../../parameters/all_time_tes_annotated.df", "rb") as pickle_file:
    all_time_tes_annotated = pkl.load(pickle_file)
len(all_time_tes_annotated)

3727

We want to calculate a table of $\mu$ and $\sigma$ per functional category and time.

How many genes a functional group should have at minimum:

In [4]:
# Minimum number of genes a functional group must contain to be kept.
# Alternative threshold kept for reference: min_genes = 35
min_genes = 50

In [5]:
# Preview the first rows: one row per gene, one column per time point, plus `Function`.
all_time_tes_annotated.head()

Unnamed: 0,0,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300,Function
YBR177C,0.128333,0.133333,,0.043333,0.055833,0.034167,0.031,0.0375,0.091667,0.0775,0.123333,0.065,
YIL140W,0.098333,0.1,0.040833,0.048333,0.048333,0.031667,,,0.078333,,,,
YLR268W,0.135,0.11,0.060333,0.054048,0.055333,0.036458,0.03697,0.034167,0.088333,0.080556,0.12,0.072222,SNARE interactions in vesicular transport
YJL155C,0.115,,0.038333,0.053333,0.043333,0.035,0.025833,0.02,0.078333,0.08,0.1,0.075,Carbohydrate metabolism
YLR197W,0.143333,0.145,0.071322,0.069624,0.075333,0.048251,0.048145,0.050645,0.105,0.108512,0.139792,0.102849,Ribosome biogenesis in eukaryotes


In [6]:
# Number of genes per functional group; dropna=False keeps missing labels in the tally.
function_labels = all_time_tes_annotated["Function"]
functional_counts = function_labels.value_counts(dropna=False)

We keep only functional groups with at least `min_genes` genes to calculate a better $\mu$ and $\sigma$.

We drop the unlabelled ones:

In [7]:
# Groups with at least `min_genes` genes, without the unlabelled (empty-string) bucket.
has_enough_genes = functional_counts >= min_genes
functional_counts[has_enough_genes].index.drop(u'')

Index([u'Other enzymes', u'Amino acid metabolism', u'Chromosome-related',
       u'Ribosome', u'Cofactor biosynthesis', u'Lipid and steroid metabolism',
       u'Chaperones and folding catalysts',
       u'Ribosome biogenesis in eukaryotes', u'Glycolysis', u'Spliceosome'],
      dtype='object')

In [8]:
# Show the 11 largest buckets: the unlabelled one plus the ten groups listed in the output below.
functional_counts.head(n=11)

                                     1859
Other enzymes                         122
Amino acid metabolism                 116
Chromosome-related                    112
Ribosome                              110
Cofactor biosynthesis                  82
Lipid and steroid metabolism           66
Chaperones and folding catalysts       63
Ribosome biogenesis in eukaryotes      55
Glycolysis                             53
Spliceosome                            52
Name: Function, dtype: int64

In [9]:
# Keep only the genes whose functional group passed the size threshold
# (the unlabelled, empty-string group is excluded).
kept_functions = functional_counts[functional_counts >= min_genes].index.drop(u'')
belongs_to_kept_function = all_time_tes_annotated.Function.isin(kept_functions)
all_time_tes_annotated_selected = all_time_tes_annotated[belongs_to_kept_function]
len(all_time_tes_annotated_selected)

831

In [10]:
# Preview the first rows of the filtered table.
all_time_tes_annotated_selected.head()

Unnamed: 0,0,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300,Function
YLR197W,0.143333,0.145,0.071322,0.069624,0.075333,0.048251,0.048145,0.050645,0.105,0.108512,0.139792,0.102849,Ribosome biogenesis in eukaryotes
YGR148C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Ribosome
YBR111W-A,0.0,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Chromosome-related
YHR007C,0.149444,0.138889,0.068333,0.068167,0.074417,0.046286,0.046468,0.048476,0.107273,0.103485,0.142222,0.10141,Lipid and steroid metabolism
YBR248C,0.150556,0.1325,0.073095,0.072222,0.068,0.054545,0.053333,0.051061,0.106,0.115238,0.135417,0.106,Amino acid metabolism


Average translational efficiency by function:

In [11]:
# Mean of every time column per functional group; grouping on `Function`
# directly leaves the group label as the index of the result.
means = all_time_tes_annotated_selected.groupby('Function').mean()

Display in color:

In [12]:
# Colour map for shading the table of means.
# It gets its own name so that `cm`, imported from matplotlib.pyplot at the top
# of the notebook, is not silently replaced by a seaborn palette.
# Alternative palette: sns.light_palette("blue", as_cmap=True)
table_cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

# Rounded means rendered with a background gradient.
means.round(decimals=3).style.background_gradient(cmap=table_cmap)

Unnamed: 0_level_0,0,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Amino acid metabolism,0.127,0.127,0.059,0.061,0.064,0.041,0.041,0.042,0.092,0.095,0.122,0.087
Chaperones and folding catalysts,0.12,0.121,0.063,0.065,0.068,0.044,0.044,0.045,0.095,0.094,0.118,0.089
Chromosome-related,0.085,0.087,0.038,0.04,0.04,0.025,0.025,0.027,0.062,0.062,0.083,0.058
Cofactor biosynthesis,0.098,0.098,0.046,0.048,0.047,0.03,0.031,0.031,0.072,0.07,0.099,0.068
Glycolysis,0.135,0.136,0.063,0.066,0.066,0.044,0.044,0.045,0.103,0.102,0.124,0.091
Lipid and steroid metabolism,0.11,0.111,0.053,0.055,0.055,0.035,0.036,0.036,0.084,0.084,0.107,0.077
Other enzymes,0.106,0.109,0.051,0.05,0.051,0.034,0.033,0.034,0.08,0.078,0.104,0.073
Ribosome,0.109,0.11,0.062,0.063,0.065,0.044,0.044,0.045,0.087,0.086,0.105,0.082
Ribosome biogenesis in eukaryotes,0.111,0.112,0.055,0.056,0.059,0.037,0.037,0.039,0.085,0.084,0.111,0.08
Spliceosome,0.105,0.1,0.048,0.051,0.052,0.033,0.033,0.035,0.075,0.076,0.101,0.07


In [13]:
# Emit the rounded table of means as LaTeX source.
print(means.round(decimals=3).to_latex())

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
{} &   0    &   300  &   600  &   900  &   1200 &   1500 &   1800 &   2100 &   2400 &   2700 &   3000 &   3300 \\
Function                          &        &        &        &        &        &        &        &        &        &        &        &        \\
\midrule
Amino acid metabolism             &  0.127 &  0.127 &  0.059 &  0.061 &  0.064 &  0.041 &  0.041 &  0.042 &  0.092 &  0.095 &  0.122 &  0.087 \\
Chaperones and folding catalysts  &  0.120 &  0.121 &  0.063 &  0.065 &  0.068 &  0.044 &  0.044 &  0.045 &  0.095 &  0.094 &  0.118 &  0.089 \\
Chromosome-related                &  0.085 &  0.087 &  0.038 &  0.040 &  0.040 &  0.025 &  0.025 &  0.027 &  0.062 &  0.062 &  0.083 &  0.058 \\
Cofactor biosynthesis             &  0.098 &  0.098 &  0.046 &  0.048 &  0.047 &  0.030 &  0.031 &  0.031 &  0.072 &  0.070 &  0.099 &  0.068 \\
Glycolysis                        &  0.135 &  0.136 &  0.063 &  0.066 &  0.066 &  0.044 &  0.044 &  0.045 &  0.1

Strangely `std()` does not work.

Show only ranks of means to reduce the noise and the effect of different scales:

Average across time:

In [14]:
# Average each group's mean over all time columns and list the groups from highest to lowest.
means.mean(axis=1).sort_values(ascending=False)

Function
Glycolysis                           0.084775
Chaperones and folding catalysts     0.080552
Amino acid metabolism                0.079750
Ribosome                             0.075172
Ribosome biogenesis in eukaryotes    0.072311
Lipid and steroid metabolism         0.070110
Other enzymes                        0.066889
Spliceosome                          0.064699
Cofactor biosynthesis                0.061504
Chromosome-related                   0.052721
dtype: float64

In [15]:
# Rank the groups within each time column; ascending=False gives rank 1 to the highest mean.
ranks = means.rank(ascending=False)
ranks

Unnamed: 0_level_0,0,300,600,900,1200,1500,1800,2100,2400,2700,3000,3300
Function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Amino acid metabolism,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,2.0,2.0,3.0
Chaperones and folding catalysts,3.0,3.0,1.0,2.0,1.0,2.0,2.0,3.0,2.0,3.0,3.0,2.0
Chromosome-related,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
Cofactor biosynthesis,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
Glycolysis,1.0,1.0,2.0,1.0,2.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0
Lipid and steroid metabolism,5.0,5.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,5.0,6.0
Other enzymes,7.0,7.0,7.0,8.0,8.0,7.0,7.0,8.0,7.0,7.0,7.0,7.0
Ribosome,6.0,6.0,3.0,3.0,3.0,1.0,1.0,2.0,4.0,4.0,6.0,4.0
Ribosome biogenesis in eukaryotes,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0
Spliceosome,8.0,8.0,8.0,7.0,7.0,8.0,8.0,7.0,8.0,8.0,8.0,8.0


In [16]:
# Ranks in the last time column (3300).
ranks[3300]

Function
Amino acid metabolism                 3.0
Chaperones and folding catalysts      2.0
Chromosome-related                   10.0
Cofactor biosynthesis                 9.0
Glycolysis                            1.0
Lipid and steroid metabolism          6.0
Other enzymes                         7.0
Ribosome                              4.0
Ribosome biogenesis in eukaryotes     5.0
Spliceosome                           8.0
Name: 3300, dtype: float64

In [None]:
# Functional groups ordered by their rank in the 3300 column (best rank first).
# `DataFrame.sort` is deprecated and removed in later pandas releases;
# `sort_values` is the supported equivalent.
ranks.sort_values(by=3300).index

  


Index([u'Glycolysis', u'Chaperones and folding catalysts',
       u'Amino acid metabolism', u'Ribosome',
       u'Ribosome biogenesis in eukaryotes', u'Lipid and steroid metabolism',
       u'Other enzymes', u'Spliceosome', u'Cofactor biosynthesis',
       u'Chromosome-related'],
      dtype='object', name=u'Function')

In [None]:
# Plot the mean translational efficiency of every functional group over time
# (log-scaled y axis) and save the figure.
plt.close('all')

font = {'family': 'serif', 'size': 16}
matplotlib.rc('font', **font)

plt.rcParams['figure.figsize'] = (10, 10)
plt.rcParams['axes.edgecolor'] = 'black'
plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['xtick.color'] = 'black'
plt.rcParams['ytick.color'] = 'black'
plt.rcParams['grid.color'] = 'grey'
plt.rcParams['grid.alpha'] = 0.5
plt.rcParams['grid.linestyle'] = '-.'

# One line per functional group, time on the x axis.
ax = means.transpose().plot(linewidth=2.3)
ax.set_yscale('log')
# Ascending limits: the earlier combination of reversed limits followed by
# invert_yaxis() cancelled out to exactly this orientation.
ax.set_ylim([0.02, 0.2])

ax.set_xlabel('time [s]')
ax.set_ylabel('Mean of translational efficiency')
ax.grid()

handles, labels = ax.get_legend_handles_labels()

# Order the legend by the rank in the 3300 column. Handles are looked up by
# their label, so labels and handles cannot get out of step and tied ranks
# never force a comparison between line artists.
legend_order = list(ranks[3300].sort_values(kind='mergesort').index)
handle_by_label = dict(zip(labels, handles))
legend_handles = [handle_by_label[name] for name in legend_order]

ax.legend(legend_handles, legend_order, loc='center left', bbox_to_anchor=(0.99, 0.5),
          labelspacing=0.93, frameon=False)

# Saved under its own name: the rank plot is written to
# "ribosome_efficiency_ranking_vs_time.pgf" and would otherwise overwrite this figure.
plt.savefig("ribosome_efficiency_means_vs_time.pgf", bbox_inches='tight')



In [None]:
# (functional group, value) pairs in a fixed order. In this notebook only the
# order and the names are used, to assign one colour per group.
# NOTE(review): the numbers look like they were copied from the notebook linked
# below -- confirm their origin.
sorted_functions = [('Ribosome', 0.12628066790927703),
 ('Glycolysis', 0.08185288434831853),
 ('Chaperones and folding catalysts', 0.07145962745808186),
 ('Amino acid metabolism', 0.062478004030412454),
 ('Ribosome biogenesis in eukaryotes', 0.0562636947388247),
 ('Lipid and steroid metabolism', 0.05269564143716751),
 ('Other enzymes', 0.05184456762611764),
 ('Cofactor biosynthesis', 0.04967116015179395),
 ('Spliceosome', 0.048536242667356795),
 ('Chromosome-related', 0.04100702207808541)]

See https://github.com/gittenberg/TRSL/blob/master/workbooks/analyses/07a%20TRSL_analyse_time-resolved_translation_efficiencies%20diff.%20cut-off%20and%20colors.ipynb

In [None]:
# Map every functional group to the "Paired" colour at its position in `sorted_functions`.
paired_cmap = plt.get_cmap("Paired")
colordict = {name: paired_cmap(position) for position, (name, _) in enumerate(sorted_functions)}

In [None]:
# Plot the rank of every functional group over time (rank 1 at the top) with
# the fixed per-group colours and save the figure as pgf.
plt.close('all')

font = {'family': 'serif', 'size': 16}
matplotlib.rc('font', **font)

plt.rcParams['figure.figsize'] = (10, 10)
plt.rcParams['axes.edgecolor'] = 'black'
plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['xtick.color'] = 'black'
plt.rcParams['ytick.color'] = 'black'
plt.rcParams['grid.color'] = 'grey'
plt.rcParams['grid.alpha'] = 0.5
plt.rcParams['grid.linestyle'] = '-.'

# One line per functional group; groups without an assigned colour fall back to dark grey.
ranks_by_time = ranks.transpose()
line_colors = [colordict.get(name, '#333333') for name in ranks_by_time.columns]
ax = ranks_by_time.plot(linewidth=2.3, color=line_colors)

ax.set_xlabel('time [s]')
ax.set_ylabel('Rank of translational efficiency')
ax.invert_yaxis()  # rank 1 is drawn at the top
ax.set_yticks(range(1, len(ranks) + 1, 1))
ax.grid()

handles, labels = ax.get_legend_handles_labels()

# Order the legend by the rank in the 1800 column. `sort_values` replaces the
# deprecated `DataFrame.sort`; handles are looked up by their label, so labels
# and handles cannot get out of step and tied ranks never force a comparison
# between line artists.
legend_order = ranks[1800].sort_values(kind='mergesort').index
print(legend_order)
handle_by_label = dict(zip(labels, handles))
legend_handles = [handle_by_label[name] for name in legend_order]

ax.legend(legend_handles, list(legend_order), loc='center left', bbox_to_anchor=(0.99, 0.5),
          labelspacing=0.93, frameon=False)

plt.savefig("ribosome_efficiency_ranking_vs_time.pgf", bbox_inches='tight')

Same with Computer Modern for paper:

In [None]:
from matplotlib.pyplot import cm

Some manual adjustments:

In [None]:
# Exchange the colours of 'Ribosome' and 'Lipid and steroid metabolism'.
# NOTE: this is a swap, so running the cell a second time undoes it.
colordict['Ribosome'], colordict['Lipid and steroid metabolism'] = \
colordict['Lipid and steroid metabolism'], colordict['Ribosome']

In [None]:
# Inspect the current mapping from functional group to colour.
colordict

https://stackoverflow.com/questions/47104862/pandas-dataframe-plot-colors-by-column-name

In [None]:
# Preview the transposed rank table (time points as rows, one column per functional group).
ranks.T.head()

In [None]:
# Plot the rank of every functional group over time (rank 1 at the top) with
# text rendered through LaTeX, and save the figure as pdf.
plt.close('all')

plt.rc('text', usetex=True)
plt.rc('font', family='serif')

plt.rcParams['figure.figsize'] = (10, 10)
plt.rcParams['axes.edgecolor'] = 'black'
plt.rcParams['axes.labelcolor'] = 'black'
plt.rcParams['xtick.color'] = 'black'
plt.rcParams['ytick.color'] = 'black'
plt.rcParams['grid.color'] = 'grey'
plt.rcParams['grid.alpha'] = 0.5
plt.rcParams['grid.linestyle'] = '-.'

# One line per functional group; groups without an assigned colour fall back to dark grey.
ranks_by_time = ranks.transpose()
line_colors = [colordict.get(name, '#333333') for name in ranks_by_time.columns]
ax = ranks_by_time.plot(linewidth=2.3, color=line_colors)

ax.set_xlabel('time [s]')
ax.set_ylabel('Rank of translational efficiency')
ax.invert_yaxis()  # rank 1 is drawn at the top
ax.set_yticks(range(1, len(ranks) + 1, 1))
ax.grid()

handles, labels = ax.get_legend_handles_labels()

# Order the legend by the rank in the 1800 column. `sort_values` replaces the
# deprecated `DataFrame.sort`; handles are looked up by their label, so labels
# and handles cannot get out of step and tied ranks never force a comparison
# between line artists.
legend_order = list(ranks[1800].sort_values(kind='mergesort').index)
handle_by_label = dict(zip(labels, handles))
legend_handles = [handle_by_label[name] for name in legend_order]

ax.legend(legend_handles, legend_order, loc='center left', bbox_to_anchor=(0.99, 0.5),
          labelspacing=0.93, frameon=False)

plt.savefig("ribosome_efficiency_ranking_vs_time.pdf", bbox_inches='tight')

TODO: look at Kendall's $\tau$ as per

https://stats.stackexchange.com/questions/29560/can-i-compare-ordinal-rankings-and-if-so-how