## Analyze time-resolved transcriptome data

In [7]:
import csv
import pandas as pd
import cPickle as pkl

from pygenome import sg

First we get all the systematic names from `yeastgenome.org`:

In [9]:
# Load the pickled ORF dictionary; its keys are used below as the set of
# known systematic gene names.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
# Pickles are opened in binary mode, and the `with` block closes the file handle.
with open("../parameters/orf_coding.p", "rb") as orf_file:
    orf_genomic_dict = pkl.load(orf_file)
len(orf_genomic_dict)

5917

Some appear to be missing (more than 6000 were expected).

In [82]:
filename = "../data/nostress_WT_R1.csv"

# Collect one list of abundances per gene and build the DataFrame once at the
# end; inserting thousands of columns one at a time into a DataFrame is slow.
values_by_gene = {}
gene_order = []     # genes in first-seen order; becomes the column order
skipped_genes = []  # first-column labels of rows that could not be processed

with open(filename, mode='r') as infile:
    reader = csv.reader(infile, delimiter=',')
    # Header row: the first cell labels the gene column, the rest are time points.
    times = [int(float(val)) for val in next(reader)[1:]]
    ntimes = len(times)
    for rows in reader:
        try:
            # convert to systematic names
            colname = rows[0] if rows[0] in orf_genomic_dict else sg.systematic_name(rows[0])
            # the time point -1 (first data column) is not needed
            transcript_values = [float(rows[i]) for i in range(2, ntimes + 1)]
        except Exception:
            # Best effort: a row with an unresolvable name, a non-numeric value
            # or too few cells is skipped, but it is recorded instead of being
            # dropped silently so the loss can be inspected afterwards.
            skipped_genes.append(rows[0] if rows else None)
            continue
        if colname not in values_by_gene:
            gene_order.append(colname)
        # A repeated name overwrites the earlier values but keeps its column position.
        values_by_gene[colname] = transcript_values

# One column per gene, one row per retained time point (default integer row labels).
transcripts = pd.DataFrame(values_by_gene, columns=gene_order)

TODO: to normalize or not to normalize?

Create row labels (time points):

In [83]:
# Row labels are the sampled time points. The first header entry is dropped
# because its data column was skipped when the values were read.
time_labels = times[1:]
transcripts.index = time_labels

In [84]:
# Number of gene columns that were loaded. The call form of print produces the
# same output under Python 2 and also runs under Python 3.
print(len(transcripts.columns))

5795


In [85]:
# Preview: rows up to and including time label 25 (label-based via .loc) and
# the first ten gene columns (position-based via .iloc). The deprecated .ix
# indexer mixed both lookups implicitly and no longer exists in current pandas.
transcripts.loc[:25].iloc[:, :10]

Unnamed: 0,YAL001C,YAL002W,YAL003W,YAL005C,YAL007C,YAL008W,YAL009W,YAL010C,YAL011W,YAL012W
0,4,5,227,386.259074,18,9,7,3,7,228
10,5,4,180,233.285132,23,14,7,1,5,245
15,5,5,256,234.858509,25,13,10,3,8,303
20,7,5,291,283.728916,27,10,12,2,8,312
25,5,4,224,186.376147,17,10,14,1,2,181


Calculate measure of transcriptional load:

Attempt 1: by weighting transcript abundance with transcript length

In [86]:
# Map every key of orf_genomic_dict to the length of its entry
# (presumably the ORF sequence -- verify against the pickle's contents).
orf_lengths = {name: len(entry) for name, entry in orf_genomic_dict.items()}

In [89]:
# Weight each gene's abundance by its ORF length in a single vectorised step
# instead of growing the result frame one column at a time.
# The lengths are looked up with [] on purpose: a gene without a known length
# still raises KeyError rather than silently producing NaN.
lengths_by_column = pd.Series(
    [orf_lengths[gene] for gene in transcripts.columns],
    index=transcripts.columns,
)
# Multiply every column by the length stored under the same gene name.
tr_load_l = transcripts.mul(lengths_by_column, axis="columns")

In [90]:
# Preview: rows up to and including time label 25 (label-based via .loc) and
# the first ten gene columns (position-based via .iloc). The deprecated .ix
# indexer mixed both lookups implicitly and no longer exists in current pandas.
tr_load_l.loc[:25].iloc[:, :10]

Unnamed: 0,YAL001C,YAL002W,YAL003W,YAL005C,YAL007C,YAL008W,YAL009W,YAL010C,YAL011W,YAL012W
0,13932,19125,140967,745093.753746,11664,5373,5460,4446,13146,270180
10,17415,15300,111780,450007.019628,14904,8358,5460,1482,9390,290325
15,17415,19125,158976,453042.063861,16200,7761,7800,4446,15024,359055
20,24381,19125,180711,547313.078964,17496,5970,9360,2964,15024,369720
25,17415,15300,139104,359519.587563,11016,5970,10920,1482,3756,214485


Attempt 2: by weighting transcript abundance with initiation probability

In [91]:
# Load the pickled per-gene initiation rates (looked up by gene name below).
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
# Pickles are opened in binary mode, and the `with` block closes the file handle.
with open('../parameters/init_rates_plotkin.p', 'rb') as rates_file:
    init_rates_plotkin = pkl.load(rates_file)

In [95]:
# Weight each gene's abundance by its initiation rate in a single vectorised
# step instead of growing the result frame one column at a time.
# Genes without an entry in init_rates_plotkin are left out of the result.
genes_with_rate = [gene for gene in transcripts.columns if gene in init_rates_plotkin]
rates_by_column = pd.Series(
    [init_rates_plotkin[gene] for gene in genes_with_rate],
    index=genes_with_rate,
)
# Select the covered genes (original column order is kept) and multiply every
# column by the rate stored under the same gene name.
tr_load_i = transcripts[genes_with_rate].mul(rates_by_column, axis="columns")

In [97]:
# Preview: rows up to and including time label 25 (label-based via .loc) and
# the first ten gene columns (position-based via .iloc). The deprecated .ix
# indexer mixed both lookups implicitly and no longer exists in current pandas.
tr_load_i.loc[:25].iloc[:, :10]

Unnamed: 0,YAL001C,YAL002W,YAL003W,YAL007C,YAL008W,YAL009W,YAL010C,YAL011W,YAL012W,YAL013W
0,3e-06,2e-06,0.001358,3.6e-05,2.7e-05,5e-06,3e-06,7e-06,0.001024,2e-05
10,4e-06,2e-06,0.001077,4.7e-05,4.2e-05,5e-06,1e-06,5e-06,0.001101,1.3e-05
15,4e-06,2e-06,0.001532,5.1e-05,3.9e-05,8e-06,3e-06,8e-06,0.001361,3e-05
20,5e-06,2e-06,0.001741,5.5e-05,3e-05,9e-06,2e-06,8e-06,0.001402,5e-05
25,4e-06,2e-06,0.00134,3.4e-05,3e-05,1.1e-05,1e-06,2e-06,0.000813,2e-05
