    Copyright (C) 2022 Allen Buskirk

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

In [1]:
import pandas as pd #used for constructing dataframes
import numpy as np
import pickle #used to save generated plots

#plotting with Bokeh:
from bokeh.io import output_notebook, push_notebook, export, curdoc
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CDSView, HoverTool, LinearColorMapper, FixedTicker, ColorBar, NumeralTickFormatter
from bokeh.layouts import column, gridplot
from bokeh.transform import linear_cmap
import colorcet as cc
output_notebook()

#adding iPython interaction to bokeh plots
from IPython.display import display
from ipywidgets import interact, IntSlider, ToggleButtons, SelectionSlider


In [2]:
# Sample names: four conditions (4SU, 4SU_UV, UNT, UV) x three replicates (R1-R3),
# listed condition by condition. Each name is the prefix of a density pickle file.
filelist = [
    f"RIBO_{condition}_R{replicate}"
    for condition in ("4SU", "4SU_UV", "UNT", "UV")
    for replicate in (1, 2, 3)
]

In [3]:
### this transcript dataframe has all the information needed for translation
### transcript_id is also the key for the ribosome density as a list in the density dictionaries below
### columns (see the head() output): transcript_id, chrom, strand, start, stop, spliced_len, seq

# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced yourself.
# The context manager closes the file handle as soon as the table has been read.
with open("/home/allen/code/4suv/github_4suv/tx_df_pkl", "rb") as tx_file:
    tx_df = pickle.load(tx_file)
print("Pickled tx_df loaded:", len(tx_df), "transcripts")
tx_df.head()

Pickled tx_df loaded: 19580 transcripts


Unnamed: 0,transcript_id,chrom,strand,start,stop,spliced_len,seq
0,ENST00000367770.5,1,-,48,2276,2916,CTGCTTGGCTTTGAGGAAGAGTGGCAGTACTGCCTCACTGCATAAG...
1,ENST00000286031.10,1,+,700,3261,4355,GGCTTTGGCCCTGGAAAGCCTCGCGGACGTGTTCTGACCCAAGGTT...
2,ENST00000374004.5,1,-,146,1735,2350,GGGAGGACCCCAATCTAGGCCCAAGAGGGAAAGCCACGTGCCTGTA...
3,ENST00000359637.2,1,+,62,1219,1454,GAGGAGAACTGGACGTTGTGAACAGAGTTAGCTGGTAAATGTCCTC...
4,ENST00000374409.5,1,-,255,1259,2804,TCGTCACAGCCATGAGTGAGACTTGAAGCCCGTTTTACGTATGAAG...


In [4]:
def run_metagenes(density=None, transcripts=None, upstream=None, downstream=None,
                  min_rpc=None, anchor=None):
    """Average the normalized density around a landmark over all qualifying transcripts.

    For every transcript whose mean density (per nucleotide, times 3; printed as
    'rpc') exceeds the threshold, the window [pos - upstream, pos + downstream) is
    cut out of its density profile, scaled so that it sums to 1, and accumulated.
    The accumulated windows are divided by the number of windows and multiplied by
    the window length, so the returned profile has a mean of 1.

    Every argument is optional. When an argument is left as None the value of the
    corresponding notebook-level variable is used, so a bare ``run_metagenes()``
    call behaves as before.

    Args:
        density: mapping transcript_id -> 1-D density profile (default: global ``dd``).
        transcripts: table with a ``transcript_id`` column and a column named by
            ``anchor`` (default: global ``tx_df``).
        upstream: number of positions taken before the anchor (default: global ``up``).
        downstream: number of positions taken from the anchor on (default: global ``down``).
        min_rpc: a transcript is used only if its rpc is strictly greater than this
            (default: global ``threshold``).
        anchor: name of the column holding the anchor position, e.g. 'start' or
            'stop' (default: global ``position``).

    Returns:
        numpy float array of length ``upstream + downstream``. If no transcript
        contributes a window the array is filled with NaN.

    Raises:
        KeyError: if a transcript_id of the table is missing from ``density``.
    """
    # Fall back to the notebook globals for anything the caller did not pass in.
    density = dd if density is None else density
    transcripts = tx_df if transcripts is None else transcripts
    upstream = up if upstream is None else upstream
    downstream = down if downstream is None else downstream
    min_rpc = threshold if min_rpc is None else min_rpc
    anchor = position if anchor is None else anchor

    width = upstream + downstream
    avgwin = np.zeros(width, dtype=float)
    counter = 0
    tooshort = 0
    above_thresh = 0

    # Walk ids and anchor positions together: one pass over the table instead of a
    # boolean-mask lookup per transcript, and no int() call on a whole Series.
    for tr, pos in zip(transcripts["transcript_id"], transcripts[anchor]):
        profile = np.asarray(density[tr])
        if profile.size == 0:
            # nothing to average, and the rpc below would divide by zero
            continue

        rpc = profile.sum() / profile.size * 3
        if not rpc > min_rpc:
            continue
        above_thresh += 1

        pos = int(pos)
        if pos - upstream < 0 or pos + downstream > profile.size:
            # the window would run off one end of the transcript
            tooshort += 1
            continue

        window = profile[pos - upstream:pos + downstream].astype(float)
        total = window.sum()
        if total == 0.0:
            continue
        avgwin += window / total  # normalize each window to the same amount
        counter += 1

    print('There were', above_thresh, 'genes above the threshold', min_rpc, 'rpc')
    print(counter, 'transcripts included in average')
    print(tooshort, 'transcripts skipped because the window did not fit')

    if counter == 0:
        # Same values the division by zero used to produce, without the runtime warning.
        print('No transcript contributed a window; returning NaN')
        return np.full(width, np.nan)

    avgwin /= counter
    avgwin *= width
    return avgwin

In [5]:
# Window settings read by run_metagenes() as notebook-level variables.
up = 100               # positions kept before the anchor
down = 1000            # positions kept from the anchor on
threshold = 0.1        # minimum rpc for a transcript to be used
position = 'start'     #start or stop

# sample name -> averaged profile returned by run_metagenes()
out_dict = {}

for fn in filelist:
    # dd must stay a notebook-level name: run_metagenes() picks it up from there.
    # NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
    # The context manager closes each density file before the next one is opened.
    with open("/home/allen/data/4suv/density/all/"+fn+"_density_pkl", "rb") as density_file:
        dd = pickle.load(density_file)
    print(fn)
    out_dict[fn] = run_metagenes()


RIBO_4SU_R1
There were 6680 genes above the threshold 0.1 rpc
3267 transcripts included in average
RIBO_4SU_R2
There were 6938 genes above the threshold 0.1 rpc
3446 transcripts included in average
RIBO_4SU_R3
There were 6856 genes above the threshold 0.1 rpc
3379 transcripts included in average
RIBO_4SU_UV_R1
There were 7735 genes above the threshold 0.1 rpc
3971 transcripts included in average
RIBO_4SU_UV_R2
There were 4336 genes above the threshold 0.1 rpc
1868 transcripts included in average
RIBO_4SU_UV_R3
There were 5864 genes above the threshold 0.1 rpc
2760 transcripts included in average
RIBO_UNT_R1
There were 6171 genes above the threshold 0.1 rpc
2944 transcripts included in average
RIBO_UNT_R2
There were 7065 genes above the threshold 0.1 rpc
3522 transcripts included in average
RIBO_UNT_R3
There were 7327 genes above the threshold 0.1 rpc
3664 transcripts included in average
RIBO_UV_R1
There were 5921 genes above the threshold 0.1 rpc
2790 transcripts included in average
RI

In [6]:
# x axis in nucleotides relative to the anchor position: -up ... down-1
x_val = list(range(-up, down))

p = figure(sizing_mode="stretch_width", height=300, title='meta-start',
           y_axis_label='avg ribosome density', x_axis_label='nt from start')

# (substring of the sample name, line colour), tested in this order for every sample.
# "O_UV" is the tail of "RIBO_UV": it matches the UV-only samples but not "..._4SU_UV_...".
condition_colors = (
    ("UNT", 'black'),
    ("4SU_UV", 'red'),
    ("O_UV", 'green'),
    ("4SU_R", 'blue'),
)

for fn in filelist:
    for tag, line_color in condition_colors:
        if tag in fn:
            p.line(x_val, out_dict[fn], color=line_color, line_width=2)

show(p)



In [86]:
### write the per-nucleotide metagene profiles (the curves plotted above) to CSV,
### one column per sample
df1 = pd.DataFrame(data=out_dict)
df1.to_csv(path_or_buf='/home/allen/data/4suv/analyses/all_start_meta.csv')

In [7]:
### this smooths the data with a 3 nt window, now showing per codon instead of per nt

# For every sample: mean of consecutive, non-overlapping 3-nt windows of the profile,
# starting at index 9 and stopping before index 1097 (363 values per sample).
# NOTE(review): the bounds are hard-coded and presumably assume up=100 / down=1000 -- confirm
# before changing the window settings.
codon_dict = {
    fn: [sum(out_dict[fn][first:first + 3]) / 3 for first in range(9, 1097, 3)]
    for fn in filelist
}

In [8]:
# x axis in codons. The first smoothed value is labelled -30, as before; the number of
# points is taken from the data so that x and y always have the same length
# (a fixed range(-30, 300) has 330 points and does not match series of another length).
first_codon = -30
n_codons = max((len(profile) for profile in codon_dict.values()), default=0)
x_val = list(range(first_codon, first_codon + n_codons))

p = figure(sizing_mode="stretch_width", height=300, title='meta-start',
           y_axis_label='avg ribosome density', x_axis_label='codons from start')

# (substring of the sample name, line colour), tested in this order for every sample.
# "O_UV" is the tail of "RIBO_UV": it matches the UV-only samples but not "..._4SU_UV_...".
condition_colors = (
    ("UNT", 'black'),
    ("4SU_UV", 'red'),
    ("O_UV", 'green'),
    ("4SU_R", 'blue'),
)

for fn in filelist:
    profile = codon_dict[fn]
    for tag, line_color in condition_colors:
        if tag in fn:
            # slice in case a series is shorter than the longest one
            p.line(x_val[:len(profile)], profile, color=line_color, line_width=2)

show(p)





In [89]:
### write the codon-smoothed profiles to CSV, one column per sample
### -- these were then plotted for publication

df2 = pd.DataFrame(data=codon_dict)
df2.to_csv(path_or_buf='/home/allen/data/4suv/analyses/all_start_meta_codons.csv')