## Production rate analysis of PUNDs
Author: Robert Ietswaart  
Date: 20230427  
License: BSD2.  
Load modules j3dl and activate virtual environment using j4RNAdecay on O2.  
Python v3.7.4

Source: `KP_20220713.ipynb`  
For RNA flow project. 

In [1]:
import os
import re
import copy
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import logging
import argparse
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
import seaborn as sns


# from __init__ import __version__
from __init__ import default_logger_format, default_date_format

In [2]:
# Notebook stand-in for a script entry point: seed NumPy's global random
# generator and build the command-line parser (no options are registered).
np.random.seed(12345)

parser = argparse.ArgumentParser(
    description=(
        'Estimating the fraction of transcripts that get nuclear degraded, '
        'through production rate estimates kp.'
    )
)

# An explicit empty argument list is passed so that argparse does not fall back
# to sys.argv, which belongs to the Jupyter kernel when run from a notebook.
args = parser.parse_args([])

In [7]:
path = os.path.join('/n','groups','churchman','ri23','bseq','KP20230427')

# Add a logger specific to the project and processing stage.
logger = logging.getLogger('KP')
log_file = os.path.join(path,'LogErr', 'KP_20230427.log')
formatter = logging.Formatter(default_logger_format,
                              datefmt=default_date_format)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would attach one more FileHandler each time and every record would
# be written to the log file repeatedly. Reuse a handler that already targets
# this file if there is one; only create and attach a new one otherwise.
_log_file_abs = os.path.abspath(log_file)  # FileHandler stores its path in absolute form
log_handler = next(
    (h for h in logger.handlers
     if isinstance(h, logging.FileHandler) and h.baseFilename == _log_file_abs),
    None)
if log_handler is None:
    log_handler = logging.FileHandler(log_file)
    logger.addHandler(log_handler)
log_handler.setFormatter(formatter)

# Organisms (one-letter codes) and their full names as used in file names/logs.
organisms = ['m', 'h']
org_map = {'m': 'mouse', 'h': 'human'}

# Raw replicate labels, the reduced label each raw replicate maps to, and the
# reduced labels that belong to each organism.
reps = ['G', 'H', 'R', 'S', 'T', 'U']
red_r = {
    'G': 'G_R',
    'H': 'H_S',
    'R': 'G_R',
    'S': 'H_S',
    'T': 'T',
    'U': 'U',
}
org_red_reps = {'m': ['G_R', 'H_S'], 'h': ['T', 'U']}

fracs = ['chr', 'nuc', 'cyto', 'poly', 'tot']

# Rate domain bounds.
k_bound_lo = 1e-4  # unit: min^-1: 1 per 7 days
k_bound_hi = 1e4   # unit: min^-1: 1 per 6 ms

# Column-name prefix of the quantity being modelled; index 1 selects rates.
RATE_TYPE = ['half_life_', 'k_']
rt = RATE_TYPE[1]  # ML model dependent variable: rates

# Rate names, each prefixed with the selected rate type.
Timescales = [
    rt + ts
    for ts in (
        'chr',
        'chr_release',
        'nucdeg',
        'nucexp',
        'nuc',
        'cyto',
        'poly_entry',
        'whole_cell',
    )
]
TC_TYPES = ['top1000', 'bottom500']  # BM model turnover method to estimate TC
OUT_TYPES = ['.Mean', '.MAP', '.0.975.quantile', '.0.025.quantile']

# Which kp estimate to use (alternatives noted by the author: 'wc', '_MAP').
kp_compartment = 'chromatin'
kp_ot = '_Mean'

# Containers filled when the input tables are loaded.
B = {}   # Bayes fits file
K = {}   # Bayes Factor
P = {}   # kp tables, keyed by reduced replicate label
GS = {}  # GRAND-SLAM

# Load the input tables for every organism:
#   B[o]  - Bayes rate fits (TSV)
#   K[o]  - Bayes factor table (TSV)
#   GS[o] - GRAND-SLAM tables (CSV), keyed by reduced replicate + fraction + TC type
#   P[rr] - kp table (TSV) for each reduced replicate of the organism
for o in organisms:
    GS[o] = dict()
    bseq_dir = os.path.join('/n', 'groups', 'churchman', 'ri23', 'bseq')
    species = org_map[o]

    # GRAND-SLAM output sits in an organism-specific directory.
    if o == 'm':
        path_gs = os.path.join(bseq_dir, 'GS20210506')
    elif o == 'h':
        path_gs = os.path.join(bseq_dir, 'GS20210713_human')

    path_b = os.path.join(bseq_dir, 'Bayes20230128')
    filename_b = 'Bayes_Rates_20230128_{}.tsv'.format(species)
    path_k = os.path.join(bseq_dir, 'BayesFactor20221206')
    filename_k = 'Bayes_factor_20230317_{}_final.tsv'.format(species)

    B[o] = pd.read_csv(os.path.join(path_b, filename_b), sep='\t')
    K[o] = pd.read_csv(os.path.join(path_k, filename_k), sep='\t')

    # Every replicate/fraction/TC combination is tried against this organism's
    # directory; combinations without a file are skipped. Because the key uses
    # the reduced replicate label, two raw replicates that share a label also
    # share a key, and the file read last is the one that is kept.
    for r in reps:
        for fr in fracs:
            for tc in TC_TYPES:
                filename_gs = '{}_{}_noMT_{}.csv'.format(r, fr, tc)
                gs_file = os.path.join(path_gs, filename_gs)
                if not os.path.exists(gs_file):
                    continue
                GS[o][red_r[r] + fr + tc] = pd.read_csv(gs_file, index_col=0)

    # kp tables: replicate numbers in the file names are 1-based, and only the
    # mouse file names carry the organism name.
    for i, rr in enumerate(org_red_reps[o]):
        p_path = os.path.join(bseq_dir, 'KP20220713')
        rep_tag = '_kp_rep' + str(i + 1)
        if o == 'm':
            filename = kp_compartment + rep_tag + '_' + species + kp_ot + '.txt'
        elif o == 'h':
            filename = kp_compartment + rep_tag + kp_ot + '.txt'
        P[rr] = pd.read_csv(os.path.join(p_path, filename), sep='\t')
        # Prefix the generic column names with the replicate label so that the
        # tables of different replicates can later sit side by side.
        mapper = {'kp': rr + '.kp', 'type': rr + '.type'}
        P[rr].rename(columns=mapper, inplace=True)

### Preprocess rates: dependent variables in model
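- preprocess chr_release and nucexp: for PUND genes (nuclear degradation: yes) use the `_from_nucdeg` estimate; otherwise use the chr rate (for chr_release) or the `_from_nucres` estimate (for nucexp)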
- clip to domain bounds

In [8]:
T_bf = 100  # NOTE(review): not referenced in the cells visible here; confirm it is still needed

# Build one combined table per organism: Bayes rate fits (B) merged with the
# Bayes factor table (K) and the per-replicate kp tables (P), then derive the
# chr_release / nucdeg / nucexp columns from the PUND flag and clip all rate
# columns. B itself is left untouched because C starts as a deep copy.
C = copy.deepcopy(B)
for o in organisms:
    # Outer merges keep genes that are present in only one of the tables.
    C[o] = C[o].merge(K[o], on='Gene', how='outer', suffixes=('', '_bf'))

    for rr in org_red_reps[o]:
        C[o] = C[o].merge(P[rr], on='Gene', how='outer', suffixes=('', '_'+rr))

    if o == 'h' and kp_ot == '_Mean' and kp_compartment == 'chromatin':
        # artefactually fast rate for Mean rep2; seems only outlier
        C[o] = C[o][~C[o]['Symbol'].isin({'TMPO'})]

    logger.info('%s all genes: %d' % (org_map[o], len(C[o])))

    pund = C[o]['PUND']
    # Mask for the nucdeg fill below: keep the fitted value for PUND genes and
    # for genes whose PUND flag is missing; every other gene gets the fill value.
    # NOTE(review): keeping rows with a missing flag mirrors how pandas treats a
    # missing condition in an in-place `where` (it is filled with True) - confirm
    # that this is the intended handling.
    keep_nucdeg = pund.isna() | pund.eq(True)

    for ts in Timescales:
        for ot in OUT_TYPES:
            for rr in org_red_reps[o]:
                col = rr+'.'+ts+ot
                if ts == rt + 'chr_release':
                    # PUND genes: estimate from the nucdeg model; others: chr rate.
                    C[o][col] = C[o][rr+'.'+ts+'_from_nucdeg'+ot].where(
                        pund, C[o][rr+'.'+rt+'chr'+ot])
                if ts == rt + 'nucdeg':#only for nucdeg genes according to Bayes Factor
                    # Results are assigned back to the frame instead of using
                    # `inplace=True` on a selected column: that chained form is
                    # not guaranteed to reach the parent frame and does nothing
                    # when pandas Copy-on-Write is enabled.
                    # NOTE(review): 24*60 reads as a timescale in minutes (24h),
                    # but with rt == 'k_' this column holds a rate - confirm the
                    # intended fill value for non-PUND genes.
                    C[o][col] = C[o][col].where(keep_nucdeg, (24*60))
                if ts == rt + 'nucexp':
                    # PUND genes: estimate from the nucdeg model; others: nucres model.
                    C[o][col] = C[o][rr+'.'+ts+'_from_nucdeg'+ot].where(
                        pund, C[o][rr+'.'+ts+'_from_nucres'+ot])

                ###Clip range of values beyond realistic timescale values: 0.1min and 24h
                # NOTE(review): the bounds are described as timescales but are
                # applied to the 'k_' rate columns, while k_bound_lo / k_bound_hi
                # (min^-1) defined earlier in this file are not used here -
                # confirm which bounds are intended before relying on the result.
                if col in C[o].columns:
                    # A NaN compares False in both tests, so a missing value is
                    # set to the lower bound by the first line and then kept.
                    C[o][col] = C[o][col].where(C[o][col] > 1e-1, 1e-1)
                    C[o][col] = C[o][col].where(C[o][col] < (24*60), (24*60))


INFO: [2023-04-27 07:35:05] KP - mouse all genes: 17584
INFO: [2023-04-27 07:35:05] KP - human all genes: 18154


### Calculate fraction of degraded transcripts

In [14]:
# Output-type suffix in the form used by the rate column names
# (underscore replaced by a dot, e.g. '_Mean' -> '.Mean').
ot = kp_ot.replace('_', '.')

for o in organisms:
    for rr in org_red_reps[o]:
        k_nucdeg = C[o][f'{rr}.k_nucdeg{ot}']
        k_release = C[o][f'{rr}.k_chr_release{ot}']
        k_export = C[o][f'{rr}.k_nucexp{ot}']

        # frac_ND = k_nucdeg / (k_nucdeg + k_release)
        #           * (k_release + k_nucdeg + k_export) / (k_nucdeg + k_export)
        # evaluated strictly left to right so the floating-point result is the
        # same as for the single-expression form.
        frac_nd = k_nucdeg / (k_nucdeg + k_release)
        frac_nd = frac_nd * (k_release + k_nucdeg + k_export)
        frac_nd = frac_nd / (k_nucdeg + k_export)
        C[o][f'{rr}.frac_ND'] = frac_nd

        # Production rate scaled by the degraded fraction.
        C[o][f'{rr}.kp_ND'] = C[o][f'{rr}.frac_ND'] * C[o][f'{rr}.kp']

## Standard PUNDs criteria

In [19]:
logger.info('kp derived from compartment: %s with %s', kp_compartment, ot)


for o in organisms:
    # Keep only genes that have a PUND call; this replaces C[o] for later cells.
    C[o] = C[o][C[o]['PUND'].notna()]

    kp_cols = [rr + '.kp' for rr in org_red_reps[o]]
    kp_nd_cols = [rr + '.kp_ND' for rr in org_red_reps[o]]
    pund_genes = C[o][C[o]['PUND']]

    # Numerator for both fractions: kp_ND summed over PUND genes and replicates.
    global_kp_nd = pund_genes[kp_nd_cols].sum().sum()

    # Denominator 1: kp summed over all remaining genes and replicates.
    kp_total_all = C[o][kp_cols].sum().sum()
    frac_nd_of_all = global_kp_nd / kp_total_all
    logger.info('%s fraction of transcripts nucdegraded over all produced protein-coding transcripts: %s',
                o, frac_nd_of_all)

    # Denominator 2: kp summed over PUND genes only.
    kp_total_punds = pund_genes[kp_cols].sum().sum()
    frac_nd_of_PUNDs = global_kp_nd / kp_total_punds
    logger.info('%s fraction of transcripts nucdegraded over all produced PUND transcripts: %s',
                o, frac_nd_of_PUNDs)
 

INFO: [2023-04-27 08:42:06] KP - kp derived from compartment: chromatin with .Mean
INFO: [2023-04-27 08:42:06] KP - m fraction of transcripts nucdegraded over all produced protein-coding transcripts: 0.30925441893285815
INFO: [2023-04-27 08:42:06] KP - m fraction of transcripts nucdegraded over all produced PUND transcripts: 0.7438919273573544
INFO: [2023-04-27 08:42:06] KP - h fraction of transcripts nucdegraded over all produced protein-coding transcripts: 0.2312543731105066
INFO: [2023-04-27 08:42:06] KP - h fraction of transcripts nucdegraded over all produced PUND transcripts: 0.6846793665475807
