## Write ranked timescale to file for GSEA
Author: Robert Ietswaart  
Date: 20220613  
License: BSD2.  
Load modules j3dl and activate virtual environment using j4RNAdecay on O2.  
Python v3.7.4

Source: `GSEA_20220121.ipynb` and `Function_20220301.ipynb`  
For Subcellular Timelapse seq project. 

In [2]:
import os
import re
import copy
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import logging
import argparse
from __init__ import default_logger_format, default_date_format

In [3]:
# def main():
np.random.seed(12345)

parser = argparse.ArgumentParser(
    description='Write ranked timescale to file for GSEA.')

args = parser.parse_args("")#EDIT: added "" as argument to run in ipynb instead of .py script

In [9]:
path = os.path.join('/n','groups','churchman','ri23','bseq','GSEA20220613')

# Add a logger specific to the project and processing stage
logger = logging.getLogger('GSEA')
log_file = os.path.join(path,'LogErr', 'GSEA_preprocess_20220613.log')
formatter = logging.Formatter(default_logger_format,
                              datefmt=default_date_format)
log_handler = logging.FileHandler(log_file)
log_handler.setFormatter(formatter)
logger.addHandler(log_handler)

organisms = ['m','h']
org_map = {'m': 'mouse', 'h': 'human'}
org_id_map = {'m': 10090, 'h': 9606}
fracs = ['chr', 'nuc', 'cyto', 'poly', 'tot']
reps = ['G','H','R','S','T','U']
red_reps = ['G_R', 'H_S', 'T', 'U']
# org_reps = {'m': ['G','H','R','S'], 'h': ['T', 'U']}
org_red_reps = {'m': ['G_R','H_S'], 'h': ['T', 'U']}
red_r = {'G': 'G_R', 'H': 'H_S', 'R': 'G_R', 'S': 'H_S', 'T': 'T', 'U': 'U'}
time_id = [str(i) for i in range(1,6)]
background_id = {r: '1' for r in reps}
time_mins = [0, 15, 30, 60, 120]

fracs_model = {'chr': ['chr_fit'],  
               'nuc': ['nuc_fit', 'nuc_fit_from_chr', 'nuc_fit_from_nucdeg'],
               'cyto': ['cyto_fit_from_nucres', 'cyto_fit_from_chr', 'cyto_fit_from_nucdeg'],
               'poly': ['poly_fit_from_nucres'],
               'tot': ['tot_fit', 'tot_pred_from_nucres', 'tot_pred_from_chr', 
                       'tot_fit_from_nucdeg', 'tot_pred_from_nucdeg']}

RATE_TYPE = ['half_life_','k_','T_']
rt = RATE_TYPE[0]
Timescales = ['chr',
              'nuc',
              'nucexp_from_chr',
              'nucdeg',
              'cyto',
              'poly_entry',
              'whole_cell',
              'nucexp_from_nucdeg']
Timescales = [rt + ts for ts in Timescales]
k_bound_lo = 1e-4 #unit: min^-1: 1 per 7 days TEST2_half_life np.log(2)/1e-4 vs 1e-4
k_bound_hi = 1e4 #unit: min^-1: 1 per 6 ms,        TEST2_half_life
T_bound_lo = np.log(2)/k_bound_hi #unit: min, ~6 ms
T_bound_hi = np.log(2)/k_bound_lo #unit: min, ~7 days

OUT_TYPES = ['.Mean', '.MAP', '.0.975.quantile', '.0.025.quantile']


GS = dict()         #GRAND-SLAM
B = dict()          #Bayes fits file
K = dict()          #Bayes Factor
for o in organisms:  
    GS[o] = dict()   
    if o == 'm':
        path_gs = os.path.join('/n','groups','churchman','ri23','bseq','GS20210506')
        path_b = os.path.join('/n','groups','churchman','ri23','bseq','Bayes20220222')
        filename_b = 'Bayes20220228.tsv'   
    elif o == 'h':
        path_gs = os.path.join('/n','groups','churchman','ri23','bseq','GS20210713_human')
        path_b = os.path.join('/n','groups','churchman','ri23','bseq','Bayes20220228_human')
        filename_b = 'Bayes20220228_human.tsv'
    path_k = os.path.join('/n','groups','churchman','ri23','bseq','BayesFactor20220307')
    filename_k = 'Bayes_factor_20220315_' + org_map[o] + '.tsv'

    B[o] = pd.read_csv(os.path.join(path_b, filename_b), sep='\t')
    K[o] = pd.read_csv(os.path.join(path_k, filename_k), sep='\t')
    
    for r in reps:
        for fr in fracs:
            for tc in TC_TYPES:
                filename_gs = r + '_' + fr + '_noMT_' + tc + '.csv'
                if os.path.exists(os.path.join(path_gs, filename_gs)):
                    GS[o][red_r[r]+fr+tc]= pd.read_csv(os.path.join(path_gs, filename_gs) ,index_col=0)

In [10]:
T_bf = 100

C = copy.deepcopy(B)
for o in organisms:
    C[o] = C[o].merge(K[o], on='Gene', how='outer', suffixes=('', '_bf'))
    logger.info('%s all genes: %d' % (org_map[o], len(C[o])))
    
    for ot in OUT_TYPES[:2]:
        for rr in org_red_reps[o]:
            ts = rt+'nucexp_from_chr'
            C[o][rr+'.'+ts+ot].where(((C[o][rr+'.bayes_factor'] <= T_bf) | (C[o][rr+'.bayes_factor'].isna())), 
                                     C[o][rr+'.'+rt+'nucexp_from_nucdeg'+ot], inplace=True)                        
            ts = rt+'nucdeg'
            C[o][rr+'.'+ts+ot].where(C[o][rr+'.bayes_factor'] > T_bf, np.nan, inplace=True)
 

            for ts in Timescales:
                
                ###Clip range of values beyond numerical integration domain bounds                  
                C[o][rr+'.'+ts+ot].where(C[o][rr+'.'+ts+ot] > T_bound_lo, T_bound_lo, inplace=True) 
                C[o][rr+'.'+ts+ot].where(C[o][rr+'.'+ts+ot] < T_bound_hi, T_bound_hi, inplace=True)  

INFO: [2022-06-13 13:17:57] GSEA - mouse all genes: 17584
INFO: [2022-06-13 13:17:57] GSEA - human all genes: 18155


## Output ranked half life Z-scores

Z scores = log(T1/2)-mean(log(T1/2))/std(log(T1/2)).

In [12]:
gsea = dict()

for o in organisms:
    logger.info('%s all genes: %d' % (org_map[o], len(C[o])))
    for ot in OUT_TYPES[:2]:
        for rr in org_red_reps[o]:
            for ts in Timescales[:(len(Timescales)-1)]:

                cols = ['Gene', 'Symbol']
                cols.extend([rr+'.'+ts+ot for rr in org_red_reps[o]])
                gsea[ts] = copy.deepcopy(C[o][cols])
                gsea[ts] = gsea[ts][~gsea[ts][cols[2:]].isna().all(axis=1)]#filter out NA rows
                logger.info('%s genes without NA half lives %s: %d' % (org_map[o], ts, len(gsea[ts])))
        
                gsea[ts]['timescale'] = gsea[ts][cols[2:]].mean(axis=1, skipna=True)          
                #filter out bounds: cases where both replicates could not estimate timescale
                gsea[ts] = gsea[ts][~gsea[ts]['timescale'].isin([T_bound_lo,T_bound_hi])]
                logger.info('%s genes without timescales equal to domain bounds in %s: %d' % 
                        (org_map[o], ts, len(gsea[ts])))        
                #drop columns that are not used in output
                gsea[ts].drop(['Gene',cols[2],cols[3]], axis=1, inplace=True)
                gsea[ts].sort_values(by='timescale',ascending=False, inplace=True)       
  
                #standardize rates (to Z-score) to enhance learning
                gsea[ts]['timescale'] = np.log(gsea[ts]['timescale'])
                mu = gsea[ts]['timescale'].mean()
                std = gsea[ts]['timescale'].std()
                gsea[ts]['timescale'] = (gsea[ts]['timescale'] - mu) / std
                filename = 'GSEA_'+o+'_all_'+ts+ot+'_z.rnk'
                logger.info(filename)
                gsea[ts].to_csv(os.path.join(path,filename), header=False, index=False, sep='\t')  

INFO: [2022-06-13 13:19:50] GSEA - mouse all genes: 17584
INFO: [2022-06-13 13:19:50] GSEA - mouse genes without NA half lives half_life_chr: 17584
INFO: [2022-06-13 13:19:50] GSEA - mouse genes without timescales equal to domain bounds in half_life_chr: 12643
INFO: [2022-06-13 13:19:50] GSEA - GSEA_m_all_half_life_chr.Mean_z.rnk
INFO: [2022-06-13 13:19:50] GSEA - mouse genes without NA half lives half_life_nuc: 17584
INFO: [2022-06-13 13:19:50] GSEA - mouse genes without timescales equal to domain bounds in half_life_nuc: 12175
INFO: [2022-06-13 13:19:50] GSEA - GSEA_m_all_half_life_nuc.Mean_z.rnk
INFO: [2022-06-13 13:19:50] GSEA - mouse genes without NA half lives half_life_nucexp_from_chr: 17584
INFO: [2022-06-13 13:19:50] GSEA - mouse genes without timescales equal to domain bounds in half_life_nucexp_from_chr: 12036
INFO: [2022-06-13 13:19:50] GSEA - GSEA_m_all_half_life_nucexp_from_chr.Mean_z.rnk
INFO: [2022-06-13 13:19:50] GSEA - mouse genes without NA half lives half_life_nucde

INFO: [2022-06-13 13:19:52] GSEA - human genes without timescales equal to domain bounds in half_life_chr: 12757
INFO: [2022-06-13 13:19:52] GSEA - GSEA_h_all_half_life_chr.Mean_z.rnk
INFO: [2022-06-13 13:19:52] GSEA - human genes without NA half lives half_life_nuc: 18155
INFO: [2022-06-13 13:19:52] GSEA - human genes without timescales equal to domain bounds in half_life_nuc: 13323
INFO: [2022-06-13 13:19:52] GSEA - GSEA_h_all_half_life_nuc.Mean_z.rnk
INFO: [2022-06-13 13:19:52] GSEA - human genes without NA half lives half_life_nucexp_from_chr: 18155
INFO: [2022-06-13 13:19:52] GSEA - human genes without timescales equal to domain bounds in half_life_nucexp_from_chr: 12601
INFO: [2022-06-13 13:19:52] GSEA - GSEA_h_all_half_life_nucexp_from_chr.Mean_z.rnk
INFO: [2022-06-13 13:19:52] GSEA - human genes without NA half lives half_life_nucdeg: 18155
INFO: [2022-06-13 13:19:52] GSEA - human genes without timescales equal to domain bounds in half_life_nucdeg: 744
INFO: [2022-06-13 13:19:5

# OLD

In [17]:
!pip freeze

asteval==0.9.23
attrs==19.3.0
backcall==0.1.0
bleach==3.1.4
certifi==2021.5.30
charset-normalizer==2.0.1
cycler==0.10.0
decorator==4.4.2
defusedxml==0.6.0
docopt==0.6.2
entrypoints==0.3
future==0.18.2
goatools==1.1.6
gtfparse==1.2.1
idna==3.2
importlib-metadata==1.5.2
ipykernel==5.2.0
ipython==7.13.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.16.0
Jinja2==2.11.1
joblib==1.1.0
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.2
jupyter-console==6.1.0
jupyter-core==4.6.3
kiwisolver==1.1.0
llvmlite==0.36.0
lmfit==1.0.2
MarkupSafe==1.1.1
matplotlib==3.2.1
mistune==0.8.4
mpmath==1.2.1
nbconvert==5.6.1
nbformat==5.0.4
networkx==2.4
notebook==6.0.3
numba==0.53.1
numba-scipy==0.3.0
numpy==1.16.5
pandas==1.0.3
pandocfilters==1.4.2
parso==0.6.2
patsy==0.5.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==7.0.0
prometheus-client==0.7.1
prompt-toolkit==3.0.4
ptyprocess==0.6.0
pydot==1.4.2
Pygments==2.6.1
pyparsing==2.4.6
pyrsistent==0.16.0