# Description

This notebook reads 1) the normalized gene expression data and 2) the pathways from the data processed by
the MultiPLIER scripts (https://github.com/greenelab/multi-plier), and saves them in a more Python-friendly
format (pandas DataFrames stored as pickle files).

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pickle

import numpy as np
import pandas as pd

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

import conf

In [3]:
# Handle to R's `readRDS` function, obtained through rpy2; it is used later
# in this notebook to load the .RDS file.
readRDS = ro.r['readRDS']

# Read entire recount data prep file

In [4]:
conf.RECOUNT2['PREPROCESSED_GENE_EXPRESSION_FILE']

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/recount2/recount_data_prep_PLIER.RDS')

In [5]:
# Load the whole preprocessed recount2 RDS file via R's readRDS; the configured
# path is turned into a plain string before it is handed to R.
recount_data_prep = readRDS(
    str(conf.RECOUNT2['PREPROCESSED_GENE_EXPRESSION_FILE'])
)

# Read recount2 gene expression data

In [6]:
# Extract the 'rpkm.cm' element from the R object loaded from the RDS file.
# NOTE(review): the variable is spelled `rpkl` while the R element is `rpkm`;
# the name is kept because the following cells refer to it.
recount2_rpkl_cm = recount_data_prep.rx2('rpkm.cm')

In [7]:
recount2_rpkl_cm

SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,...,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
-0.312500,-0.312931,-0.312931,...,-0.300220,-0.297667,-0.310151
-0.328279,-0.328279,-0.328279,,-0.326339,-0.322127,-0.327438
-0.286319,-0.286859,-0.286859,,-0.286671,-0.286859,-0.286740
-0.536646,-0.536646,-0.536646,,2.458255,2.919662,1.410846
...,...,...,,...,...,...
-0.142179,-0.142384,-0.140314,,-0.139339,-0.142384,-0.142232
-0.349886,-0.349886,-0.349886,,-0.264224,0.020815,-0.324208
-0.085582,-0.085582,-0.085582,,0.603333,0.915213,1.788489
-0.567112,-0.569146,-0.569146,,-0.455832,-0.378093,-0.491513


In [8]:
recount2_rpkl_cm.rownames

0,1,2,3,4,5,6
'GAS6','MMP14','DSP',...,'PLEKHG6','GNGT2','SERPINH1'


In [9]:
recount2_rpkl_cm.colnames

0,1,2,3,4,5,6
'SRP00059...,'SRP00059...,'SRP00059...,...,'SRP03559...,'SRP03559...,'SRP03559...


In [10]:
# Convert the R object to its pandas counterpart. The pandas conversion rules
# are added to rpy2's defaults for the duration of this `with` block.
# Note that the name is rebound from the R object to the converted object.
with localconverter(ro.default_converter + pandas2ri.converter):
    recount2_rpkl_cm = ro.conversion.rpy2py(recount2_rpkl_cm)

In [11]:
# recount2_rpkl_cm = pd.DataFrame(
#     data=pandas2ri.ri2py(recount2_rpkl_cm).values,
#     index=recount2_rpkl_cm.rownames,
#     columns=recount2_rpkl_cm.colnames,
# )

In [12]:
# Sanity check: the converted expression matrix must have exactly these
# dimensions (rows, columns).
assert recount2_rpkl_cm.shape == (6750, 37032)

In [13]:
recount2_rpkl_cm.shape

(6750, 37032)

In [14]:
recount2_rpkl_cm.head()

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,...,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
GAS6,-0.3125,-0.312931,-0.312931,-0.312931,-0.312931,-0.308253,-0.312931,-0.312931,-0.312931,-0.312931,...,-0.301711,-0.305581,-0.303344,-0.2978,-0.307122,-0.285499,-0.309599,-0.30022,-0.297667,-0.310151
MMP14,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.32514,...,-0.314587,-0.322952,-0.326439,-0.325994,-0.326272,-0.322523,-0.326375,-0.326339,-0.322127,-0.327438
DSP,-0.286319,-0.286859,-0.286859,-0.286859,-0.286859,-0.286859,-0.277195,-0.256862,-0.27879,-0.269701,...,-0.286859,-0.286859,-0.286745,-0.286688,-0.286725,-0.286529,-0.286859,-0.286671,-0.286859,-0.28674
MARCKSL1,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,...,0.807663,1.294564,1.527655,1.404788,1.047931,0.892119,1.507099,2.458255,2.919662,1.410846
SPARC,-0.370498,-0.370498,-0.369171,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,...,-0.345409,-0.31075,-0.34812,-0.356938,-0.355206,-0.366197,-0.351174,-0.363703,-0.350825,-0.360762


## Testing

Check that the values loaded here match those obtained when loading the same file in a plain R session.

In [15]:
recount2_rpkl_cm.loc['GAS6', 'SRP000599.SRR013549']

-0.3124999764074366

In [16]:
# Spot-check one GAS6 value, compared after rounding to four decimal places.
assert np.round(recount2_rpkl_cm.loc['GAS6', 'SRP000599.SRR013549'], 4) == -0.3125

In [17]:
# Spot-check GAS6 in another sample, compared after rounding to seven decimals.
assert np.round(recount2_rpkl_cm.loc['GAS6', 'SRP045352.SRR1539229'], 7) == -0.2843801

In [18]:
# Spot-check a different gene/sample pair (CFL2), rounded to seven decimals.
assert np.round(recount2_rpkl_cm.loc['CFL2', 'SRP056840.SRR1951636'], 7) == -0.3412832

In [19]:
recount2_rpkl_cm.iloc[9, 16]

-0.49388522390959116

In [20]:
# Positional spot-check at indices [9, 16], rounded to seven decimals; this one
# does not depend on the row/column labels.
assert np.round(recount2_rpkl_cm.iloc[9, 16], 7) == -0.4938852

## Save

In [21]:
# Absolute path of the output pickle, located inside the recount2 base directory.
output_filename = (
    Path(conf.RECOUNT2['BASE_DIR']) / 'recount_data_prep_PLIER.pkl'
).resolve()

display(output_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/recount2/recount_data_prep_PLIER.pkl')

In [22]:
# Write the expression DataFrame to disk as a pickle file at `output_filename`.
recount2_rpkl_cm.to_pickle(output_filename)

In [23]:
# from utils.hdf5 import simplify_string_for_hdf5

In [24]:
# output_filename = os.path.join(conf.DATA_DIR, 'recount_data_prep_PLIER.h5')
# display(output_filename)

In [25]:
# with pd.HDFStore(output_filename, mode='w', complevel=1) as store:
#     for idx, gene in enumerate(recount2_rpkl_cm.index):
#         if idx % 100:
#             print(f'', flush=True, end='')
        
#         clean_gene = simplify_string_for_hdf5(gene)
#         store[clean_gene] = recount2_rpkl_cm.loc[gene]

In [26]:
# Drop the reference to the (6750 x 37032) expression matrix now that it has
# been saved, so its memory can be reclaimed before the next section.
del recount2_rpkl_cm

# Read recount2 pathways

In [27]:
# Extract the 'all.paths.cm' element from the R object loaded from the RDS file.
recount2_all_paths_cm = recount_data_prep.rx2('all.paths.cm')

In [28]:
recount2_all_paths_cm

0,1,2,3,4,5,6
0.0,0.0,0.0,...,0.0,0.0,0.0


In [29]:
recount2_all_paths_cm.rownames

0,1,2,3,4,5,6
'GAS6','MMP14','DSP',...,'PLEKHG6','GNGT2','SERPINH1'


In [30]:
recount2_all_paths_cm.colnames

0,1,2,3,4,5,6
'IRIS_Bce...,'IRIS_Bce...,'IRIS_Bce...,...,'PID_BCR_...,'PID_TELO...,'PID_PI3K...


In [31]:
# Convert the R object to a Python object using rpy2's default rules plus the
# pandas rules. The result goes into a separate name so the original R object
# (and its rownames/colnames) remains available.
with localconverter(ro.default_converter + pandas2ri.converter):
    recount2_all_paths_cm_values = ro.conversion.rpy2py(recount2_all_paths_cm)

In [32]:
recount2_all_paths_cm_values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [33]:
# The converted pathway matrix is a plain NumPy array without labels, so the
# row/column names are taken from the R object's rownames/colnames.
#
# Casting to bool maps ANY non-zero value (including NaN) to True, so first
# verify that the matrix really contains only 0/1 indicators; otherwise the
# cast would silently hide unexpected values.
assert np.isin(recount2_all_paths_cm_values, (0, 1)).all(), \
    'all.paths.cm is expected to contain only 0/1 values'

# NOTE: this rebinds `recount2_all_paths_cm` from the R object to a pandas
# DataFrame; re-running this cell on its own requires re-running the cell that
# extracts 'all.paths.cm' first, because a DataFrame has no rownames/colnames
# attributes.
recount2_all_paths_cm = pd.DataFrame(
    data=recount2_all_paths_cm_values,
    index=recount2_all_paths_cm.rownames,
    columns=recount2_all_paths_cm.colnames,
    dtype=bool,
)

In [34]:
# Sanity check: the pathway DataFrame must have exactly these dimensions
# (rows, columns).
assert recount2_all_paths_cm.shape == (6750, 628)

In [35]:
recount2_all_paths_cm.shape

(6750, 628)

In [36]:
recount2_all_paths_cm.dtypes.unique()

array([dtype('bool')], dtype=object)

In [37]:
recount2_all_paths_cm.head()

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour,IRIS_CD4Tcell-Th1-restimulated48hour,IRIS_CD4Tcell-Th2-restimulated12hour,IRIS_CD4Tcell-Th2-restimulated48hour,IRIS_CD8Tcell-N0,IRIS_DendriticCell-Control,...,KEGG_GNRH_SIGNALING_PATHWAY,KEGG_BASAL_TRANSCRIPTION_FACTORS,REACTOME_SYNTHESIS_OF_DNA,KEGG_HEMATOPOIETIC_CELL_LINEAGE,KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY,PID_IL4_2PATHWAY,REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,PID_BCR_5PATHWAY,PID_TELOMERASEPATHWAY,PID_PI3KPLCTRKPATHWAY
GAS6,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
MMP14,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
DSP,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
MARCKSL1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
SPARC,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Testing

In [38]:
recount2_all_paths_cm.loc['CTSD', 'REACTOME_SCFSKP2_MEDIATED_DEGRADATION_OF_P27_P21']

False

In [39]:
# Negative spot-check: the CTSD entry for this pathway is expected to be False.
assert not recount2_all_paths_cm.loc['CTSD', 'REACTOME_SCFSKP2_MEDIATED_DEGRADATION_OF_P27_P21']

In [40]:
# Positive spot-check: the CTSD entry for this pathway is expected to be True.
assert recount2_all_paths_cm.loc['CTSD', 'PID_P53DOWNSTREAMPATHWAY']

In [41]:
# Positive spot-check: the MMP14 entry for this pathway is expected to be True.
assert recount2_all_paths_cm.loc['MMP14', 'PID_HIF2PATHWAY']

## Save

In [42]:
# Absolute path of the pathways pickle, located inside the recount2 base directory.
output_filename = (
    Path(conf.RECOUNT2['BASE_DIR']) / 'recount_all_paths_cm.pkl'
).resolve()

display(output_filename)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/recount2/recount_all_paths_cm.pkl')

In [43]:
# Write the boolean pathway DataFrame to disk as a pickle file at `output_filename`.
recount2_all_paths_cm.to_pickle(output_filename)

In [44]:
# Drop the reference to the pathway DataFrame now that it has been saved.
del recount2_all_paths_cm