# Description

This notebook processes the LINCS consensus signatures data from [here](https://figshare.com/articles/dataset/Consensus_signatures_for_LINCS_L1000_perturbations/3085426/1).

# Modules loading

In [1]:
# Reload imported modules automatically before running each cell, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

import conf
from entity import Gene

In [3]:
# Handle to R's `readRDS` function, looked up through rpy2.
readRDS = ro.r["readRDS"]

In [4]:
# Handle to R's `saveRDS` function, looked up through rpy2.
saveRDS = ro.r["saveRDS"]

# Settings

In [5]:
# Directory where every output of this notebook is written; it is created
# (including missing parents) if it does not exist yet.
OUTPUT_DATA_DIR = Path(conf.RESULTS["DRUG_DISEASE_ANALYSES"]) / "lincs"
display(OUTPUT_DATA_DIR)
OUTPUT_DATA_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs')

# Load LINCS consensus signatures

In [6]:
# Path of the LINCS consensus signatures file, taken from the project
# configuration.
input_file = conf.LINCS["CONSENSUS_SIGNATURES_FILE"]

display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/hetionet/lincs-v2.0/consensi-drugbank.tsv.bz2')

In [7]:
# The file is indexed by perturbagen; transpose it so that rows are genes and
# columns are perturbagens (DrugBank IDs).
lincs_data = pd.read_csv(
    input_file,
    sep="\t",
    index_col="perturbagen",
).transpose()

In [8]:
lincs_data.shape

(7467, 1170)

In [9]:
lincs_data.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
100,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
1000,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
10000,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
10001,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
10005,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


In [10]:
# every gene identifier (row label) must appear only once
assert not lincs_data.index.has_duplicates

In [11]:
# check that DrugBank ids are consistent: all column labels must have the
# same number of characters
_tmp = {len(drug_id) for drug_id in lincs_data.columns}
assert len(_tmp) == 1

In [12]:
# every perturbagen (column label) must appear only once
assert not lincs_data.columns.has_duplicates

## Entrez gene IDs to Ensembl gene IDs

In [13]:
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

# Activate rpy2's pandas conversion for the whole session (not only inside a
# `localconverter` block). NOTE(review): presumably this is what lets the
# cells below index the result of the R call like a pandas data frame.
pandas2ri.activate()

In [14]:
# Load the R package clusterProfiler through rpy2; it provides `bitr`, used
# below to translate gene identifiers.
clusterProfiler = importr("clusterProfiler")

In [15]:
# Translate the Entrez gene IDs found in the index into Ensembl gene IDs with
# clusterProfiler's `bitr`, using the org.Hs.eg.db annotation package.
# The result has one row per (ENTREZID, ENSEMBL) pair, so an Entrez ID can
# appear in several rows.
_now_mapped_genes = clusterProfiler.bitr(
    lincs_data.index.tolist(),
    fromType="ENTREZID",
    toType="ENSEMBL",
    OrgDb="org.Hs.eg.db",
)

R[write to console]: Loading required package: org.Hs.eg.db

R[write to console]: Loading required package: AnnotationDbi

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, m

In [16]:
_now_mapped_genes.shape

(8259, 2)

In [17]:
# some Entrez IDs map to several Ensembl IDs; show every row involved
_multi_mapped = _now_mapped_genes["ENTREZID"].duplicated(keep=False)
display(_now_mapped_genes.loc[_multi_mapped])

Unnamed: 0,ENTREZID,ENSEMBL
3,10000,ENSG00000117020
4,10000,ENSG00000275199
12,100129361,ENSG00000256537
13,100129361,ENSG00000262986
14,100129361,ENSG00000281930
...,...,...
8202,9859,ENSG00000276725
8243,9920,ENSG00000176595
8244,9920,ENSG00000273645
8288,9994,ENSG00000118412


In [18]:
# Flag, in a new `in_phenomexcan` column, the rows whose Ensembl ID is present
# in Gene.GENE_ID_TO_NAME_MAP.
_in_phenomexcan = _now_mapped_genes["ENSEMBL"].map(
    lambda ensembl_id: ensembl_id in Gene.GENE_ID_TO_NAME_MAP
)
_now_mapped_genes = _now_mapped_genes.assign(in_phenomexcan=_in_phenomexcan)

In [19]:
_now_mapped_genes[_now_mapped_genes["in_phenomexcan"]].shape

(7125, 3)

In [20]:
_now_mapped_genes.head()

Unnamed: 0,ENTREZID,ENSEMBL,in_phenomexcan
1,100,ENSG00000196839,True
2,1000,ENSG00000170558,True
3,10000,ENSG00000117020,True
4,10000,ENSG00000275199,False
5,10001,ENSG00000133997,True


In [21]:
# Keep only the rows flagged as `in_phenomexcan`; when an Entrez ID still has
# several Ensembl IDs left, keep the first one.
_is_known = _now_mapped_genes["in_phenomexcan"]
_now_mapped_genes = _now_mapped_genes.loc[_is_known].drop_duplicates(
    subset="ENTREZID", keep="first"
)

In [22]:
_now_mapped_genes.shape

(7120, 3)

In [23]:
_now_mapped_genes.head()

Unnamed: 0,ENTREZID,ENSEMBL,in_phenomexcan
1,100,ENSG00000196839,True
2,1000,ENSG00000170558,True
3,10000,ENSG00000117020,True
5,10001,ENSG00000133997,True
6,10005,ENSG00000101473,True


In [24]:
# Entrez ID -> Ensembl ID lookup table
_now_mapped_genes_dict = (
    _now_mapped_genes.set_index("ENTREZID")["ENSEMBL"].to_dict()
)

In [25]:
# Keep only the genes that could be mapped and relabel their rows with the
# Ensembl ID. This cell is not idempotent: once it has run, the Entrez IDs are
# gone from the index, so running it a second time fails.
_mapped_entrez_ids = list(_now_mapped_genes_dict)
lincs_data = lincs_data.loc[_mapped_entrez_ids].rename(index=_now_mapped_genes_dict)

In [26]:
lincs_data.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
ENSG00000196839,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
ENSG00000170558,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
ENSG00000117020,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
ENSG00000133997,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
ENSG00000101473,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


In [27]:
# make sure we have only Ensembl gene IDs in the index: first, all labels must
# have the same length
_tmp = pd.Series(lincs_data.index.map(len)).value_counts()
display(_tmp)
assert _tmp.shape[0] == 1

# Equal lengths alone do not prove that the labels are Ensembl gene IDs, so
# also check their format ("ENSG" followed by 11 digits).
assert lincs_data.index.str.match(r"^ENSG\d{11}$").all(), "non-Ensembl ID in index"

# Renaming could make two different Entrez IDs collapse onto the same Ensembl
# ID; the R data frame export below needs unique row labels.
assert lincs_data.index.is_unique, "duplicated Ensembl IDs in index"

15    7120
dtype: int64

In [28]:
lincs_data.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
ENSG00000196839,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
ENSG00000170558,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
ENSG00000117020,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
ENSG00000133997,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
ENSG00000101473,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


## Check for NaN values

In [29]:
# there must be no missing values anywhere in the matrix
assert lincs_data.notna().all().all()

## Save

In [30]:
# Absolute path of the pickle output; the paths of the other output formats
# below are derived from it.
output_file = (OUTPUT_DATA_DIR / "lincs-data.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-data.pkl')

In [31]:
# Save the gene-level data (Ensembl IDs x DrugBank IDs) as a pandas pickle.
lincs_data.to_pickle(output_file)

### RDS format

In [32]:
# Same location and base name as the pickle file, with an `.rds` extension.
output_rds_file = output_file.with_suffix(".rds")
display(output_rds_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-data.rds')

In [33]:
# Convert the pandas data frame into an R object, explicitly using rpy2's
# pandas converter on top of the default one.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(lincs_data)

In [34]:
data_r

DB00014,DB00091,DB00121,...,DB09020,DB09022,DB09023
...,...,...,...,...,...,...


In [35]:
# Write the R object with R's `saveRDS`; the path is passed as a plain string.
saveRDS(data_r, str(output_rds_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f386244ca80> [RTYPES.NILSXP]

In [36]:
# testing: read the RDS file back into an R object (round-trip check)
data_r = readRDS(str(output_rds_file))

In [37]:
# Convert the R object read from disk back into a pandas data frame.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

In [38]:
data_again.shape

(7120, 1170)

In [39]:
data_again.head()

Unnamed: 0,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
ENSG00000196839,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
ENSG00000170558,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
ENSG00000117020,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
ENSG00000133997,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
ENSG00000101473,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


In [40]:
# The RDS round trip must reproduce the data exactly; only the `names`
# attributes of the index and columns are not compared.
pd.testing.assert_frame_equal(
    lincs_data,
    data_again,
    check_names=False,
    check_exact=True,
)

### Text format

In [41]:
# tsv format: same location and base name as the pickle file, with the
# extension replaced by `.tsv.gz`
output_text_file = output_file.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-data.tsv.gz')

In [42]:
# Tab-separated text with the index included. Floats are written in scientific
# notation with 5 decimals ("%.5e"), so this format is lossy.
lincs_data.to_csv(output_text_file, sep="\t", index=True, float_format="%.5e")

In [43]:
# testing: read the text file back, using its first column as the index
data_again = pd.read_csv(output_text_file, sep="\t", index_col=0)

In [44]:
data_again.shape

(7120, 1170)

In [45]:
data_again.head()

Unnamed: 0,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
ENSG00000196839,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
ENSG00000170558,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
ENSG00000117020,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
ENSG00000133997,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
ENSG00000101473,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


In [46]:
# The text file was written with a limited number of digits, so values are
# compared with an absolute tolerance (no relative tolerance) instead of
# exactly.
pd.testing.assert_frame_equal(
    lincs_data,
    data_again,
    check_names=False,
    check_exact=False,
    rtol=0.0,
    atol=5e-5,
)

# Project into MultiPLIER

In [47]:
from multiplier import MultiplierProjection

In [48]:
# Projection object from the project's `multiplier` module, created with its
# default arguments.
mproj = MultiplierProjection()

In [49]:
# Project the gene-level data (Ensembl IDs x DrugBank IDs) with MultiPLIER.
# NOTE(review): judging by the output below, the result has one row per latent
# variable (LV1, LV2, ...) and keeps the DrugBank IDs as columns.
lincs_projection = mproj.transform(lincs_data)

In [50]:
lincs_projection.shape

(987, 1170)

In [51]:
lincs_projection.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
LV1,0.036115,-0.35861,0.091067,0.008923,0.046469,0.136747,0.036151,0.034024,-0.027445,0.060644,...,0.06993,0.021537,0.020511,-0.00625,-0.00266,-0.01721,-0.004029,-0.14549,0.102504,0.057621
LV2,0.012281,0.009738,0.00465,-0.006554,0.021758,0.010425,0.000794,-0.000603,0.011534,-0.004425,...,0.000344,0.001123,-0.012675,-0.004849,0.000375,0.002623,-0.003252,-0.066209,0.025378,0.012453
LV3,-0.005533,-0.059174,0.013454,0.008906,-0.004939,0.033663,-0.010045,0.003824,0.009814,-0.007492,...,-0.021941,-0.002417,0.000155,0.002941,-0.0151,-0.016372,0.012249,0.009216,0.018254,-0.003516
LV4,-0.004151,-0.032884,0.005934,-0.003991,-0.028524,0.002709,0.007001,0.024704,0.005058,-0.004345,...,0.013806,0.003197,0.013099,0.002643,-0.008836,0.023852,0.028957,0.014681,0.000826,-0.009739
LV5,-0.015156,-0.005276,0.025747,0.000346,-0.013932,-0.010911,0.041403,-0.018285,0.003507,-0.007621,...,-0.011577,-0.015956,-0.027614,-0.001707,-0.000542,-0.016918,0.001959,0.024147,0.013004,-0.014166


In [52]:
# the projection must not contain missing values
assert lincs_projection.notna().all().all()

## Save

In [53]:
# Absolute path of the pickle output for the projected data; the paths of the
# other output formats below are derived from it.
output_file = (OUTPUT_DATA_DIR / "lincs-projection.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-projection.pkl')

In [54]:
# Save the projected data as a pandas pickle.
lincs_projection.to_pickle(output_file)

### RDS format

In [55]:
# Same location and base name as the pickle file, with an `.rds` extension.
output_rds_file = output_file.with_suffix(".rds")
display(output_rds_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-projection.rds')

In [56]:
# Convert the projected pandas data frame into an R object, explicitly using
# rpy2's pandas converter on top of the default one.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(lincs_projection)

In [57]:
data_r

DB00014,DB00091,DB00121,...,DB09020,DB09022,DB09023
...,...,...,...,...,...,...


In [58]:
# Write the R object with R's `saveRDS`; the path is passed as a plain string.
saveRDS(data_r, str(output_rds_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f386244ca80> [RTYPES.NILSXP]

In [59]:
# testing: read the RDS file back into an R object (round-trip check)
data_r = readRDS(str(output_rds_file))

In [60]:
# Convert the R object read from disk back into a pandas data frame.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

In [61]:
data_again.shape

(987, 1170)

In [62]:
data_again.head()

Unnamed: 0,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
LV1,0.036115,-0.35861,0.091067,0.008923,0.046469,0.136747,0.036151,0.034024,-0.027445,0.060644,...,0.06993,0.021537,0.020511,-0.00625,-0.00266,-0.01721,-0.004029,-0.14549,0.102504,0.057621
LV2,0.012281,0.009738,0.00465,-0.006554,0.021758,0.010425,0.000794,-0.000603,0.011534,-0.004425,...,0.000344,0.001123,-0.012675,-0.004849,0.000375,0.002623,-0.003252,-0.066209,0.025378,0.012453
LV3,-0.005533,-0.059174,0.013454,0.008906,-0.004939,0.033663,-0.010045,0.003824,0.009814,-0.007492,...,-0.021941,-0.002417,0.000155,0.002941,-0.0151,-0.016372,0.012249,0.009216,0.018254,-0.003516
LV4,-0.004151,-0.032884,0.005934,-0.003991,-0.028524,0.002709,0.007001,0.024704,0.005058,-0.004345,...,0.013806,0.003197,0.013099,0.002643,-0.008836,0.023852,0.028957,0.014681,0.000826,-0.009739
LV5,-0.015156,-0.005276,0.025747,0.000346,-0.013932,-0.010911,0.041403,-0.018285,0.003507,-0.007621,...,-0.011577,-0.015956,-0.027614,-0.001707,-0.000542,-0.016918,0.001959,0.024147,0.013004,-0.014166


In [63]:
# The RDS round trip must reproduce the projection exactly; only the `names`
# attributes of the index and columns are not compared.
pd.testing.assert_frame_equal(
    lincs_projection,
    data_again,
    check_names=False,
    check_exact=True,
)

### Text format

In [64]:
# tsv format: same location and base name as the pickle file, with the
# extension replaced by `.tsv.gz`
output_text_file = output_file.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-projection.tsv.gz')

In [65]:
# Tab-separated text with the index included. Floats are written in scientific
# notation with 5 decimals ("%.5e"), so this format is lossy.
lincs_projection.to_csv(output_text_file, sep="\t", index=True, float_format="%.5e")

In [66]:
# testing: read the text file back, using its first column as the index
data_again = pd.read_csv(output_text_file, sep="\t", index_col=0)

In [67]:
data_again.shape

(987, 1170)

In [68]:
data_again.head()

Unnamed: 0,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
LV1,0.036115,-0.35861,0.091067,0.008923,0.046469,0.136747,0.036151,0.034024,-0.027445,0.060644,...,0.06993,0.021537,0.020511,-0.00625,-0.00266,-0.01721,-0.004029,-0.14549,0.102504,0.057621
LV2,0.012281,0.009738,0.00465,-0.006554,0.021758,0.010425,0.000794,-0.000603,0.011534,-0.004425,...,0.000344,0.001123,-0.012675,-0.004849,0.000375,0.002623,-0.003252,-0.066209,0.025378,0.012453
LV3,-0.005533,-0.059174,0.013454,0.008906,-0.004939,0.033663,-0.010045,0.003824,0.009814,-0.007492,...,-0.021941,-0.002417,0.000155,0.002941,-0.0151,-0.016372,0.012249,0.009216,0.018254,-0.003516
LV4,-0.004151,-0.032884,0.005934,-0.003991,-0.028524,0.002709,0.007001,0.024704,0.005058,-0.004345,...,0.013806,0.003197,0.013099,0.002643,-0.008836,0.023852,0.028957,0.014681,0.000826,-0.009739
LV5,-0.015156,-0.005276,0.025747,0.000346,-0.013932,-0.010911,0.041403,-0.018285,0.003507,-0.007621,...,-0.011577,-0.015956,-0.027614,-0.001707,-0.000542,-0.016918,0.001959,0.024147,0.013004,-0.014166


In [69]:
# The text file was written with a limited number of digits, so values are
# compared with an absolute tolerance (no relative tolerance) instead of
# exactly.
pd.testing.assert_frame_equal(
    lincs_projection,
    data_again,
    check_names=False,
    check_exact=False,
    rtol=0.0,
    atol=5e-5,
)