# Description

This notebook processes the LINCS consensus signatures data from [here](https://figshare.com/articles/dataset/Consensus_signatures_for_LINCS_L1000_perturbations/3085426/1).

# Modules loading

In [1]:
# automatically reload imported modules when their source code changes
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import pandas as pd

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

import conf
from entity import Gene

In [2]:
# handle to R's `readRDS` function (used below to reload the saved .rds files)
readRDS = ro.r["readRDS"]

In [3]:
# handle to R's `saveRDS` function (used below to write the data as .rds files)
saveRDS = ro.r["saveRDS"]

# Settings

In [3]:
# directory where all the files generated by this notebook are saved
OUTPUT_DATA_DIR = Path(conf.RESULTS["DRUG_DISEASE_ANALYSES"]) / "lincs"
display(OUTPUT_DATA_DIR)
OUTPUT_DATA_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs')

# Load LINCS consensus signatures

In [4]:
input_file = conf.LINCS["CONSENSUS_SIGNATURES_FILE"]

display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/hetionet/lincs-v2.0/consensi-drugbank.tsv.bz2')

In [5]:
# the file is indexed by perturbagen; transpose it so that genes are in rows
# and perturbagens (DrugBank IDs) are in columns
lincs_data = pd.read_csv(input_file, sep="\t", index_col="perturbagen").T

In [6]:
lincs_data.shape

(7467, 1170)

In [7]:
lincs_data.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
100,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
1000,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
10000,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
10001,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
10005,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


In [8]:
assert lincs_data.index.is_unique

In [9]:
# DrugBank IDs are expected to be consistent: all column names must have
# the same number of characters
_drugbank_id_lengths = lincs_data.columns.map(len).unique()
assert len(_drugbank_id_lengths) == 1

In [10]:
assert lincs_data.columns.is_unique

## Gene IDs to Gene names

In [11]:
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

# Enable rpy2's global pandas <-> R conversion.
# NOTE(review): the cells below treat the result of `clusterProfiler.bitr` as a
# pandas DataFrame, which presumably relies on this global activation — confirm
# before replacing it with a `localconverter` block.
pandas2ri.activate()

In [12]:
clusterProfiler = importr("clusterProfiler")

In [13]:
# translate the Entrez gene IDs in the index of `lincs_data` into Ensembl gene
# IDs with clusterProfiler's `bitr`, using the org.Hs.eg.db annotation database
_now_mapped_genes = clusterProfiler.bitr(
    lincs_data.index.tolist(),
    fromType="ENTREZID",
    toType="ENSEMBL",
    OrgDb="org.Hs.eg.db",
)

R[write to console]: Loading required package: org.Hs.eg.db

R[write to console]: Loading required package: AnnotationDbi

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, m

In [14]:
_now_mapped_genes.shape

(8259, 2)

In [15]:
# some Entrez IDs map to several Ensembl IDs: show every row involved
# (keep=False flags all the occurrences, not only the repeated ones)
_is_multi_mapped = _now_mapped_genes["ENTREZID"].duplicated(keep=False)
display(_now_mapped_genes[_is_multi_mapped])

Unnamed: 0,ENTREZID,ENSEMBL
3,10000,ENSG00000117020
4,10000,ENSG00000275199
12,100129361,ENSG00000256537
13,100129361,ENSG00000262986
14,100129361,ENSG00000281930
...,...,...
8202,9859,ENSG00000276725
8243,9920,ENSG00000176595
8244,9920,ENSG00000273645
8288,9994,ENSG00000118412


In [16]:
# flag the Ensembl IDs that are present in Gene.GENE_ID_TO_NAME_MAP
_now_mapped_genes = _now_mapped_genes.assign(
    in_phenomexcan=[
        ensembl_id in Gene.GENE_ID_TO_NAME_MAP
        for ensembl_id in _now_mapped_genes["ENSEMBL"]
    ]
)

In [17]:
_now_mapped_genes[_now_mapped_genes["in_phenomexcan"]].shape

(7125, 3)

In [18]:
_now_mapped_genes.head()

Unnamed: 0,ENTREZID,ENSEMBL,in_phenomexcan
1,100,ENSG00000196839,True
2,1000,ENSG00000170558,True
3,10000,ENSG00000117020,True
4,10000,ENSG00000275199,False
5,10001,ENSG00000133997,True


In [19]:
# keep only the mappings flagged as `in_phenomexcan`; if an Entrez ID still
# has more than one Ensembl ID, keep the first one only
_genes_in_phenomexcan = _now_mapped_genes.loc[_now_mapped_genes["in_phenomexcan"]]
_now_mapped_genes = _genes_in_phenomexcan.drop_duplicates(subset="ENTREZID")

In [20]:
_now_mapped_genes.shape

(7120, 3)

In [21]:
_now_mapped_genes.head()

Unnamed: 0,ENTREZID,ENSEMBL,in_phenomexcan
1,100,ENSG00000196839,True
2,1000,ENSG00000170558,True
3,10000,ENSG00000117020,True
5,10001,ENSG00000133997,True
6,10005,ENSG00000101473,True


In [22]:
# dictionary from Entrez ID to Ensembl ID
_entrez_to_ensembl = _now_mapped_genes.set_index("ENTREZID")["ENSEMBL"]
_now_mapped_genes_dict = _entrez_to_ensembl.to_dict()

In [23]:
# keep only the genes that were mapped, then replace their Entrez IDs with
# the corresponding Ensembl IDs in the index
_mapped_entrez_ids = list(_now_mapped_genes_dict)
lincs_data = lincs_data.loc[_mapped_entrez_ids].rename(index=_now_mapped_genes_dict)

In [24]:
lincs_data.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
ENSG00000196839,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
ENSG00000170558,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
ENSG00000117020,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
ENSG00000133997,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
ENSG00000101473,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


In [25]:
# sanity check on the index: after the renaming, all gene IDs must have the
# same number of characters (as Ensembl IDs do)
_index_id_lengths = pd.Series(lincs_data.index.map(len))
_index_length_counts = _index_id_lengths.value_counts()
display(_index_length_counts)
assert len(_index_length_counts) == 1

15    7120
dtype: int64

In [26]:
lincs_data.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
ENSG00000196839,-1.001,-1.835,1.391,1.132,0.257,1.932,0.508,1.408,0.777,0.032,...,-1.692,-0.516,-1.435,-0.317,-0.012,0.641,-0.23,-0.518,-0.177,2.146
ENSG00000170558,1.146,-1.863,0.011,-1.02,1.143,-0.115,1.327,0.31,-1.853,0.872,...,0.354,0.498,0.268,-1.084,-0.142,-0.077,0.633,-1.807,0.032,0.135
ENSG00000117020,-0.693,1.694,-0.804,-0.164,1.145,-1.465,1.221,-0.747,0.829,-0.961,...,-1.196,-0.23,-1.049,-0.347,0.586,0.865,-0.021,2.18,-0.956,0.105
ENSG00000133997,-0.037,0.383,0.269,-0.997,0.185,-0.536,0.424,-0.119,-1.313,0.579,...,-0.343,0.116,-0.245,-0.127,-1.367,0.149,0.117,2.084,1.178,0.772
ENSG00000101473,0.162,-0.899,0.105,-0.09,-1.291,1.404,0.185,0.157,-0.327,-0.026,...,-0.136,-1.115,-0.28,0.2,0.638,-0.197,-0.36,-2.302,-0.117,-0.167


## Check for NaN values

In [27]:
assert not lincs_data.isna().any().any()

## Save

In [28]:
# pickle file for the LINCS data with genes in rows and drugs in columns
output_file = (OUTPUT_DATA_DIR / "lincs-data.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-data.pkl')

In [29]:
lincs_data.to_pickle(output_file)

### RDS format

In [45]:
output_rds_file = output_file.with_suffix(".rds")
display(output_rds_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-mashr-zscores.rds')

In [12]:
# convert the pandas DataFrame into its R counterpart using the pandas converter
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(lincs_data)

In [13]:
data_r

20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,...,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
0.169468,0.102558,0.239545,...,1.377624,0.738444,0.298259
1.358856,1.846875,0.139324,,0.101731,1.012735,0.945167
0.151008,1.173202,1.179426,,0.083316,3.493196,0.991948
1.302722,0.841524,1.578926,,1.704863,1.351619,1.027297
...,...,...,,...,...,...
1.320478,0.622330,1.817328,,1.520303,0.313632,0.324707
1.116961,0.297000,0.320781,,0.528771,2.371080,1.548179
0.402507,1.662981,0.632502,,0.352208,1.247501,1.352347
0.155576,1.060303,0.737441,,0.805663,0.026323,0.732112


In [14]:
saveRDS(data_r, str(output_rds_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f2e11835a00> [RTYPES.NILSXP]

In [46]:
# testing: load the rds file again
data_r = readRDS(str(output_rds_file))

In [47]:
# convert the reloaded R object back into a pandas DataFrame
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

In [48]:
data_again.shape

(22515, 4091)

In [49]:
data_again.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


In [53]:
# the data reloaded from the RDS file must match the original values exactly;
# index/columns names are not compared
pd.testing.assert_frame_equal(
    lincs_data,
    data_again,
    check_names=False,
    check_exact=True,
)

### Text format

In [9]:
# tsv format
output_text_file = output_file.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-mashr-zscores.tsv.gz')

In [10]:
# "%.5e" writes each value in scientific notation with 5 decimal digits, so
# the text file holds a rounded copy of the data
lincs_data.to_csv(output_text_file, sep="\t", index=True, float_format="%.5e")

In [11]:
# testing: read the text file back, using its first column as the index
data_again = pd.read_csv(output_text_file, sep="\t", index_col=0)

In [12]:
data_again.shape

(22515, 4091)

In [13]:
data_again.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.31345,1.47215,0.72616,1.51637,1.29977,1.06809,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.37762,0.738444,0.298259
ENSG00000000457,1.35886,1.84687,0.139324,0.12953,0.757757,1.10398,0.612418,1.82233,2.03537,1.00806,...,1.44179,0.654791,2.54565,1.20298,0.514244,0.237223,0.414171,0.101731,1.01274,0.945167
ENSG00000000460,0.151008,1.1732,1.17943,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.03331,0.482261,0.695624,0.33648,0.083316,3.4932,0.991948
ENSG00000000938,1.30272,0.841524,1.57893,0.72134,0.139314,4.38702,0.125959,1.24712,0.215124,0.892083,...,0.126657,0.048048,1.88636,0.540496,0.127524,1.4945,0.056432,1.70486,1.35162,1.0273
ENSG00000000971,1.33881,0.262339,0.689379,1.70202,0.325859,0.063161,1.14113,0.882682,0.035533,1.81019,...,0.858497,1.67556,2.31907,1.59872,0.162958,0.005703,3.00454,0.803669,0.444266,0.165671


In [None]:
# The text round trip is lossy (values were written with "%.5e"), so the data
# is compared with an absolute tolerance instead of exact equality.
# check_names=False is required here: the name of the columns axis
# ("perturbagen") is not stored in the text file, so it is missing in the
# reloaded data and the default names check would fail (the RDS check also
# disables it).
pd.testing.assert_frame_equal(
    lincs_data,
    data_again,
    check_names=False,
    check_exact=False,
    rtol=0.0,
    atol=5e-5,
)

# Project into MultiPLIER

In [30]:
from multiplier import MultiplierProjection

In [31]:
mproj = MultiplierProjection()

In [32]:
lincs_projection = mproj.transform(lincs_data)

In [33]:
lincs_projection.shape

(987, 1170)

In [34]:
lincs_projection.head()

perturbagen,DB00014,DB00091,DB00121,DB00130,DB00131,DB00132,DB00136,DB00140,DB00146,DB00150,...,DB08995,DB09002,DB09004,DB09009,DB09010,DB09015,DB09019,DB09020,DB09022,DB09023
LV1,0.036115,-0.35861,0.091067,0.008923,0.046469,0.136747,0.036151,0.034024,-0.027445,0.060644,...,0.06993,0.021537,0.020511,-0.00625,-0.00266,-0.01721,-0.004029,-0.14549,0.102504,0.057621
LV2,0.012281,0.009738,0.00465,-0.006554,0.021758,0.010425,0.000794,-0.000603,0.011534,-0.004425,...,0.000344,0.001123,-0.012675,-0.004849,0.000375,0.002623,-0.003252,-0.066209,0.025378,0.012453
LV3,-0.005533,-0.059174,0.013454,0.008906,-0.004939,0.033663,-0.010045,0.003824,0.009814,-0.007492,...,-0.021941,-0.002417,0.000155,0.002941,-0.0151,-0.016372,0.012249,0.009216,0.018254,-0.003516
LV4,-0.004151,-0.032884,0.005934,-0.003991,-0.028524,0.002709,0.007001,0.024704,0.005058,-0.004345,...,0.013806,0.003197,0.013099,0.002643,-0.008836,0.023852,0.028957,0.014681,0.000826,-0.009739
LV5,-0.015156,-0.005276,0.025747,0.000346,-0.013932,-0.010911,0.041403,-0.018285,0.003507,-0.007621,...,-0.011577,-0.015956,-0.027614,-0.001707,-0.000542,-0.016918,0.001959,0.024147,0.013004,-0.014166


In [35]:
assert not lincs_projection.isna().any().any()

## Save

In [36]:
# pickle file for the LINCS data projected into the MultiPLIER latent space
output_file = (OUTPUT_DATA_DIR / "lincs-projection.pkl").resolve()
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/drug_disease_analyses/lincs/lincs-projection.pkl')

In [37]:
lincs_projection.to_pickle(output_file)

### RDS format

In [45]:
output_rds_file = output_file.with_suffix(".rds")
display(output_rds_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-mashr-zscores.rds')

In [12]:
# convert the pandas DataFrame into its R counterpart using the pandas converter
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(lincs_projection)

In [13]:
data_r

20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,...,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
0.169468,0.102558,0.239545,...,1.377624,0.738444,0.298259
1.358856,1.846875,0.139324,,0.101731,1.012735,0.945167
0.151008,1.173202,1.179426,,0.083316,3.493196,0.991948
1.302722,0.841524,1.578926,,1.704863,1.351619,1.027297
...,...,...,,...,...,...
1.320478,0.622330,1.817328,,1.520303,0.313632,0.324707
1.116961,0.297000,0.320781,,0.528771,2.371080,1.548179
0.402507,1.662981,0.632502,,0.352208,1.247501,1.352347
0.155576,1.060303,0.737441,,0.805663,0.026323,0.732112


In [14]:
saveRDS(data_r, str(output_rds_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f2e11835a00> [RTYPES.NILSXP]

In [46]:
# testing: load the rds file again
data_r = readRDS(str(output_rds_file))

In [47]:
# convert the reloaded R object back into a pandas DataFrame
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

In [48]:
data_again.shape

(22515, 4091)

In [49]:
data_again.head()

Unnamed: 0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


In [53]:
# the data reloaded from the RDS file must match the original values exactly;
# index/columns names are not compared
pd.testing.assert_frame_equal(
    lincs_projection,
    data_again,
    check_names=False,
    check_exact=True,
)

### Text format

In [9]:
# tsv format
output_text_file = output_file.with_suffix(".tsv.gz")
display(output_text_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/gene_assoc/smultixcan-mashr-zscores.tsv.gz')

In [10]:
# "%.5e" writes each value in scientific notation with 5 decimal digits, so
# the text file holds a rounded copy of the data
lincs_projection.to_csv(output_text_file, sep="\t", index=True, float_format="%.5e")

In [11]:
# testing: read the text file back, using its first column as the index
data_again = pd.read_csv(output_text_file, sep="\t", index_col=0)

In [12]:
data_again.shape

(22515, 4091)

In [13]:
data_again.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.31345,1.47215,0.72616,1.51637,1.29977,1.06809,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.37762,0.738444,0.298259
ENSG00000000457,1.35886,1.84687,0.139324,0.12953,0.757757,1.10398,0.612418,1.82233,2.03537,1.00806,...,1.44179,0.654791,2.54565,1.20298,0.514244,0.237223,0.414171,0.101731,1.01274,0.945167
ENSG00000000460,0.151008,1.1732,1.17943,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.03331,0.482261,0.695624,0.33648,0.083316,3.4932,0.991948
ENSG00000000938,1.30272,0.841524,1.57893,0.72134,0.139314,4.38702,0.125959,1.24712,0.215124,0.892083,...,0.126657,0.048048,1.88636,0.540496,0.127524,1.4945,0.056432,1.70486,1.35162,1.0273
ENSG00000000971,1.33881,0.262339,0.689379,1.70202,0.325859,0.063161,1.14113,0.882682,0.035533,1.81019,...,0.858497,1.67556,2.31907,1.59872,0.162958,0.005703,3.00454,0.803669,0.444266,0.165671


In [None]:
# The text round trip is lossy (values were written with "%.5e"), so the data
# is compared with an absolute tolerance instead of exact equality.
# check_names=False is required here: the name of the columns axis
# ("perturbagen") is not stored in the text file, so it is missing in the
# reloaded data and the default names check would fail (the RDS check also
# disables it).
pd.testing.assert_frame_equal(
    lincs_projection,
    data_again,
    check_names=False,
    check_exact=False,
    rtol=0.0,
    atol=5e-5,
)