# Description

This notebook exports the combined GLS results (stored as a pickle file) into more accessible data formats: RDS (for R) and gzip-compressed TSV.

# Modules

In [1]:
from pathlib import Path

import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from tqdm import tqdm

import conf
from gls import GLSPhenoplier

In [2]:
# Handle to R's `readRDS` function, looked up through rpy2 so it can be
# called from Python (used below to read the exported file back).
readRDS = ro.r["readRDS"]

In [3]:
# Handle to R's `saveRDS` function, looked up through rpy2 so it can be
# called from Python (used below to write the RDS export).
saveRDS = ro.r["saveRDS"]

# Settings

In [4]:
# Location of the GLS results, taken from the project configuration.
# The input file is read from here and the exported files are written
# next to it.
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# Fail early if the configured path does not exist.
assert OUTPUT_DIR.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls')

# Get results files

In [5]:
# Pickled results file that the rest of the notebook exports to other formats.
input_filepath = OUTPUT_DIR / "gls_phenotypes-combined-phenomexcan.pkl"
display(input_filepath)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-combined-phenomexcan.pkl')

In [6]:
# Load the pickled results; the rest of the notebook treats the loaded
# object as a pandas DataFrame.
data = pd.read_pickle(input_filepath)

In [7]:
# Number of rows and columns that were loaded.
data.shape

(5782, 8)

In [8]:
# Preview the first rows of the loaded results.
data.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
0,29,22,100002_raw-Energy,LV246,0.002401,0.424213,0.848425,0.680765
1,29,22,100002_raw-Energy,LV607,-0.006323,0.691499,0.617002,0.87335
2,29,22,100002_raw-Energy,LV612,-0.000822,0.525786,0.948428,0.763843
3,29,22,100002_raw-Energy,LV74,-0.006035,0.685313,0.629373,0.870769
4,29,22,100002_raw-Energy,LV838,0.024454,0.023446,0.046891,0.087403


In [9]:
# Sanity check: no cell of the table may be missing before exporting.
assert not data.isna().any().any()

# Save

## RDS format

In [10]:
# RDS output path: same location and name as the input, with ".pkl"
# replaced by ".rds".
output_file = input_filepath.with_suffix(".rds")
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-combined-phenomexcan.rds')

In [11]:
# Convert the pandas DataFrame into an R object. The pandas converter is
# added on top of rpy2's default one only for the duration of this block.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_r = ro.conversion.py2rpy(data)

In [12]:
# Inspect the converted R object.
data_r

part_k,cluster_id,phenotype,...,pvalue,pvalue_twosided,fdr
29,22,'100002_r...,...,0.424213,0.848425,0.680765
29,22,'100002_r...,,0.691499,0.617002,0.873350
29,22,'100002_r...,,0.525786,0.948428,0.763843
29,22,'100002_r...,,0.685313,0.629373,0.870769
...,...,...,,...,...,...
29,4,'reticulo...,,0.000929,0.001858,0.006172
29,4,'reticulo...,,0.003248,0.006495,0.017371
29,4,'reticulo...,,0.000000,0.000000,0.000000
29,4,'reticulo...,,0.000000,0.000000,0.000000


In [13]:
# Write the R object to disk. `saveRDS` returns R's NULL; the trailing
# semicolon keeps that meaningless value from being echoed as cell output.
saveRDS(data_r, str(output_file));

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f407abf53c0> [RTYPES.NILSXP]

In [14]:
# testing: load the rds file again
# (this rebinds `data_r` to the object read back from disk)
data_r = readRDS(str(output_file))

In [15]:
# Turn the reloaded R object back into a pandas DataFrame; only the
# conversion itself needs the pandas-aware converter.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

# Cast the index labels to integers so the frame can be compared with `data`
# (presumably the R round trip returns the row names as strings -- confirm).
data_again.index = data_again.index.astype(int)

In [16]:
# Shape of the frame read back from the RDS file.
data_again.shape

(5782, 8)

In [17]:
# Preview of the frame read back from the RDS file.
data_again.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
0,29,22,100002_raw-Energy,LV246,0.002401,0.424213,0.848425,0.680765
1,29,22,100002_raw-Energy,LV607,-0.006323,0.691499,0.617002,0.87335
2,29,22,100002_raw-Energy,LV612,-0.000822,0.525786,0.948428,0.763843
3,29,22,100002_raw-Energy,LV74,-0.006035,0.685313,0.629373,0.870769
4,29,22,100002_raw-Energy,LV838,0.024454,0.023446,0.046891,0.087403


In [18]:
# Round-trip check: the frame read back from the RDS file must match the
# original one; differences in column dtypes are tolerated.
pd.testing.assert_frame_equal(
    data,
    data_again,
    check_dtype=False,
)

## Text format

In [19]:
# tsv format
# Same location and name as the input, with ".pkl" replaced by ".tsv.gz".
output_file = input_filepath.with_suffix(".tsv.gz")
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/results/gls/gls_phenotypes-combined-phenomexcan.tsv.gz')

In [20]:
# Write a tab-separated file without the index column. Floats are written
# in scientific notation with 5 decimal places, so the export is rounded.
# With pandas' default compression inference, the ".gz" suffix produces a
# gzip-compressed file.
data.to_csv(output_file, sep="\t", index=False, float_format="%.5e")

In [21]:
# testing: build both sides of the TSV round-trip comparison

# Reference copy of the original data, relabelled with positions 0..n-1
# because the index was not written to the file.
data2 = data.copy()
data2.index = list(range(len(data2)))

# Read the exported file back and give it a plain integer index as well.
# NOTE(review): the float cast of "part_k" looks redundant given that the
# comparison below uses check_dtype=False -- confirm before removing.
data_again = pd.read_csv(output_file, sep="\t").astype({"part_k": float})
data_again.index = data_again.index.tolist()

In [22]:
# Shape of the frame read back from the TSV file.
data_again.shape

(5782, 8)

In [23]:
# Preview of the frame read back from the TSV file.
data_again.head()

Unnamed: 0,part_k,cluster_id,phenotype,lv,coef,pvalue,pvalue_twosided,fdr
0,29.0,22,100002_raw-Energy,LV246,0.002401,0.424213,0.848425,0.680765
1,29.0,22,100002_raw-Energy,LV607,-0.006323,0.691499,0.617002,0.87335
2,29.0,22,100002_raw-Energy,LV612,-0.000822,0.525786,0.948428,0.763843
3,29.0,22,100002_raw-Energy,LV74,-0.006035,0.685313,0.629373,0.870769
4,29.0,22,100002_raw-Energy,LV838,0.024454,0.023446,0.046891,0.087403


In [24]:
# Round-trip check: the frame read back from the TSV file must match the
# re-indexed copy of the original data; dtype differences are tolerated and
# values are compared with assert_frame_equal's default tolerance.
pd.testing.assert_frame_equal(
    data2,
    data_again,
    check_dtype=False,
)