# Description

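This notebook exports the GLS summary results into other, more accessible data formats: a copy of the original pickle file, an RDS file (for R), and a gzip-compressed tab-separated text file.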

# Modules

In [1]:
from pathlib import Path
import shutil

import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

import conf
from utils import remove_all_file_extensions

In [2]:
# R's `readRDS` function, looked up through rpy2 so it can be called from Python.
readRDS = ro.r["readRDS"]

In [3]:
# R's `saveRDS` function, looked up through rpy2 so it can be called from Python.
saveRDS = ro.r["saveRDS"]

# Settings

In [4]:
# Exported files are written to "data/gls", located two directory levels
# above the `conf` module file.
OUTPUT_DIR = Path(conf.__file__).parent.parent.joinpath("data", "gls")
display(OUTPUT_DIR)
# Create the directory along with any missing parents; an already existing
# directory is not an error.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/code/data/gls')

# Get results files

In [5]:
# Pickled summary of the GLS results; this is the file exported below.
input_filepath = conf.RESULTS["GLS"].joinpath("gls-summary-emerge.pkl.gz")
display(input_filepath)

PosixPath('/opt/data/results/gls/gls-summary-emerge.pkl.gz')

In [6]:
# Load the results table from the pickle file.
# NOTE(review): unpickling executes code embedded in the file, so this is only
# safe for trusted files; presumably this one is produced by this project's own
# pipeline -- confirm.
data = pd.read_pickle(input_filepath)

In [7]:
# (rows, columns) of the loaded table.
data.shape

(304983, 5)

In [8]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
0,EUR_440.2,Atherosclerosis of the extremities,LV472,1.033637e-07,0.000658
1,EUR_440.2,Atherosclerosis of the extremities,LV182,3.710244e-07,0.001432
2,EUR_440.2,Atherosclerosis of the extremities,LV348,7.379936e-07,0.002558
3,EUR_440.2,Atherosclerosis of the extremities,LV504,1.534424e-06,0.0045
4,EUR_440.2,Atherosclerosis of the extremities,LV445,2.912525e-06,0.007402


In [9]:
# The table must be complete before exporting it. Fail early and name the
# offending columns, so a failure can be diagnosed without extra digging.
columns_with_na = data.columns[data.isna().any()].tolist()
assert not columns_with_na, f"Missing values found in columns: {columns_with_na}"

# Save

## Pickle format

In [10]:
# Copy the original pickle file as-is into the output directory; the path of
# the copy is returned (and shown as the cell output).
shutil.copy(input_filepath, OUTPUT_DIR)

'/opt/code/data/gls/gls-summary-emerge.pkl.gz'

## RDS format

In [11]:
# Name the RDS file after the input file (its extensions replaced by ".rds")
# and place it in the output directory.
output_file = (
    OUTPUT_DIR / remove_all_file_extensions(input_filepath).with_suffix(".rds").name
)
display(output_file)

PosixPath('/opt/code/data/gls/gls-summary-emerge.rds')

In [12]:
# Convert the pandas table into its R counterpart. The pandas conversion rules
# are enabled only inside this context.
pandas_aware_converter = ro.default_converter + pandas2ri.converter
with localconverter(pandas_aware_converter):
    data_r = ro.conversion.py2rpy(data)

In [13]:
# Show the converted R object.
data_r

phenotype,phenotype_desc,lv,pvalue,fdr
...,...,...,...,...


In [14]:
# Write the R object to disk with R's `saveRDS`; the path is passed as a plain
# string. The call returns R's NULL, which is what the cell output shows.
saveRDS(data_r, str(output_file))

<rpy2.rinterface_lib.sexp.NULLType object at 0x7f85ac069c00> [RTYPES.NILSXP]

In [15]:
# Round-trip test: read back the RDS file that was just written.
# Note that this rebinds `data_r` to the object loaded from disk.
data_r = readRDS(str(output_file))

In [16]:
# Convert the object read from the RDS file back into a pandas object.
with localconverter(ro.default_converter + pandas2ri.converter):
    data_again = ro.conversion.rpy2py(data_r)

# Cast the row labels to integers so they can be compared with the original
# table (presumably they come back from R as strings -- TODO confirm).
data_again.index = data_again.index.astype(int)

In [17]:
# (rows, columns) of the table read back from the RDS file.
data_again.shape

(304983, 5)

In [18]:
# Preview the first rows of the table read back from the RDS file.
data_again.head()

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
0,EUR_440.2,Atherosclerosis of the extremities,LV472,1.033637e-07,0.000658
1,EUR_440.2,Atherosclerosis of the extremities,LV182,3.710244e-07,0.001432
2,EUR_440.2,Atherosclerosis of the extremities,LV348,7.379936e-07,0.002558
3,EUR_440.2,Atherosclerosis of the extremities,LV504,1.534424e-06,0.0045
4,EUR_440.2,Atherosclerosis of the extremities,LV445,2.912525e-06,0.007402


In [19]:
# The table recovered from the RDS file must match the original one; column
# dtypes are allowed to differ after the round trip through R.
pd.testing.assert_frame_equal(left=data, right=data_again, check_dtype=False)

## Text format

In [20]:
# Tab-separated text, gzip-compressed: name the file after the input file (its
# extensions replaced by ".tsv.gz") and place it in the output directory.
output_file = (
    OUTPUT_DIR
    / remove_all_file_extensions(input_filepath).with_suffix(".tsv.gz").name
)
display(output_file)

PosixPath('/opt/code/data/gls/gls-summary-emerge.tsv.gz')

In [21]:
# Write the table as tab-separated text without the row index.
# Floats are written in scientific notation with 5 decimal digits ("%.5e"), so
# the text export keeps only 6 significant digits of each float value.
data.to_csv(output_file, sep="\t", index=False, float_format="%.5e")

In [22]:
# Round-trip test for the text export.
# The row index was not written to the file, so the reference copy gets a
# plain 0..n-1 integer index to compare against.
data2 = data.copy()
data2.index = list(range(len(data2)))

# Read the file back and give it the same kind of plain integer index.
data_again = pd.read_csv(output_file, sep="\t")
data_again.index = data_again.index.tolist()

In [23]:
# (rows, columns) of the table read back from the text file.
data_again.shape

(304983, 5)

In [24]:
# Preview the first rows of the table read back from the text file.
data_again.head()

Unnamed: 0,phenotype,phenotype_desc,lv,pvalue,fdr
0,EUR_440.2,Atherosclerosis of the extremities,LV472,1.03364e-07,0.000658
1,EUR_440.2,Atherosclerosis of the extremities,LV182,3.71024e-07,0.001432
2,EUR_440.2,Atherosclerosis of the extremities,LV348,7.37994e-07,0.002558
3,EUR_440.2,Atherosclerosis of the extremities,LV504,1.53442e-06,0.0045
4,EUR_440.2,Atherosclerosis of the extremities,LV445,2.91253e-06,0.007402


In [25]:
# The table recovered from the text file must match the reference copy.
# Floats were written with limited precision, so this relies on
# `assert_frame_equal` comparing floats approximately (`check_exact` is not
# requested); dtype and categorical checks are disabled.
pd.testing.assert_frame_equal(
    left=data2,
    right=data_again,
    check_dtype=False,
    check_categorical=False,
)