In [2]:
%load_ext autoreload
%autoreload 1
%aimport prepare_data

from prepare_data import * 
from pygenesig.tools import collapse_matrix
from pygenesig.file_formats import * 

import numpy as np
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preprocess GTEx data
For the cross-validation we need:
* a matrix of gene expression data
* a list of target classes (signatures)

## Load data

In [5]:
!ls ../../data/v6/

GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct
GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct
GTEx_Data_V6_Annotations_SampleAttributesDS.txt


In [16]:
# Only the `symbol` column is used downstream. Restricting the parse to it
# skips type inference on every other column (the likely source of the
# mixed-dtype warning this cell emitted) and lowers memory use.
hgnc_symbols = pd.read_csv(
    "../../data/hgnc_complete_set.txt", sep="\t", usecols=["symbol"]
).symbol.tolist()

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Sample attribute table (tab-separated); the first column becomes the row index.
pdata = pd.read_csv(
    "../../data/v6/GTEx_Data_V6_Annotations_SampleAttributesDS.txt",
    sep="\t",
    index_col=0,
)

In [10]:
# Expression table in GCT format (tab-separated). The first two lines are
# skipped -- presumably the GCT version and dimension lines preceding the
# column header. The first column becomes the row index.
gct = pd.read_csv(
    "../../data/v6/GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct",
    sep="\t",
    skiprows=2,
    index_col=0,
)

## Process data

In [29]:
# Align the sample annotations with the expression columns. Column 0 of `gct`
# is `Description`, so the remaining column labels (position 1 onward) are used
# to select and order the annotation rows.
pdata = pdata.loc[gct.columns[1:], :]
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; it yields the same NumPy array on old and new versions.
target = pdata.SMTSD.apply(sanitze_name).values
exprs = gct.iloc[:, 1:].values
# Keep only the part of each Description entry before the first ".".
gene_symbols = gct.Description.apply(lambda x: x.split(".")[0])

In [30]:
# Sanity checks: one target label per annotated sample, and an expression
# matrix with one row per gene symbol and one column per sample.
assert pdata.shape[0] == target.shape[0], "annotation rows and target labels differ in length"
assert exprs.shape == (gene_symbols.shape[0], target.shape[0]), "expression matrix is not (genes x samples)"
pdata.shape, target.shape, exprs.shape, gene_symbols.shape

((8555, 63), (8555,), (56238, 8555), (56238,))

### Collapse gene expression by symbol

In [24]:
# Collapse the expression matrix along axis 0 by gene symbol, using np.sum as
# the aggregate (presumably rows sharing a symbol are summed -- see
# pygenesig.tools.collapse_matrix for the exact contract).
exprs_collapsed = collapse_matrix(
    exprs,
    gene_symbols.tolist(),
    axis=0,
    aggregate_fun=np.sum,
)

In [25]:
# Dimensions of the matrix after collapsing by gene symbol.
exprs_collapsed.shape

(46226, 8555)

In [26]:
# Keep only the rows whose index label appears in the HGNC symbol list.
is_hgnc = exprs_collapsed.index.isin(hgnc_symbols)
exprs_collapsed = exprs_collapsed[is_hgnc]

In [27]:
# Dimensions of the matrix after restricting rows to HGNC symbols.
exprs_collapsed.shape

(31986, 8555)

## Store results

In [33]:
!mkdir -p ../../data_processed/v6

In [34]:
# Persist the per-sample target labels.
write_target(target, "../../data_processed/v6/target.csv")
# Persist the row labels (gene symbols) of the collapsed matrix, in row order.
write_rosetta(exprs_collapsed.index.tolist(), "../../data_processed/v6/rosetta.csv")

In [35]:
# Persist the collapsed expression values as a plain NumPy array.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0.
write_expr(exprs_collapsed.values, "../../data_processed/v6/exprs.npy")

In [37]:
# Show the sorted, distinct target labels as a final check of the class names.
np.unique(target)

array(['Adipose_Subcutaneous', 'Adipose_Visceral_Omentum', 'Adrenal_Gland',
       'Artery_Aorta', 'Artery_Coronary', 'Artery_Tibial', 'Bladder',
       'Brain_Amygdala', 'Brain_Anterior_cingulate_cortex_BA24',
       'Brain_Caudate_basal_ganglia', 'Brain_Cerebellar_Hemisphere',
       'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex_BA9',
       'Brain_Hippocampus', 'Brain_Hypothalamus',
       'Brain_Nucleus_accumbens_basal_ganglia',
       'Brain_Putamen_basal_ganglia', 'Brain_Spinal_cord_cervical_c-1',
       'Brain_Substantia_nigra', 'Breast_Mammary_Tissue',
       'Cells_EBV-transformed_lymphocytes',
       'Cells_Transformed_fibroblasts', 'Cervix_Ectocervix',
       'Cervix_Endocervix', 'Colon_Sigmoid', 'Colon_Transverse',
       'Esophagus_Gastroesophageal_Junction', 'Esophagus_Mucosa',
       'Esophagus_Muscularis', 'Fallopian_Tube', 'Heart_Atrial_Appendage',
       'Heart_Left_Ventricle', 'Kidney_Cortex', 'Liver', 'Lung',
       'Minor_Salivary_Gland', 'Muscle_Skelet