In [2]:
import sys
sys.path.append("..")

%load_ext autoreload
%autoreload 1
%aimport pygenesig
%matplotlib inline

import numpy as np
import pandas as pd
from collections import OrderedDict


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preprocess GTEx data
For the cross-validation we need
* a matrix of gene expression data
* a list of target classes (signatures)

Here, we extract the relevant information from the GTEx `.gct` and annotation files.

In [3]:
!ls ../data/gtex

GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct	sigmat.all.gini
GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct	sigmat.rank1.gini
GTEx-UDISDataSetID5681-sampleAnnotation.txt		v6_UDIS.txt


In [4]:
# Two tab-separated metadata tables, each indexed by its first column.
annotation = pd.read_csv(
    "../data/gtex/GTEx-UDISDataSetID5681-sampleAnnotation.txt",
    sep="\t",
    index_col=0,
)
tissue = pd.read_csv(
    "../data/gtex/v6_UDIS.txt",
    sep="\t",
    index_col=0,
)

In [5]:
# Expression table: the first two lines of the file are skipped before the
# header row is read, and the first column becomes the row index.
gct = pd.read_csv(
    "../data/gtex/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct",
    sep="\t",
    skiprows=2,
    index_col=0,
)

In [6]:
# Keep the "Description" column as its own Series before it is removed.
genenames = gct["Description"]
# Drop the first column so that `gct` holds only the remaining columns.
# NOTE(review): this is positional and assumes "Description" is that first
# column; re-running the cell on the already-trimmed frame will fail above.
gct = gct.iloc[:, 1:]

In [7]:
# Ordered lookup from each row label of `gct` to its gene name, following the
# row order of `gct`.
assert len(genenames) == len(gct.index)
# Only the part of each name before the first "." is kept.
# NOTE(review): names that themselves contain a "." are truncated by this
# split, so distinct names can map to the same value; confirm this is intended.
ensemble2hgnc = OrderedDict(
    zip(gct.index, (name.split(".")[0] for name in genenames))
)
# Duplicate row labels would collapse into a single key, which this catches.
assert list(ensemble2hgnc) == list(gct.index)

In [8]:
# Inner join on the index: only rows present in both tables are kept.
col_vars = annotation.join(other=tissue, how="inner")

In [9]:
# Restrict the metadata to rows whose index label is a column of `gct`.
has_expression = col_vars.index.isin(gct.columns)
col_vars = col_vars.loc[has_expression]

In [10]:
# One group of rows per distinct value of the "SMTS" column.
grouped = col_vars.groupby(by="SMTS")

In [11]:
# Number of rows (samples) in each SMTS group.
# `size()` counts rows directly. The previous `count().iloc[:, 1]` counted the
# non-null entries of whichever column happened to sit at position 1, which
# undercounts a group as soon as that column has missing values.
tissue_count = grouped.size()
tissue_count

SMTS
Adipose Tissue      577
Adrenal Gland       145
Bladder              11
Blood               511
Blood Vessel        689
Brain              1259
Breast              214
Cervix Uteri         11
Colon               345
Esophagus           686
Fallopian Tube        6
Heart               412
Kidney               32
Liver               119
Lung                320
Muscle              430
Nerve               304
Ovary                97
Pancreas            171
Pituitary           103
Prostate            106
Salivary Gland       57
Skin                890
Small Intestine      88
Spleen              104
Stomach             192
Testis              172
Thyroid             323
Uterus               83
Vagina               96
Name: Experimental readout, dtype: int64

In [12]:
# Keep only the rows whose SMTS group has more than 30 members.
kept_tissues = tissue_count.loc[tissue_count > 30].index
col_vars = col_vars.loc[col_vars["SMTS"].isin(kept_tissues)]

## Store results

In [25]:
# Export the expression values of exactly the samples kept in `col_vars`, in
# the same order, so that column i of `exprs` belongs to row i of `col_vars`
# (from which the target labels are taken). Exporting all of `gct` would
# include samples that were filtered out of `col_vars`.
# `.values` replaces `DataFrame.as_matrix()`, which newer pandas no longer has.
exprs = gct.loc[:, col_vars.index].values
assert exprs.shape[1] == len(col_vars), "expression columns and samples are out of sync"
# `tofile` writes only the raw array contents (no shape or dtype header), so
# whoever reads this file back has to know both.
exprs.tofile("../data/gtex/exprs.mat")

In [23]:
# One SMTS label per row of `col_vars`, written to a text file one per line.
target = np.array(col_vars["SMTS"])
np.savetxt(
    "../data/gtex/target.csv",
    target,
    fmt="%s",
    delimiter=",",
)