In [2]:
import sys
sys.path.append("..")

%load_ext autoreload
%autoreload 1
%aimport pygenesig
%matplotlib inline

import numpy as np
import pandas as pd
from collections import OrderedDict


# Preprocess GTEx data
For the cross-validation we need
* a matrix of gene expression data
* a list of target classes (signatures)

Here, we extract the relevant information from the GTEx `.gct` and annotation files.

In [3]:
# List the files currently present in the GTEx data directory.
!ls ../data/gtex

exprs.npy						sigmat.all.gini
GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct	sigmat.rank1.gini
GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct	target.csv
GTEx-UDISDataSetID5681-sampleAnnotation.txt		v6_UDIS.txt


In [4]:
# Two tab-separated tables; in both, the first column becomes the row index.
# They are combined on that index in a later cell.
annotation = pd.read_csv(
    "../data/gtex/GTEx-UDISDataSetID5681-sampleAnnotation.txt",
    sep="\t",
    index_col=0,
)
tissue = pd.read_csv(
    "../data/gtex/v6_UDIS.txt",
    sep="\t",
    index_col=0,
)

In [5]:
# Expression table (.gct): the first two lines of the file are skipped and the
# first remaining column becomes the row index.
gct = pd.read_csv(
    "../data/gtex/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct",
    sep="\t",
    skiprows=2,
    index_col=0,
)

In [6]:
# Set the Description column aside as the gene names, then drop the first
# column so that only the remaining columns stay in gct.
# NOTE(review): assumes Description is the first column after the index, as in
# the GCT layout -- confirm. Re-running this cell on the already trimmed frame
# fails because Description is gone.
genenames = gct["Description"]
gct = gct.iloc[:, 1:]

In [7]:
# Ordered mapping from each row label of gct (presumably an Ensembl gene ID)
# to the matching entry of genenames, cut at its first ".".
assert len(gct.index) == len(genenames)
ensemble2hgnc = OrderedDict()
for ens, hgnc in zip(gct.index, genenames):
    # NOTE(review): the split is applied to the gene name, not to the row label,
    # so everything after the first "." of the name is discarded -- confirm
    # that this is intended.
    ensemble2hgnc[ens] = hgnc.split(".")[0]
# Fails if row labels are duplicated (the dict would collapse them) or reordered.
assert list(ensemble2hgnc.keys()) == list(gct.index)

In [8]:
# Inner join of the two tables on their row index: only rows whose index label
# occurs in both annotation and tissue are kept.
col_vars = annotation.join(tissue, how="inner")

In [9]:
# Number of annotated samples per tissue (SMTS).
# size() counts the rows of each group, so the result no longer depends on
# which column happens to sit at position 1 or on missing values in it
# (count().iloc[:, 1] only counted the non-null entries of that one column).
grouped = col_vars.groupby("SMTS")
tissue_count = grouped.size()
tissue_count

SMTS
Adipose Tissue      655
Adrenal Gland       161
Bladder              13
Blood              1965
Blood Vessel        831
Bone Marrow         127
Brain              1632
Breast              222
Cervix Uteri         11
Colon               387
Esophagus           814
Fallopian Tube        7
Heart               555
Kidney               38
Liver               143
Lung                497
Muscle              566
Nerve               390
Ovary               112
Pancreas            204
Pituitary           128
Prostate            123
Salivary Gland       71
Skin               1046
Small Intestine     106
Spleen              121
Stomach             210
Testis              209
Thyroid             437
Uterus               93
Vagina               99
Name: Experimental readout, dtype: int64

In [10]:
# Keep only the samples whose tissue has a tissue_count above 30.
col_vars = col_vars.loc[
    col_vars["SMTS"].isin(tissue_count.loc[tissue_count > 30].index)
]

In [11]:
# Dimensions of the annotation table and the expression matrix after the tissue filter.
col_vars.shape, gct.shape

((11942, 32), (56318, 8555))

In [12]:
# Keep only the annotation rows whose index label is also a column of gct.
col_vars = col_vars.loc[col_vars.index.isin(gct.columns)]

In [13]:
# Dimensions after restricting the annotation rows to the columns of gct.
col_vars.shape, gct.shape

((8525, 32), (56318, 8555))

In [14]:
# Keep only the expression columns that still have a row in col_vars.
sample_has_annotation = gct.columns.isin(col_vars.index)
gct = gct.loc[:, sample_has_annotation]

In [15]:
# Dimensions after restricting the columns of gct to the rows of col_vars.
col_vars.shape, gct.shape

((8525, 32), (56318, 8525))

## Store results

In [15]:
# Store the expression values as a plain NumPy array (rows and columns as in gct).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same array and exists in old and new pandas versions alike.
exprs = gct.values
np.save("../data/gtex/exprs.npy", exprs)

**Make sure that `target` is in the same order as the columns of `exprs`.**

In [16]:
# Sanity checks on the tissue labels looked up in the column order of gct:
# the looked-up index must equal the columns of gct, and no label may be missing.
assert (gct.columns == col_vars.loc[gct.columns, "SMTS"].index).all(), (
    "order of exprs and target are not identical"
)
assert not col_vars.loc[gct.columns, "SMTS"].isnull().any(), (
    "tissue annotation contains NaN's"
)

In [17]:
# One tissue label (as str) per expression column, in the column order of gct,
# written out with one label per line.
target = np.array(list(map(str, col_vars.loc[gct.columns, "SMTS"])))
np.savetxt("../data/gtex/target.csv", target, fmt="%s", delimiter=",")

In [18]:
# Distinct tissue labels that remain in target.
np.unique(target)

array(['Adipose Tissue', 'Adrenal Gland', 'Blood', 'Blood Vessel', 'Brain',
       'Breast', 'Colon', 'Esophagus', 'Heart', 'Kidney', 'Liver', 'Lung',
       'Muscle', 'Nerve', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate',
       'Salivary Gland', 'Skin', 'Small Intestine', 'Spleen', 'Stomach',
       'Testis', 'Thyroid', 'Uterus', 'Vagina'], 
      dtype='<U15')

In [22]:
# Per-sample covariates, rows in the column order of gct, saved next to the
# other outputs.
covariate_columns = ["SMTS", "Gender", "RIN"]
covariates = col_vars.loc[gct.columns, covariate_columns]
covariates.to_csv("../data/gtex/covariates.csv")

In [20]:
# Preview the first rows only instead of rendering the whole table.
covariates.head(10)

Unnamed: 0,SMTS,Gender,RIN
GTEX-111CU-1826-SM-5GZYN,Adipose Tissue,Male,7.5
GTEX-111FC-0226-SM-5N9B8,Adipose Tissue,Male,7.3
GTEX-111VG-2326-SM-5N9BK,Adipose Tissue,Male,7.7
GTEX-111YS-2426-SM-5GZZQ,Adipose Tissue,Male,6.6
GTEX-1122O-2026-SM-5NQ91,Adipose Tissue,Female,6.3
GTEX-1128S-2126-SM-5H12U,Adipose Tissue,Female,6.5
GTEX-113IC-0226-SM-5HL5C,Adipose Tissue,Male,6.7
GTEX-117YX-2226-SM-5EGJJ,Adipose Tissue,Male,7.4
GTEX-11DXW-0326-SM-5H11W,Adipose Tissue,Male,7.3
GTEX-11DXX-2326-SM-5Q5A2,Adipose Tissue,Female,8.1
