In [3]:
import sys
sys.path.append("..")

%load_ext autoreload
%autoreload 1
%aimport pygenesig
%matplotlib inline

import numpy as np
import pandas as pd
from collections import OrderedDict
import dask.dataframe as dd


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preprocess GTEx data
For the cross-validation we need
* a matrix of gene expression data
* a list of target classes (signatures)

Here, we extract the relevant information from the GTEx `.gct` and annotation files.

In [2]:
# List the files currently present in the GTEx data directory.
!ls ../data/gtex

covariates.csv
exprs_counts.npy
exprs.npy
gene_symbols.csv
GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct
GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct
GTEx-UDISDataSetID5681-sampleAnnotation.txt
sigmat.all.gini
sigmat.rank1.gini
target.csv
v6_UDIS.txt


In [3]:
# Load the two tab-separated annotation tables; in both, the first column
# becomes the row index.
annotation = pd.read_csv(
    "../data/gtex/GTEx-UDISDataSetID5681-sampleAnnotation.txt",
    sep="\t",
    index_col=0,
)
tissue = pd.read_csv(
    "../data/gtex/v6_UDIS.txt",
    sep="\t",
    index_col=0,
)

In [4]:
# Capture the first three lines of the RPKM and the read-count .gct files
# through the shell and require them to be identical. The RPKM file is later
# parsed with skiprows=2, so its third line is the column header; equality here
# means both files carry the same columns in the same order.
gct_head_rpkm = !head -n3 ../data/gtex/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct
gct_head_reads = !head -n3 ../data/gtex/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct
assert gct_head_rpkm == gct_head_reads

In [5]:
# Capture the first field of every line of both .gct files through the shell
# and require the two lists to be identical, i.e. both files have the same
# first-column entries in the same order.
col1_rpkm = !cut -f1 ../data/gtex/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct
col1_reads = !cut -f1 ../data/gtex/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct
assert col1_rpkm == col1_reads

In [6]:
# Parse the RPKM .gct file as a tab-separated table. The first two lines are
# skipped so that the third line serves as the column header, and the first
# column becomes the row index.
gct = pd.read_csv(
    "../data/gtex/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct",
    sep="\t",
    skiprows=2,
    index_col=0,
)

In [7]:
# Keep the `Description` column as the gene names, then remove that column
# from the expression table. Dropping it by name (rather than by position)
# guarantees that only the non-numeric `Description` column is removed,
# wherever it sits in the frame.
genenames = gct.Description
gct = gct.drop("Description", axis=1)

In [8]:
# Keep only the part of each gene name before the first ".".
# The vectorised `.str` accessor replaces the per-element lambda and
# propagates missing values instead of raising on them.
# NOTE(review): this also truncates any name that legitimately contains a
# dot - confirm that this is intended.
genenames = genenames.str.split(".").str[0]

In [9]:
# Ordered mapping from the row identifiers of `gct` to the corresponding gene
# name (cut at the first "."). The final assertion only holds if no row
# identifier occurs twice and the order of `gct.index` is preserved.
assert len(genenames) == len(gct.index)
stripped_names = (name.split(".")[0] for name in genenames)
ensemble2hgnc = OrderedDict(zip(gct.index, stripped_names))
assert list(gct.index) == list(ensemble2hgnc.keys())

In [10]:
# Combine the annotation and tissue tables on their indexes; the inner join
# keeps only entries that are present in both tables.
col_vars = annotation.join(tissue, how="inner")

In [11]:
# Number of rows of `col_vars` per tissue (`SMTS`).
# `size()` counts the rows of each group directly, so the result depends
# neither on which column happens to sit at position 1 nor on missing values
# in that column (which `count()` would silently leave out).
grouped = col_vars.groupby("SMTS")
tissue_count = grouped.size()
tissue_count

SMTS
Adipose Tissue      655
Adrenal Gland       161
Bladder              13
Blood              1965
Blood Vessel        831
Bone Marrow         127
Brain              1632
Breast              222
Cervix Uteri         11
Colon               387
Esophagus           814
Fallopian Tube        7
Heart               555
Kidney               38
Liver               143
Lung                497
Muscle              566
Nerve               390
Ovary               112
Pancreas            204
Pituitary           128
Prostate            123
Salivary Gland       71
Skin               1046
Small Intestine     106
Spleen              121
Stomach             210
Testis              209
Thyroid             437
Uterus               93
Vagina               99
Name: Experimental readout, dtype: int64

In [12]:
# Keep only rows whose tissue (`SMTS`) has more than 30 entries in `tissue_count`.
abundant_tissues = tissue_count[tissue_count > 30]
col_vars = col_vars.loc[col_vars["SMTS"].isin(abundant_tissues.index)]

In [13]:
# Shapes before aligning the annotation rows with the expression columns.
col_vars.shape, gct.shape

((11942, 32), (56318, 8555))

In [14]:
# Keep only annotation rows whose index label is a column of `gct`.
has_expression = col_vars.index.isin(gct.columns)
col_vars = col_vars.loc[has_expression]

In [15]:
# Shapes after restricting `col_vars` to rows whose label appears in gct.columns.
col_vars.shape, gct.shape

((8525, 32), (56318, 8555))

In [16]:
# Keep only expression columns whose label appears in the index of `col_vars`.
has_annotation = gct.columns.isin(col_vars.index)
gct = gct.loc[:, has_annotation]

In [17]:
# Shapes after restricting `gct` to columns whose label appears in col_vars.index.
col_vars.shape, gct.shape

((8525, 32), (56318, 8525))

In [18]:
# Re-select the gene names by the row labels of `gct`, so both are in the same order.
genenames = genenames.loc[gct.index]

In [25]:
# Scratch check: dtype NumPy assigns to an array built from Python ints.
np.array([1,2,3]).dtype

dtype('int64')

In [24]:
# Check whether the expression values are stored as float64.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and no
# longer exists in pandas 1.0 and later.
gct.values.dtype == 'float64'

TypeError: data type "floatd64" not understood

## Store results

In [47]:
# Save the expression values as a plain NumPy array, with rows and columns in
# the order of `gct`.
# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and no
# longer exists in pandas 1.0 and later.
exprs = gct.values
np.save("../data/gtex/exprs.npy", exprs)

**Make sure `target` is in the same order as the columns of `exprs`.**

In [16]:
# The tissue labels are looked up by gct.columns, so they must come back in
# exactly that order and must not contain missing values.
smts_for_columns = col_vars.loc[gct.columns, "SMTS"]
assert np.all(gct.columns == smts_for_columns.index), "order of exprs and target are not identical"
assert smts_for_columns.isnull().sum() == 0, "tissue annotation contains NaN's"

In [17]:
# One tissue label (converted to str) per column of `gct`, in column order,
# written as one label per line.
smts_in_column_order = col_vars.loc[gct.columns, "SMTS"]
target = np.array(list(map(str, smts_in_column_order)))
np.savetxt("../data/gtex/target.csv", target, fmt="%s", delimiter=",")

In [18]:
# Distinct tissue labels that ended up in the target vector.
np.unique(target)

array(['Adipose Tissue', 'Adrenal Gland', 'Blood', 'Blood Vessel', 'Brain',
       'Breast', 'Colon', 'Esophagus', 'Heart', 'Kidney', 'Liver', 'Lung',
       'Muscle', 'Nerve', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate',
       'Salivary Gland', 'Skin', 'Small Intestine', 'Spleen', 'Stomach',
       'Testis', 'Thyroid', 'Uterus', 'Vagina'], 
      dtype='<U15')

In [40]:
# Select three annotation columns for the samples in `gct`, in the order of
# gct.columns, and write them to CSV.
covariate_columns = ["SMTS", "Gender", "RIN"]
covariates = col_vars.loc[gct.columns, covariate_columns]
covariates.to_csv("../data/gtex/covariates.csv")

In [41]:
# Write the gene names (indexed by the row labels of `gct`) to CSV.
# NOTE(review): the default of `header` for Series.to_csv is not the same in
# all pandas versions - confirm what readers of this file expect.
genenames.to_csv("../data/gtex/gene_symbols.csv")