In [1]:
import sys

%load_ext autoreload
%autoreload 1
%aimport pygenesig
%matplotlib inline

import numpy as np
import pandas as pd
from collections import OrderedDict
import dask.dataframe as dd
from pygenesig.tools import collapse_matrix


# Preprocess GTEx data
For the cross-validation we need:
* a matrix of gene expression data
* a list of target classes (signatures)

Here, we extract the relevant information from the GTEx `.gct` and annotation files.

In [2]:
# Shell escape: list the contents of the data directory used by the cells below.
!ls ../data

exp.tissuemark.gtex.roche.symbols.gmt
GTEx_Analysis_v6p_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct.gz
GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct
GTEx_Data_V6_Annotations_SampleAttributesDS.txt
GTEx-UDISDataSetID5681-sampleAnnotation.txt
mouseGNF
v6_UDIS.txt


In [3]:
# Load the HGNC gene symbols; they are used further down to filter the
# collapsed expression matrix.
# Only the `symbol` column is needed, so parsing is restricted to it. This
# keeps memory low and skips type inference on the other columns
# (NOTE(review): presumably the source of the warning this cell emitted).
hgnc_symbols = pd.read_csv(
    "../../pygenesig/data/baseline/hgnc_complete_set.txt",
    sep="\t",
    usecols=["symbol"],
).symbol.tolist()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Sample annotation table (tab-separated); the first file column becomes the index.
pdata_path = "../data/GTEx_Data_V6_Annotations_SampleAttributesDS.txt"
pdata = pd.read_csv(pdata_path, index_col=0, sep="\t")

In [5]:
# Expression table from the GCT file: the first two lines are skipped
# (skiprows=2) and the first column is used as the row index.
gct_path = "../data/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct"
gct = pd.read_csv(gct_path, index_col=0, skiprows=2, sep="\t")

In [6]:
# Restrict (and reorder) the annotation rows to the columns of the expression
# table. The first column of `gct` is skipped (presumably `Description`,
# which is not a sample -- confirm against the file layout).
sample_ids = gct.columns[1:]
pdata = pdata.loc[sample_ids]

In [8]:
# Keep only the part of each `Description` entry before the first "."
# NOTE(review): this also truncates names that legitimately contain a dot;
# confirm that this is intended.
gene_symbols = gct.Description.map(lambda description: description.partition(".")[0])

In [9]:
# Number of samples per `SMTSD` value (tissue label).
# `size()` counts the rows in each group regardless of missing values.
# `count()` on one positionally chosen annotation column would skip NaN
# entries and under-report a tissue whenever that column is incomplete.
grouped = pdata.groupby("SMTSD")
tissue_count = grouped.size()
tissue_count

SMTSD
Adipose - Subcutaneous                       350
Adipose - Visceral (Omentum)                 227
Adrenal Gland                                145
Artery - Aorta                               224
Artery - Coronary                            133
Artery - Tibial                              332
Bladder                                       11
Brain - Amygdala                              72
Brain - Anterior cingulate cortex (BA24)      84
Brain - Caudate (basal ganglia)              117
Brain - Cerebellar Hemisphere                105
Brain - Cerebellum                           125
Brain - Cortex                               114
Brain - Frontal Cortex (BA9)                 108
Brain - Hippocampus                           94
Brain - Hypothalamus                          96
Brain - Nucleus accumbens (basal ganglia)    113
Brain - Putamen (basal ganglia)               97
Brain - Spinal cord (cervical c-1)            71
Brain - Substantia nigra                      63
Breast - Mamma

In [10]:
# Keep only the samples whose tissue label has a count strictly above 30.
count_threshold = 30
frequent_tissues = tissue_count[tissue_count > count_threshold]
in_frequent_tissue = pdata.SMTSD.isin(frequent_tissues.index)
pdata = pdata.loc[in_frequent_tissue]

In [12]:
# Drop the first column of `gct`, then select the columns of the retained
# samples in the order given by `pdata.index`.
sample_columns = gct.iloc[:, 1:]
exprs = sample_columns.loc[:, pdata.index]

In [13]:
# Shapes of the annotation table, the raw table and the selected expression matrix.
tuple(frame.shape for frame in (pdata, gct, exprs))

((8527, 63), (56318, 8556), (56318, 8527))

In [16]:
# One symbol per row of `exprs`, in row order; `collapse_matrix` receives them
# together with `np.sum` as the aggregation function along axis 0
# (presumably merging rows that share a symbol -- see pygenesig.tools).
row_symbols = gene_symbols[exprs.index].tolist()
exprs_collapsed = collapse_matrix(
    exprs,
    row_symbols,
    axis=0,
    aggregate_fun=np.sum,
)

In [17]:
# Shape of the matrix returned by `collapse_matrix`.
exprs_collapsed.shape

(46169, 8527)

In [18]:
# Keep only the rows whose index label appears in the HGNC symbol list.
is_hgnc_symbol = exprs_collapsed.index.isin(hgnc_symbols)
exprs_collapsed = exprs_collapsed.loc[is_hgnc_symbol]

In [19]:
# Shape after restricting the rows to HGNC symbols.
exprs_collapsed.shape

(31819, 8527)

## Store results

In [20]:
# Persist both expression matrices as plain NumPy arrays.
# `.values` is used instead of `DataFrame.as_matrix()`, which is deprecated and
# no longer available in recent pandas releases; `.values` works on old and
# new versions alike.
np.save("../results/exprs.npy", exprs.values)
np.save("../results/exprs_by_gene.npy", exprs_collapsed.values)

In [23]:
# Row labels of both matrices as Series, so they can be written next to the
# `.npy` files (which carry no labels).
fdata, fdata_collapsed = (pd.Series(frame.index) for frame in (exprs, exprs_collapsed))

In [24]:
# Write the row labels of each matrix to a tab-separated file.
feature_outputs = (
    (fdata, "../results/fdata.tsv"),
    (fdata_collapsed, "../results/fdata_by_gene.tsv"),
)
for feature_series, out_path in feature_outputs:
    feature_series.to_csv(out_path, sep="\t")

**Make sure `target` is in the same order as the columns of `exprs`.**

In [30]:
# Sanity checks before exporting the labels: the matrix columns must line up
# with the annotation rows, and every sample needs a tissue label.
columns_match = (exprs_collapsed.columns == pdata.SMTSD.index).all()
assert columns_match, "order of exprs and target are not identical"
assert not pdata.SMTSD.isnull().any(), "tissue annotation contains NaN's"

In [31]:
# Tissue label of every sample as a string array, written one label per line.
target = np.array(list(map(str, pdata.SMTSD)))
np.savetxt("../results/target.csv", target, fmt="%s", delimiter=",")

In [32]:
# Distinct tissue labels that remain after filtering (sorted by `np.unique`).
np.unique(target)

array(['Adipose - Subcutaneous', 'Adipose - Visceral (Omentum)',
       'Adrenal Gland', 'Artery - Aorta', 'Artery - Coronary',
       'Artery - Tibial', 'Brain - Amygdala',
       'Brain - Anterior cingulate cortex (BA24)',
       'Brain - Caudate (basal ganglia)', 'Brain - Cerebellar Hemisphere',
       'Brain - Cerebellum', 'Brain - Cortex',
       'Brain - Frontal Cortex (BA9)', 'Brain - Hippocampus',
       'Brain - Hypothalamus', 'Brain - Nucleus accumbens (basal ganglia)',
       'Brain - Putamen (basal ganglia)',
       'Brain - Spinal cord (cervical c-1)', 'Brain - Substantia nigra',
       'Breast - Mammary Tissue', 'Cells - EBV-transformed lymphocytes',
       'Cells - Transformed fibroblasts', 'Colon - Sigmoid',
       'Colon - Transverse', 'Esophagus - Gastroesophageal Junction',
       'Esophagus - Mucosa', 'Esophagus - Muscularis',
       'Heart - Atrial Appendage', 'Heart - Left Ventricle',
       'Kidney - Cortex', 'Liver', 'Lung', 'Minor Salivary Gland',
       'Muscl