In [1]:
import sys
# Add ../../pygenesig to the module search path so that the pygenesig
# package is imported from there (presumably a local checkout).
sys.path.append("../../pygenesig")

# autoreload mode 1: only the modules registered with %aimport below are
# reloaded before each cell is executed.
%load_ext autoreload
%autoreload 1
%aimport pygenesig
%aimport pygenesig.tools
%matplotlib inline

import numpy as np
import pandas as pd
# NOTE(review): OrderedDict and dd are not referenced in the visible cells --
# confirm they are unused before removing them.
from collections import OrderedDict
import dask.dataframe as dd
from pygenesig.tools import collapse_matrix
# Star import; write_expr (used when storing the tissue matrices) presumably
# comes from here -- TODO confirm and import it explicitly.
from pygenesig.file_formats import * 

# Preprocess FANTOM5 data

In this notebook, we process the FANTOM5 data for use with *pygenesig*, so that we can easily create and validate signatures. 

For the cross-validation we need
* a matrix of gene expression data
* a list of target classes (signatures) 

Here, we extract the relevant information from the fantom5 expression and annotation files

## Load data

In [2]:
# Show which files are available in the data directory.
!ls ../data

annotation_notes.csv	     fantom5-S1.xls
biolayout		     ff-phase2-140729.corr.obo
column_vars.processed.csv    ff-phase2-140729.obo
column_vars.txt		     hg19.cage_peak_phase1and2combined_ann.txt
corr_mat.primary.tsv	     hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt
delimiter_nodes.tsv	     missing_samples.txt
f5_eset_primary_cells.Rdata  process_sample_descriptions.log
f5_expressionset.Rdata	     pygenesig
fantom5_head2000.txt	     tmp


In [3]:
# Per-sample annotation table; the first CSV column becomes the row index.
# (comma is read_csv's default separator, so it is not spelled out)
col_vars = pd.read_csv("../data/column_vars.processed.csv", index_col=0)

In [4]:
# Read the tab-separated FANTOM5 expression table ('#' starts a comment,
# first column is the row index) and drop the first four rows, which hold
# statistics rather than expression values.
gct = pd.read_csv(
    "../data/hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt",
    sep="\t",
    index_col=0,
    comment='#',
).iloc[4:]

## process data

In [5]:
# Lookup table PROMOTERID -> GENEID built from the tab-separated mapping file.
gene_id_map = pd.read_csv("../data/pygenesig/FANTOM5cage_TX_ELEMENTS", sep="\t")
f5_to_geneid = dict(zip(gene_id_map["PROMOTERID"], gene_id_map["GENEID"]))

In [6]:
# Keep only the rows whose short_description has an entry in the
# promoter -> gene id lookup.
gct = gct.loc[gct["short_description"].isin(list(f5_to_geneid))]

In [7]:
# Translate every short_description into its gene id (a missing key raises
# KeyError), then attach the ids to the first six (annotation) columns.
geneids = gct["short_description"].map(f5_to_geneid.__getitem__)
fdata = gct.iloc[:, :6].assign(ora_gene_id=geneids)

In [8]:
# Expression values: every column after the first six annotation columns,
# as a plain numpy array. `.values` replaces DataFrame.as_matrix(), which is
# deprecated since pandas 0.23 and removed in pandas 1.0.
exprs = gct.iloc[:, 6:].values

In [9]:
# Sanity check: rows of fdata/exprs and columns of exprs/rows of col_vars
# should line up.
tuple(table.shape for table in (fdata, exprs, col_vars))

((91036, 7), (91036, 1829), (1829, 9))

### aggregate by gene

In [10]:
# Collapse the promoter-level matrix along axis 0, grouping rows by gene id
# and combining each group with np.sum.
exprs_by_gene_sum = collapse_matrix(
    exprs,
    fdata.ora_gene_id.tolist(),
    axis=0,
    aggregate_fun=np.sum,
)
# The index of the collapsed result carries the gene ids; keep them as the
# feature annotation of the aggregated matrix.
fdata_by_gene_sum = pd.DataFrame().assign(ora_id=exprs_by_gene_sum.index)
# `.values` replaces as_matrix(), which is deprecated since pandas 0.23 and
# removed in pandas 1.0.
exprs_by_gene_sum = exprs_by_gene_sum.values

## store fdata

In [11]:
# Create the two output directories (no error if they already exist).
!mkdir -p ../data/pygenesig/by_promotor ../data/pygenesig/by_gene_sum

In [12]:
# Write the feature annotation of the promoter-level matrix and of the
# gene-level (summed) matrix, each into its own output directory.
fdata.to_csv("../data/pygenesig/by_promotor/fdata.csv")
fdata_by_gene_sum.to_csv("../data/pygenesig/by_gene_sum/fdata.csv")

### annotate oracle ids

In [13]:
# Run the external R annotation script.
# NOTE(review): presumably it reads by_gene_sum/fdata.csv and writes
# by_gene_sum/gene_symbols.csv, which is loaded in the next cell -- confirm
# against the script.
!Rscript 07_annotate_oracle_id.R

[1] "Estabilshin RJDBC connection to bin"
Parsed with column specification:
cols(
  X1 = col_integer(),
  ora_id = col_integer()
)
Missing column names filled in: 'X1' [1] 
[1] "Using estabilshed database connection to bin"
[1] TRUE
[1] TRUE


In [14]:
# Load the gene symbols; the file is read without a header row.
gene_symbols = pd.read_csv("../data/pygenesig/by_gene_sum/gene_symbols.csv", header=None)

In [15]:
# Sanity check: all three objects should have the same number of rows.
tuple(table.shape for table in (exprs_by_gene_sum, fdata_by_gene_sum, gene_symbols))

((20029, 1829), (20029, 1), (20029, 1))

## tissues
Extract data for tissue samples

In [17]:
# Restrict the sample annotation to tissue samples and derive the tissue
# label: the lower-cased part of the sample name before the first comma.
pdata_t = col_vars.loc[col_vars["sample_type"] == "tissue"]
pdata_t = pdata_t.assign(
    tissue=pdata_t["name"].map(lambda sample_name: sample_name.partition(",")[0].lower())
)

In [37]:
# Select the tissue sample columns from both expression matrices.
# NOTE(review): the index labels of pdata_t are used as positional column
# indices, which assumes the col_vars index equals the column position in
# exprs -- confirm.
tissue_columns = pdata_t.index
exprs_t = exprs[:, tissue_columns]
exprs_by_gene_sum_t = exprs_by_gene_sum[:, tissue_columns]

In [38]:
# Number of samples (non-null names) per tissue label.
grouped = pdata_t.groupby("tissue")
tissue_count = grouped["name"].count()
tissue_count

tissue
achilles tendon                                  1
adipose                                          4
adipose tissue                                   1
adrenal gland                                    1
amygdala                                         2
aorta                                            1
appendix                                         1
artery                                           1
bladder                                          1
blood                                            1
bone marrow                                      1
brain                                            3
breast                                           1
caudate nucleus                                  3
cerebellum                                       2
cerebellum - adult                               1
cerebral meninges                                1
cerebrospinal fluid                              1
cervix                                           1
clontech human universal

In [39]:
# Target vector for the tissue samples as a numpy array. `.values` replaces
# Series.as_matrix(), which is deprecated since pandas 0.23 and removed in
# pandas 1.0.
target_t = pdata_t.tissue.values

In [40]:
# Sanity check: the column count of both matrices should equal the length
# of the target vector.
tuple(arr.shape for arr in (exprs_t, exprs_by_gene_sum_t, target_t))

((91036, 174), (20029, 174), (174,))

### store results

In [41]:
# Store the tissue expression matrices (promoter level and gene level).
# write_expr is not defined in this notebook; presumably it is provided by
# the star import from pygenesig.file_formats -- TODO confirm.
write_expr(exprs_t, "../data/pygenesig/by_promotor/tissue_exprs.npy")
write_expr(exprs_by_gene_sum_t, "../data/pygenesig/by_gene_sum/tissue_exprs.npy")

In [42]:
# Write the tissue labels as text, one label per line (string format).
np.savetxt("../data/pygenesig/tissue_target.csv", target_t, delimiter=",", fmt="%s")

## primary cells
Extract data for primary cell samples. 

We distinguish between *fine* and *coarse* cell types. We take the pragmatic approach of relying on the sample name as annotation. 
The sample name up to the first comma is the *fine* label; splitting that label at the dash, the part before the dash is the *coarse* label. 

For the *coarse* dataset, we discard all cell types that do not have at least 2 samples. 

In [43]:
# Restrict the sample annotation to primary cell samples.
pdata_p = col_vars.loc[col_vars["sample_type"] == "primary cell"]

In [44]:
# type_fine: lower-cased sample name up to the first comma.
pdata_p = pdata_p.assign(
    type_fine=pdata_p["name"].map(lambda sample_name: sample_name.partition(",")[0].lower())
)
# type_coarse: the part of type_fine before the first dash, stripped of
# surrounding whitespace.
pdata_p = pdata_p.assign(
    type_coarse=pdata_p["type_fine"].map(lambda fine: fine.partition("-")[0].strip().lower())
)

In [45]:
# Number of samples (non-null names) per fine cell type.
grouped = pdata_p.groupby("type_fine")
tissue_count = grouped["name"].count()
tissue_count

type_fine
adipocyte - breast                                                      2
adipocyte - omental                                                     3
adipocyte - perirenal                                                   1
adipocyte - subcutaneous                                                3
alveolar epithelial cells                                               2
amniotic epithelial cells                                               3
amniotic membrane cells                                                 3
anulus pulposus cell                                                    2
aortic smooth muscle cell response to fgf2                              2
aortic smooth muscle cell response to il1b                              2
astrocyte - cerebellum                                                  3
astrocyte - cerebral cortex                                             3
basophils                                                               3
bronchial epithelial cell   

In [46]:
# Number of samples (non-null names) per coarse cell type. The counts are
# kept in tissue_count and printed without row truncation.
grouped = pdata_p.groupby("type_coarse")
tissue_count = grouped["name"].count()
with pd.option_context('display.max_rows', None):
    print(tissue_count)

type_coarse
adipocyte                                                   9
alveolar epithelial cells                                   2
amniotic epithelial cells                                   3
amniotic membrane cells                                     3
anulus pulposus cell                                        2
aortic smooth muscle cell response to fgf2                  2
aortic smooth muscle cell response to il1b                  2
astrocyte                                                   6
basophils                                                   3
bronchial epithelial cell                                   7
cardiac myocyte                                             3
cd133                                                       2
cd14                                                       42
cd14+ monocyte derived endothelial progenitor cells         3
cd19                                                       11
cd34                                                      

Choose only cell types that have at least 2 samples (otherwise we can't do cross-validation and are even more likely to learn batch effects). 

In [47]:
# Keep only samples whose coarse cell type occurs at least twice.
# tissue_count must hold the per-type_coarse counts at this point.
multi_sample_tissues = tissue_count[tissue_count >= 2].index.tolist()
pdata_p = pdata_p.loc[pdata_p["type_coarse"].isin(multi_sample_tissues)]

In [48]:
# Select the primary cell sample columns from both expression matrices.
# NOTE(review): the index labels of pdata_p are used as positional column
# indices, which assumes the col_vars index equals the column position in
# exprs -- confirm.
primary_columns = pdata_p.index
exprs_p = exprs[:, primary_columns]
exprs_by_gene_sum_p = exprs_by_gene_sum[:, primary_columns]

In [49]:
# Target vector (coarse cell type) for the primary cell samples as a numpy
# array. `.values` replaces Series.as_matrix(), which is deprecated since
# pandas 0.23 and removed in pandas 1.0.
target_p = pdata_p.type_coarse.values

In [50]:
# Sanity check: the column count of both matrices should equal the length
# of the target vector.
tuple(arr.shape for arr in (exprs_p, exprs_by_gene_sum_p, target_p))

((91036, 774), (20029, 774), (774,))

### store data

In [51]:
# Store the primary cell expression matrices as .npy files.
# NOTE(review): the tissue matrices are written with write_expr instead of
# np.save -- confirm both produce the same format and consider using one
# of them consistently.
np.save("../data/pygenesig/by_promotor/primary_exprs.npy", exprs_p)
np.save("../data/pygenesig/by_gene_sum/primary_exprs.npy", exprs_by_gene_sum_p)

In [52]:
# Write the fine and the coarse labels of the retained primary cell samples
# as text, one label per line (string format).
np.savetxt("../data/pygenesig/primary_target_fine.csv", pdata_p.type_fine, delimiter=",", fmt="%s")
np.savetxt("../data/pygenesig/primary_target_coarse.csv", pdata_p.type_coarse, delimiter=",", fmt="%s")

In [53]:
# Store the annotation table of the retained primary cell samples,
# including the derived type_fine and type_coarse columns.
pdata_p.to_csv("../data/pygenesig/primary_col_vars.csv")