In [1]:
import sys
sys.path.append("../../pygenesig")

%load_ext autoreload
%autoreload 1
%aimport pygenesig
%aimport pygenesig.tools
%matplotlib inline

import numpy as np
import pandas as pd
from collections import OrderedDict
import dask.dataframe as dd
from pygenesig.tools import collapse_matrix

# Preprocess FANTOM5 data

In this notebook, we process the fantom5 data for the use with *pygenesig* so that we can easily create and validate signatures. 

For the crossvalidation we need a
* matrix of gene expression data
* list of target classes (signatures) 

Here, we extract the relevant information from the fantom5 expression and annotation files

In [2]:
!ls ../data

annotation_notes.csv	     fantom5-S1.xls
biolayout		     ff-phase2-140729.corr.obo
column_vars.processed.csv    ff-phase2-140729.obo
column_vars.txt		     hg19.cage_peak_phase1and2combined_ann.txt
corr_mat.primary.tsv	     hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt
delimiter_nodes.tsv	     missing_samples.txt
f5_eset_primary_cells.Rdata  process_sample_descriptions.log
f5_expressionset.Rdata	     pygenesig
fantom5_head2000.txt	     tmp


In [3]:
col_vars = pd.read_csv("../data/column_vars.processed.csv", sep=",", index_col=0)

In [4]:
gct = pd.read_csv("../data/hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt", sep="\t", 
                  index_col=0, comment='#')
gct = gct.iloc[4:, ] # remove stats

In [5]:
gene_id_map = pd.read_csv("../data/pygenesig/FANTOM5cage_TX_ELEMENTS", sep="\t")
f5_to_geneid = dict(zip(gene_id_map.PROMOTERID, gene_id_map.GENEID))

In [6]:
gct = gct[gct.short_description.isin(f5_to_geneid.keys())] # gene symbols only 

In [7]:
geneids = gct.short_description.apply(lambda x: f5_to_geneid[x])
fdata = gct.iloc[:,:6].assign(ora_gene_id=geneids)

## tissues
Extract data for tissue samples

In [8]:
col_vars_t = col_vars[col_vars.sample_type == "tissue"]
gct_t = gct[col_vars_t.index]

In [9]:
col_vars_t["tissue"] = col_vars_t.name.apply(lambda x: x.split(",")[0].lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [10]:
grouped = col_vars_t.groupby("tissue")
tissue_count = grouped.count().name
tissue_count

tissue
achilles tendon                                  1
adipose                                          4
adipose tissue                                   1
adrenal gland                                    1
amygdala                                         2
aorta                                            1
appendix                                         1
artery                                           1
bladder                                          1
blood                                            1
bone marrow                                      1
brain                                            3
breast                                           1
caudate nucleus                                  3
cerebellum                                       2
cerebellum - adult                               1
cerebral meninges                                1
cerebrospinal fluid                              1
cervix                                           1
clontech human universal

### store results

In [11]:
exprs = gct_t.as_matrix()
np.save("../data/pygenesig/by_promotor/tissue_exprs.npy", exprs)

In [12]:
target = col_vars_t.tissue
np.savetxt("../data/pygenesig/tissue_target.csv", target, delimiter=",", fmt="%s")

## primary cells
Extract data for primary cell samples. 

We distinguish between *fine* and *coarse*. We take the pragmatic approach to rely on the sample name as annotation. 
We split the sample name at the dash. The part before the dash is 'coarse' the part behind 'fine'. 

For the *coarse* dataset, we discard all cell types that do not have at least 2 samples. 

In [13]:
col_vars_p = col_vars[col_vars.sample_type == "primary cell"]
gct_p = gct[col_vars_p.index]

In [14]:
col_vars_p["type_fine"] = col_vars_p.name.apply(lambda x: x.split(",")[0].lower())
col_vars_p["type_coarse"] = col_vars_p.type_fine.apply(lambda x: x.split("-")[0].strip().lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [15]:
grouped = col_vars_p.groupby("type_fine")
tissue_count = grouped.count().name
tissue_count

type_fine
adipocyte - breast                                                      2
adipocyte - omental                                                     3
adipocyte - perirenal                                                   1
adipocyte - subcutaneous                                                3
alveolar epithelial cells                                               2
amniotic epithelial cells                                               3
amniotic membrane cells                                                 3
anulus pulposus cell                                                    2
aortic smooth muscle cell response to fgf2                              2
aortic smooth muscle cell response to il1b                              2
astrocyte - cerebellum                                                  3
astrocyte - cerebral cortex                                             3
basophils                                                               3
bronchial epithelial cell   

In [16]:
grouped = col_vars_p.groupby("type_coarse")
with pd.option_context('display.max_rows', None):
    tissue_count = grouped.count().name
    print(tissue_count)

type_coarse
adipocyte                                                   9
alveolar epithelial cells                                   2
amniotic epithelial cells                                   3
amniotic membrane cells                                     3
anulus pulposus cell                                        2
aortic smooth muscle cell response to fgf2                  2
aortic smooth muscle cell response to il1b                  2
astrocyte                                                   6
basophils                                                   3
bronchial epithelial cell                                   7
cardiac myocyte                                             3
cd133                                                       2
cd14                                                       42
cd14+ monocyte derived endothelial progenitor cells         3
cd19                                                       11
cd34                                                      

Choos only cells that have at least 2 samples (otherwise we can't do crossvalidation and are even more likely to learn batch effects) 

In [17]:
multi_sample_tissues = list(tissue_count.index[tissue_count >= 2])

In [18]:
col_vars_p = col_vars_p[col_vars_p.type_coarse.isin(multi_sample_tissues)]
gct_p = gct[col_vars_p.index]

In [19]:
geneids = geneids[gct_p.index]

In [31]:
exprs = gct_p.as_matrix()

### Aggregate by gene

In [32]:
exprs_by_gene_sum = collapse_matrix(exprs, fdata.ora_gene_id.tolist(), axis=0, aggregate_fun=np.sum)
fdata_by_gene_sum = pd.DataFrame().assign(ora_id=exprs_by_gene_sum.index)
exprs_by_gene_sum = exprs_by_gene_sum.as_matrix()

### store data

In [33]:
exprs_by_gene_sum.shape

(20029, 774)

In [34]:
np.save("../data/pygenesig/by_promotor/primary_exprs.npy", exprs)
np.save("../data/pygenesig/by_gene_sum/primary_exprs.npy", exprs_by_gene_sum)

In [35]:
np.savetxt("../data/pygenesig/primary_target_fine.csv", col_vars_p.type_fine, delimiter=",", fmt="%s")
np.savetxt("../data/pygenesig/primary_target_coarse.csv", col_vars_p.type_coarse, delimiter=",", fmt="%s")

In [36]:
col_vars_p.to_csv("../data/pygenesig/primary_col_vars.csv")

In [37]:
fdata.to_csv("../data/pygenesig/by_promotor/fdata.csv")
fdata_by_gene_sum.to_csv("../data/pygenesig/by_gene_sum/fdata.csv")

In [25]:
fdata

Unnamed: 0_level_0,short_description,description,association_with_transcript,entrezgene_id,hgnc_id,uniprot_id,ora_gene_id
00Annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"chr10:100174900..100174956,-",p1@PYROXD2,CAGE_peak_1_at_PYROXD2_5end,"0bp_to_ENST00000370575,ENST00000462874_5end",entrezgene:84795,HGNC:23517,uniprot:Q8N2H3,84795
"chr10:100174957..100174982,-",p2@PYROXD2,CAGE_peak_2_at_PYROXD2_5end,"0bp_to_NM_032709,uc001kpc.2,uc001kpd.2,uc010qp...",entrezgene:84795,HGNC:23517,uniprot:Q8N2H3,84795
"chr10:100206642..100206717,-",p1@HPS1,CAGE_peak_1_at_HPS1_5end,"0bp_to_ENST00000325103,ENST00000338546,ENST000...",entrezgene:3257,HGNC:5163,"uniprot:Q92902,uniprot:Q658M9,uniprot:Q8WXE5",3257
"chr10:100995440..100995474,-",p1@HPSE2,CAGE_peak_1_at_HPSE2_5end,84bp_to_AJ299720_5end,entrezgene:60495,HGNC:18374,uniprot:Q8WWQ2,60495
"chr10:100995540..100995551,-",p3@HPSE2,CAGE_peak_3_at_HPSE2_5end,"67bp_to_uc001kpo.1,uc009xwc.1,uc009xwd.1_5end",entrezgene:60495,HGNC:18374,uniprot:Q8WWQ2,60495
"chr10:100995586..100995597,-",p4@HPSE2,CAGE_peak_4_at_HPSE2_5end,"21bp_to_ENST00000370549,ENST00000370552,uc001k...",entrezgene:60495,HGNC:18374,uniprot:Q8WWQ2,60495
"chr10:100995603..100995614,-",p2@HPSE2,CAGE_peak_2_at_HPSE2_5end,"4bp_to_ENST00000370549,ENST00000370552,uc001kp...",entrezgene:60495,HGNC:18374,uniprot:Q8WWQ2,60495
"chr10:100995626..100995635,-",p5@HPSE2,CAGE_peak_5_at_HPSE2_5end,"0bp_to_NM_001166244,NM_001166245,NM_001166246,...",entrezgene:60495,HGNC:18374,"uniprot:Q8WWQ2,uniprot:Q2M1H9",60495
"chr10:101088836..101088861,+",p2@CNNM1,CAGE_peak_2_at_CNNM1_5end,"0bp_to_ENST00000356713,NM_020348,uc001kpp.3,uc...",entrezgene:26507,HGNC:102,uniprot:Q9NRU3,26507
"chr10:101089107..101089141,+",p1@CNNM1,CAGE_peak_1_at_CNNM1_5end,"-3bp_to_ENST00000370528,ENST00000446890_5end",entrezgene:26507,HGNC:102,uniprot:Q9NRU3,26507
