In [1]:
import sys
sys.path.append("..")

%load_ext autoreload
%autoreload 1
%aimport pygenesig
%matplotlib inline

import numpy as np
import pandas as pd
from collections import OrderedDict
import dask.dataframe as dd


# Preprocess FANTOM5 data

For the crossvalidation we need a
* matrix of gene expression data
* list of target classes (signatures) 

Here, we extract the relevant information from the gtex `.gct` and annotation files. 

In [2]:
!ls ../../f5-cell-signatures/data/

annotation_notes.csv	     fantom5-S1.xls
biolayout		     ff-phase2-140729.corr.obo
column_vars.processed.csv    ff-phase2-140729.obo
column_vars.txt		     hg19.cage_peak_phase1and2combined_ann.txt
corr_mat.primary.tsv	     hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt
delimiter_nodes.tsv	     missing_samples.txt
f5_eset_primary_cells.Rdata  process_sample_descriptions.log
f5_expressionset.Rdata	     tmp
fantom5_head2000.txt


In [3]:
col_vars = pd.read_csv("../../f5-cell-signatures/data/column_vars.processed.csv", sep=",", index_col=0)

In [6]:
gct = pd.read_csv("../../f5-cell-signatures/data/hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt", sep="\t", 
                  index_col=0, comment='#')
gct = gct.iloc[4:, ] # remove stats

In [7]:
gene_id_map = pd.read_csv("../data/f5/FANTOM5cage_TX_ELEMENTS", sep="\t")
f5_to_geneid = dict(zip(gene_id_map.PROMOTERID, gene_id_map.GENEID))

In [8]:
gct = gct[gct.short_description.isin(f5_to_geneid.keys())] # gene symbols only 

In [9]:
geneids = gct.short_description.apply(lambda x: f5_to_geneid[x])

## tissues

In [10]:
col_vars_t = col_vars[col_vars.sample_type == "tissue"]
gct_t = gct[col_vars_t.index]

In [11]:
col_vars_t["tissue"] = col_vars_t.name.apply(lambda x: x.split(",")[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [12]:
grouped = col_vars_t.groupby("tissue")
tissue_count = grouped.count().name
tissue_count

tissue
Clontech Human Universal Reference Total RNA         1
Fingernail (including nail plate                     1
SABiosciences XpressRef Human Universal Total RNA    1
Skin - palm                                          1
Smooth Muscle Cells - Bronchial                      1
Universal RNA - Human Normal Tissues Biochain        1
Urethra                                              1
achilles tendon                                      1
adipose                                              4
adipose tissue                                       1
adrenal gland                                        1
amygdala                                             2
aorta                                                1
appendix                                             1
artery                                               1
bladder                                              1
blood                                                1
bone marrow                                          1
bra

### store results

In [14]:
exprs = gct_t.as_matrix()
np.save("../data/f5/tissue_exprs.npy", exprs)

In [15]:
target = col_vars_t.tissue
np.savetxt("../data/f5/tissue_target.csv", target, delimiter=",", fmt="%s")

In [16]:
geneids = geneids[gct_t.index]
geneids.to_csv("../data/f5/tissue_gene_ids.csv")

## primary cells

In [17]:
col_vars_p = col_vars[col_vars.sample_type == "primary cell"]
gct_p = gct[col_vars_p.index]

In [18]:
col_vars_p["type_fine"] = col_vars_p.name.apply(lambda x: x.split(",")[0])
col_vars_p["type_coarse"] = col_vars_p.type_fine.apply(lambda x: x.split("-")[0].strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [19]:
grouped = col_vars_p.groupby("type_fine")
tissue_count = grouped.count().name
tissue_count

type_fine
Adipocyte - breast                                                    2
Adipocyte - omental                                                   3
Adipocyte - perirenal                                                 1
Adipocyte - subcutaneous                                              3
Alveolar Epithelial Cells                                             2
Amniotic Epithelial Cells                                             3
Anulus Pulposus Cell                                                  2
Aortic smooth muscle cell response to FGF2                            2
Aortic smooth muscle cell response to IL1b                            2
Astrocyte - cerebellum                                                3
Astrocyte - cerebral cortex                                           3
Basophils                                                             3
Bronchial Epithelial Cell                                             7
CD133-positive stem cells - adult bone marrow derived 

In [20]:
grouped = col_vars_p.groupby("type_coarse")
tissue_count = grouped.count().name
tissue_count

type_coarse
Adipocyte                                               9
Alveolar Epithelial Cells                               2
Amniotic Epithelial Cells                               3
Anulus Pulposus Cell                                    2
Aortic smooth muscle cell response to FGF2              2
Aortic smooth muscle cell response to IL1b              2
Astrocyte                                               6
Basophils                                               3
Bronchial Epithelial Cell                               7
CD133                                                   2
CD14                                                   42
CD14+ monocyte derived endothelial progenitor cells     3
CD19                                                   11
CD34                                                    4
CD4                                                     3
CD4+CD25                                               10
CD4+CD25+CD45RA                                         6
CD

In [21]:
geneids = geneids[gct_p.index]

### store data

In [22]:
exprs = gct_p.as_matrix()
np.save("../data/f5/primary_exprs.npy", exprs)

In [23]:
np.savetxt("../data/f5/primary_target_fine.csv", col_vars_p.type_fine, delimiter=",", fmt="%s")
np.savetxt("../data/f5/primary_target_coarse.csv", col_vars_p.type_coarse, delimiter=",", fmt="%s")

In [24]:
geneids = geneids[gct_p.index]
geneids.to_csv("../data/f5/primary_gene_ids.csv")