In [9]:
import sys
sys.path.append("..")

%load_ext autoreload
%autoreload 1
%aimport pyfantom.parse_ontology

import pandas as pd
from itertools import repeat
import re
from orangecontrib.bio.ontology import OBOOntology
import logging

from pyfantom.parse_ontology import *
pd.set_option('display.max_colwidth', -1)

In [1]:
def enable_logging():
    """Send all root-logger records to a fresh log file; keep the console at WARNING.

    The log file ``../data/process_sample_descriptions.log`` is truncated on
    every call, so it only ever contains the records of the latest run.
    Calling the function repeatedly (e.g. when re-running the cell) does not
    stack up additional file handlers.
    """
    import os  # local import: only needed to normalise the log path

    log_path = '../data/process_sample_descriptions.log'
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Remove file handlers a previous call attached for the same file;
    # otherwise every record would be written once per call.
    target = os.path.abspath(log_path)
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == target:
            logger.removeHandler(handler)
            handler.close()

    # mode='w' starts with an empty file, which replaces the former shell `rm`
    # (and does not complain when the file does not exist yet).
    fh = logging.FileHandler(log_path, mode='w')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)

    if logger.handlers:
        logger.handlers[0].setLevel(logging.WARNING) # adjust stream handler
    else:
        # No handler installed yet: add a console handler so that warnings
        # still show up next to the cell instead of only in the file.
        console = logging.StreamHandler()
        console.setLevel(logging.WARNING)
        logger.addHandler(console)
    logger.addHandler(fh)

### Read column headers from the FANTOM5 data
Read the column headers and extract the sample information from them. 

In [2]:
!ls ../data

annotation_notes.csv	     fantom5-S1.xls
biolayout		     ff-phase2-140729.corr.2.obo
column_vars.processed.csv    ff-phase2-140729.corr.3.obo
column_vars.txt		     ff-phase2-140729.obo
corr_mat.primary.tsv	     hg19.cage_peak_phase1and2combined_ann.txt
delimiter_nodes.2.csv	     hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt
delimiter_nodes.csv	     missing_samples.txt
f5_eset_primary_cells.Rdata  process_sample_descriptions.log
f5_expressionset.Rdata	     singletons.tsv
fantom5_head2000.txt


In [3]:
!grep "^##ColumnVariables" ../data/hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt | cut -d"=" -f2 | head

CAGE peak id
short form of the description below. Common descriptions in the long descriptions has been omited
description of the CAGE peak
transcript which 5end is the nearest to the the CAGE peak
entrezgene (genes) id associated with the transcript
hgnc (gene symbol) id associated with the transcript
uniprot (protein) id associated with the transcript
tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4
tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5
tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6
cut: write error: Broken pipe


In [4]:
!grep "^##ColumnVariables" ../data/hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt | cut -d"=" -f2 | tail -n+8 | head

tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4
tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5
tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6
tpm of 293SLAM rinderpest infection, 06hr, biol_rep1.CNhs14410.13544-145H7
tpm of 293SLAM rinderpest infection, 06hr, biol_rep2.CNhs14411.13545-145H8
tpm of 293SLAM rinderpest infection, 06hr, biol_rep3.CNhs14412.13546-145H9
tpm of 293SLAM rinderpest infection, 12hr, biol_rep1.CNhs14413.13547-145I1
tpm of 293SLAM rinderpest infection, 12hr, biol_rep2.CNhs14414.13548-145I2
tpm of 293SLAM rinderpest infection, 12hr, biol_rep3.CNhs14415.13549-145I3
tpm of 293SLAM rinderpest infection, 24hr, biol_rep1.CNhs14416.13550-145I4
tail: write error: Broken pipe
tail: write error
cut: write error: Broken pipe


In [5]:
# Extract the per-sample column descriptions into column_vars.txt:
#   grep  - keep only the "##ColumnVariables" header lines
#   cut   - keep the second "="-separated field (the description text)
#   tail  - start at line 8, i.e. skip the seven leading annotation columns
#           (CAGE peak id ... uniprot id) listed in the output further up
!grep "^##ColumnVariables" ../data/hg19.cage_peak_phase1and2combined_tpm_ann.osc.txt | cut -d"=" -f2 | tail -n+8 > ../data/column_vars.txt

In [6]:
sample_infos = !cat ../data/column_vars.txt

In [7]:
sample_infos[:5]

['tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4',
 'tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5',
 'tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6',
 'tpm of 293SLAM rinderpest infection, 06hr, biol_rep1.CNhs14410.13544-145H7',
 'tpm of 293SLAM rinderpest infection, 06hr, biol_rep2.CNhs14411.13545-145H8']

### Retrieving information from the ontology

The column headers are difficult to parse (inconsistent commas, etc.). 
We found an ontology on the FANTOM5 web page. [1]

First, we check whether all the IDs from the column headers appear in the ontology. 

[1] http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Ontology/ff-phase2-140729.obo.txt

In [10]:
# Captures the two halves of the sample id that follows the library id, e.g.
# "...CNhs14406.13541-145H4" -> ("13541", "145H4").
# The dot is escaped so that only a literal "." may separate the two ids.
OBO_ID_REGEX = re.compile(r'CNhs\d+\.(\w+)-(\w+)')

In [11]:
# Look the ids up in the same ontology file that is loaded with OBOOntology
# below. The file is read once instead of spawning one `grep` per sample, and
# a missing file now fails loudly instead of being reported by the shell.
with open("../data/ff-phase2-140729.obo", encoding="utf-8", errors="replace") as obo_file:
    obo_text = obo_file.read()

missing_ids = []
for info_line in sample_infos:
    ff_id = "-".join(OBO_ID_REGEX.search(info_line).groups())
    if ff_id not in obo_text:
        missing_ids.append(ff_id)

# Name the offending ids so a failure is actionable.
assert not missing_ids, "ids missing from the ontology: {}".format(missing_ids)

that seems to be the case...

#### Try out the Orange Bioinformatics Python library for parsing and manipulating ontologies

In [12]:
obo = OBOOntology()
# Use a context manager so the file handle is closed once the ontology
# has been parsed.
with open("../data/ff-phase2-140729.obo") as obo_file:
    obo.load(obo_file)

In [13]:
print(obo.term("FF:1394-42H2").tags())

[('id', 'FF:1394-42H2', None, None), ('name', 'lung, neonate N30, rep1', None, None), ('namespace', 'FANTOM5', None, None), ('subset', 'phase1', None, None), ('subset', 'phase2', None, None), ('subset', 'update022', None, None), ('is_a', 'EFO:0002091', None, 'biological replicate'), ('is_a', 'FF:0011489', None, 'mouse lung- neonate N30 sample')]


In [14]:
obo.term("FF:1394-42H2").name

'lung, neonate N30, rep1'

#### All 'samples' are annotated as some sort of 'sample' in the ontology: 

In [15]:
# "FF:0000001" is the most general sample id in the ontology.
sample = "FF:0000001"
# Spot check on the first two column headers only: the generic sample term
# has to be among the super terms of each of them.
for info_line in sample_infos[:2]:
    ff_id = "FF:{}-{}".format(*OBO_ID_REGEX.search(info_line).groups())
    ids = [term.id for term in obo.super_terms(ff_id)]
    assert sample in ids

## Preliminary checks positive $\to$ Let's get started!
Using the ontology, we at least don't run into massive comma-parsing trouble again. 

I wrote the Python module `parse_ontology.py`. It takes care of the following:
* It makes sure that there is no inconsistent information between the ontology and the sample name (aka `sample_infos`).
* If information is missing in the ontology, it complements it with information from the sample name.
* In such a case, it writes an entry to `annot_notes`, so that we can improve the ontology later on. 

#### Replicates
* There are technical and biological replicates. 
* Technical replicates have the same obo_id (`FF:?????-?????`), but different library ids (`CNhs??????`). 
* The library id is unique:

* We can identify biological replicates from the sample name with the keywords `biol_rep`, `rep` and `donor`. Having different donors is also a way of having biological replicates. 

As this would be redundant information, we would expect no sample to have both 'biol_rep' and 'donor' in the sample name: 

In [17]:
# Sample names that carry both a donor and a biol_rep annotation (expected: none).
[x for x in sample_infos
 if not (get_donor(x) is None or get_biol_replicate(x) is None)]

[]

That seems to be the case. 

### Do the parsing and store the processed information as `csv` files. 

In [18]:
enable_logging()

There is additional information stored in the supplementary information table from the FANTOM5 Paper. 
We retrieve the 'sample_type' information additionally from this data source: 

In [19]:
# The sheet (index 1, i.e. the second sheet) is passed positionally: the
# keyword was renamed from `sheetname` to `sheet_name` in newer pandas
# releases, whereas the positional form is accepted by both.
# The table is indexed by library id so that samples can be looked up via .loc.
si_table = pd.read_excel("../data/fantom5-S1.xls", 1).set_index("Library_id")
pd.set_option('display.max_colwidth', 40)
si_table.columns

Index(['Sample type', 'species', 'description', 'supplier', 'sample id',
       'Catalog number', 'external URL', 'lot number', 'donor(cell lot)',
       'sex', 'age', 'RIN', 'Q20 mapped tags', 'fraction under robust DPI',
       'Number of peaks called', 'Number of 5' EST/cDNA supported peaks',
       'Fraction peaks corresponding to known 5' end',
       'RIKEN Yokohama ethics application', 'marker check',
       'used for peak calling', 'used for expression analysis',
       'top 3 most correlated samples'],
      dtype='object')

In [20]:
def get_sample_si(sample_info):
    """Collect what the supplementary table knows about one sample.

    Returns a dict with the keys ``lib_id`` and ``obo_id`` (both derived from
    `sample_info` via `get_lib_id` / `get_obo_id`) and ``sample_type``, the
    "Sample type" entry of `si_table` for that library id, or ``None`` when
    the lookup raises a ``KeyError``.
    """
    lib_id = get_lib_id(sample_info)
    obo_id = get_obo_id(sample_info)
    try:
        sample_type = si_table.loc[lib_id]["Sample type"]
    except KeyError:
        # Library id not present in the supplementary table.
        sample_type = None
    return {"lib_id": lib_id, "obo_id": obo_id, "sample_type": sample_type}

#### Run the python module
`annot_notes` will contain an entry for every piece of information that is missing in the ontology. 
`annotations` contains the collated information from the three data sources
* ontology
* column names in the data table
* supplementary information of the FANTOM5 Paper

In [21]:
# annot_notes is filled by merge_sample_info as a side effect;
# annotations receives one merged record per sample.
annot_notes = []
annotations = []
for sample_info in sample_infos:
    logging.info("Processing Sample: '{}'".format(sample_info))
    merged = merge_sample_info(
        process_sample_name(sample_info),           # from the column header
        process_sample_ontology(obo, sample_info),  # from the ontology
        get_sample_si(sample_info),                 # from the supplementary table
        annot_notes,
    )
    annotations.append(merged)



The log output indicates that there are inconsistencies between the supplementary information and the ontology with regard to the sample type. Moreover, the ontology specifies more than one sample type for some samples. 

In [22]:
annotations_df = pd.DataFrame(annotations)
annot_notes_df = pd.DataFrame(annot_notes)

In [23]:
annotations_df.tail()

Unnamed: 0,biol_rep,donor,lib_id,name,name_orig,obo_id,sample_type,tech_rep,time
1824,True,,CNhs11676,"uterus, adult, pool1","tpm of uterus, adult, pool1.CNhs1167...",FF:10100-102D1,tissue,False,
1825,True,donor1,CNhs11763,"uterus, fetal, donor1","tpm of uterus, fetal, donor1.CNhs117...",FF:10055-101H1,tissue,False,
1826,True,,CNhs12854,"vagina, adult, rep1","tpm of vagina, adult.CNhs12854.10204...",FF:10204-103F6,tissue,False,
1827,True,,CNhs12844,"vein, adult, rep1","tpm of vein, adult.CNhs12844.10191-1...",FF:10191-103E2,tissue,False,
1828,True,,CNhs11813,xeroderma pigentosum b cell line:XPL...,tpm of xeroderma pigentosum b cell l...,FF:10563-108A5,cell line,False,


In [24]:
annot_notes_df.tail()

Unnamed: 0,field_name,lib_id,new_value,obo_id
542,biol_rep,CNhs14223,donor10258,FF:10370-105G1
543,tech_rep,CNhs14223,tech_rep1,FF:10370-105G1
544,biol_rep,CNhs14551,donor10258,FF:10370-105G1
545,tech_rep,CNhs14551,tech_rep2,FF:10370-105G1
546,biol_rep,CNhs14084,donor10223,FF:10366-105F6


The number of `biol_rep` entries that were missing from the ontology and that we added from the sample names: 

In [25]:
len(annot_notes_df[annot_notes_df.field_name == "biol_rep"])

278

### Write everything to a csv file

In [15]:
annotations_df.to_csv("../data/column_vars.processed.csv")
annot_notes_df.to_csv("../data/annotation_notes.csv")