In [1]:
import pandas as pd
from itertools import repeat
import re
from orangecontrib.bio.ontology import OBOOntology
import logging

### Read column headers from fantom5 data. 
Read the column headers and extract the sample information from them. 

In [2]:
# List the files available in the data directory.
!ls data

column_vars.processed.csv		   human_samples_nature13182-s2
column_vars.txt				   process_sample_descriptions.log
fantom5_ds.txt				   samples1829
fantom5_head2000.txt			   samples1829_LIBRARY_IDs
ff-phase2-140729.obo			   samples1829_simplified
hg19.cage_peak_phase1and2combined_ann.txt


In [3]:
# Preview the column descriptions: keep the header lines starting with "##ColumnVariables",
# take the second "="-separated field of each, and show the first ten.
!grep "^##ColumnVariables" data/fantom5_ds.txt | cut -d"=" -f2 | head

CAGE peak id
short form of the description below. Common descriptions in the long descriptions has been omited
description of the CAGE peak
transcript which 5end is the nearest to the the CAGE peak
entrezgene (genes) id associated with the transcript
hgnc (gene symbol) id associated with the transcript
uniprot (protein) id associated with the transcript
tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4
tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5
tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6
cut: write error: Broken pipe


In [4]:
# `tail -n+8` starts at the 8th description, i.e. it skips the seven annotation columns
# (peak id, descriptions, transcript and gene ids) so that only the "tpm of ..." sample
# columns remain; show the first ten of those.
!grep "^##ColumnVariables" data/fantom5_ds.txt | cut -d"=" -f2 | tail -n+8 | head

tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4
tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5
tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6
tpm of 293SLAM rinderpest infection, 06hr, biol_rep1.CNhs14410.13544-145H7
tpm of 293SLAM rinderpest infection, 06hr, biol_rep2.CNhs14411.13545-145H8
tpm of 293SLAM rinderpest infection, 06hr, biol_rep3.CNhs14412.13546-145H9
tpm of 293SLAM rinderpest infection, 12hr, biol_rep1.CNhs14413.13547-145I1
tpm of 293SLAM rinderpest infection, 12hr, biol_rep2.CNhs14414.13548-145I2
tpm of 293SLAM rinderpest infection, 12hr, biol_rep3.CNhs14415.13549-145I3
tpm of 293SLAM rinderpest infection, 24hr, biol_rep1.CNhs14416.13550-145I4
tail: write error: Broken pipe
tail: write error
cut: write error: Broken pipe


In [5]:
# Write the sample column descriptions (everything from the 8th "##ColumnVariables"
# entry onwards) to data/column_vars.txt, one description per line.
!grep "^##ColumnVariables" data/fantom5_ds.txt | cut -d"=" -f2 | tail -n+8 > data/column_vars.txt

In [6]:
# Load the extracted descriptions; IPython captures the command output as a list of
# lines, so `sample_infos` holds one string per sample column.
sample_infos = !cat data/column_vars.txt

In [7]:
# Peek at the first five sample descriptions.
sample_infos[:5]

['tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4',
 'tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5',
 'tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6',
 'tpm of 293SLAM rinderpest infection, 06hr, biol_rep1.CNhs14410.13544-145H7',
 'tpm of 293SLAM rinderpest infection, 06hr, biol_rep2.CNhs14411.13545-145H8']

### Retrieving information from the ontology

The column headers are difficult to parse (inconsistent commas, etc.). 
We found an ontology on the fantom5 web page. [1]

First, we check whether all the ids from the column headers appear in the ontology. 

[1] http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Ontology/ff-phase2-140729.obo.txt

In [8]:
# Extracts the two halves of the sample id that follows the CAGE library id at the end of
# a column header, e.g. "...biol_rep1.CNhs14406.13541-145H4" -> ("13541", "145H4").
# The separator between the library id and the sample id is escaped so that it matches
# only a literal "." rather than any character.
OBO_ID_REGEX = re.compile(r'CNhs\d+\.(\w+)-(\w+)')

In [9]:
# Verify that every sample id found in the column headers occurs in the ontology file.
#
# The ontology is read once and searched in memory instead of spawning one `grep` per
# header. The path is the `.obo` file that is listed in `data/` and that the ontology
# parser loads; the former `.obo.txt` path is not in that listing, and a `grep` on a
# missing file most likely only yields an error message in the captured output, which
# would make a "non-empty output" check pass for every id.
with open("data/ff-phase2-140729.obo") as obo_file:
    obo_text = obo_file.read()

missing_ids = []
for info_line in sample_infos:
    id_match = OBO_ID_REGEX.search(info_line)
    # A header without a recognisable id is a parsing problem, not a missing term.
    assert id_match is not None, "no sample id found in: {}".format(info_line)
    ff_id = "-".join(id_match.groups())
    # The ids consist of word characters and "-", so a plain substring test is
    # equivalent to the regular-expression search `grep` performed.
    if ff_id not in obo_text:
        missing_ids.append(ff_id)
assert not missing_ids, "ids missing from the ontology: {}".format(missing_ids)

that seems to be the case...

#### Try out the Ontology Parser

In [10]:
# Parse the FANTOM5 sample ontology into an in-memory OBOOntology object.
obo = OBOOntology()
# Open the file in a `with` block so the handle is closed as soon as loading returns
# instead of staying open for the lifetime of the kernel.
# NOTE(review): assumes `load` reads the whole file before returning — confirm.
with open("data/ff-phase2-140729.obo") as obo_file:
    obo.load(obo_file)

In [11]:
# Inspect the raw tags (id, name, namespace, subset, is_a, ...) of one example term.
print(obo.term("FF:1394-42H2").tags())

[('id', 'FF:1394-42H2', None, None), ('name', 'lung, neonate N30, rep1', None, None), ('namespace', 'FANTOM5', None, None), ('subset', 'phase1', None, None), ('subset', 'phase2', None, None), ('subset', 'update022', None, None), ('is_a', 'EFO:0002091', None, 'biological replicate'), ('is_a', 'FF:0011489', None, 'mouse lung- neonate N30 sample')]


In [12]:
# The `name` attribute exposes the value of the term's 'name' tag.
obo.term("FF:1394-42H2").name

'lung, neonate N30, rep1'

#### Are all 'samples' annotated as some sort of sample? 

In [13]:
sample = "FF:0000001"  # most general sample id

# Check every column header: the question is whether *all* samples descend from the
# generic sample term, which a check of only the first two headers cannot answer.
# Offending ids are collected so that a failure reports all of them at once.
not_samples = []
for info_line in sample_infos:
    ff_id = "FF:" + "-".join(OBO_ID_REGEX.search(info_line).groups())
    # ids of all ancestors of this term, as returned by the ontology
    ids = [term.id for term in obo.super_terms(ff_id)]
    if sample not in ids:
        not_samples.append(ff_id)
assert not not_samples, "terms that do not descend from {}: {}".format(sample, not_samples)

seems to be the case, too

#### Parse the ontology. 

At least we don't run into massive comma-parsing trouble again. Some issues remain, though:
* Sometimes the time/donor/replicate is not annotated using the ontology but only appears in the 'name'. In that case, parse it using a regex and check that it is consistent with the ontology annotation, if available. 
* I need to figure out how to determine the cell type from the name. Is it enough to rely on the 'derives_from' annotation? 

The logic for parsing the ontology file now sits in the Python module `parse_ontology.py`. 

In [14]:
# Enable autoreload (mode 2) so that edits to parse_ontology.py are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project-local helper that turns one column description into an annotation record.
from parse_ontology import process_sample_description

Log detailed information to file: 

In [15]:
# Send all records (DEBUG and above) that reach the root logger to a log file.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('data/process_sample_descriptions.log')
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
# Replace the first handler already installed on the root logger (the stream handler on
# the first run, the previous file handler when this cell is re-run). The guard avoids
# an IndexError when no handler is installed, and closing the detached handler releases
# the log file held by an earlier run of this cell.
if logger.handlers:
    replaced_handler = logger.handlers[0]
    logger.removeHandler(replaced_handler)
    replaced_handler.close()
logger.addHandler(fh)

In [16]:
# Build one annotation record per sample column from the ontology and the description.
# Details of the processing are written to the log file by the logging configuration.
annotations = [process_sample_description(obo, sample_info) for sample_info in sample_infos]

In [17]:
# Collect the per-sample records in a table with one row per sample column.
# NOTE(review): presumably each record is a dict keyed by field name — confirm in parse_ontology.py.
annotations_df = pd.DataFrame(annotations)

In [18]:
# Preview only the first rows; the table has one row per sample column, so displaying
# it whole would flood the notebook output.
annotations_df.head(10)

Unnamed: 0,biol_rep,donor,name,obo_id,tech_rep,time
0,True,,"293SLAM rinderpest infection, 00hr, biol_rep1",FF:13541-145H4,False,00hr
1,True,,"293SLAM rinderpest infection, 00hr, biol_rep2",FF:13542-145H5,False,00hr
2,True,,"293SLAM rinderpest infection, 00hr, biol_rep3",FF:13543-145H6,False,00hr
3,True,,"293SLAM rinderpest infection, 06hr, biol_rep1",FF:13544-145H7,False,06hr
4,True,,"293SLAM rinderpest infection, 06hr, biol_rep2",FF:13545-145H8,False,06hr
5,True,,"293SLAM rinderpest infection, 06hr, biol_rep3",FF:13546-145H9,False,06hr
6,True,,"293SLAM rinderpest infection, 12hr, biol_rep1",FF:13547-145I1,False,12hr
7,True,,"293SLAM rinderpest infection, 12hr, biol_rep2",FF:13548-145I2,False,12hr
8,True,,"293SLAM rinderpest infection, 12hr, biol_rep3",FF:13549-145I3,False,12hr
9,True,,"293SLAM rinderpest infection, 24hr, biol_rep1",FF:13550-145I4,False,24hr


In [19]:
# Persist the annotation table; with the default settings the row index is written as
# the first (unnamed) column of the CSV file.
annotations_df.to_csv("data/column_vars.processed.csv")