In [9]:
import pandas as pd
from itertools import repeat
import re
from orangecontrib.bio.ontology import OBOOntology

### Read column headers from fantom5 data. 
Read the column headers and extract sample information from it. 

In [47]:
!ls data

column_vars.txt				   human_samples_nature13182-s2
fantom5_ds.txt				   samples1829
fantom5_head2000.txt			   samples1829_LIBRARY_IDs
ff-phase2-140729.obo.txt		   samples1829_simplified
hg19.cage_peak_phase1and2combined_ann.txt


In [62]:
!grep "^##ColumnVariables" data/fantom5_ds.txt | cut -d"=" -f2 | head

CAGE peak id
short form of the description below. Common descriptions in the long descriptions has been omited
description of the CAGE peak
transcript which 5end is the nearest to the the CAGE peak
entrezgene (genes) id associated with the transcript
hgnc (gene symbol) id associated with the transcript
uniprot (protein) id associated with the transcript
tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4
tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5
tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6
cut: write error: Broken pipe


In [64]:
!grep "^##ColumnVariables" data/fantom5_ds.txt | cut -d"=" -f2 | tail -n+8 | head

tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4
tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5
tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6
tpm of 293SLAM rinderpest infection, 06hr, biol_rep1.CNhs14410.13544-145H7
tpm of 293SLAM rinderpest infection, 06hr, biol_rep2.CNhs14411.13545-145H8
tpm of 293SLAM rinderpest infection, 06hr, biol_rep3.CNhs14412.13546-145H9
tpm of 293SLAM rinderpest infection, 12hr, biol_rep1.CNhs14413.13547-145I1
tpm of 293SLAM rinderpest infection, 12hr, biol_rep2.CNhs14414.13548-145I2
tpm of 293SLAM rinderpest infection, 12hr, biol_rep3.CNhs14415.13549-145I3
tpm of 293SLAM rinderpest infection, 24hr, biol_rep1.CNhs14416.13550-145I4
tail: write error: Broken pipe
tail: write error
cut: write error: Broken pipe


In [66]:
!grep "^##ColumnVariables" data/fantom5_ds.txt | cut -d"=" -f2 | tail -n+8 > data/column_vars.txt

In [67]:
sample_info = !cat data/column_vars.txt

In [68]:
sample_info[:5]

['tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4',
 'tpm of 293SLAM rinderpest infection, 00hr, biol_rep2.CNhs14407.13542-145H5',
 'tpm of 293SLAM rinderpest infection, 00hr, biol_rep3.CNhs14408.13543-145H6',
 'tpm of 293SLAM rinderpest infection, 06hr, biol_rep1.CNhs14410.13544-145H7',
 'tpm of 293SLAM rinderpest infection, 06hr, biol_rep2.CNhs14411.13545-145H8']

### Retreiving Information from the ontoloty. 

The column headers are difficult to parse (inconsistent commata, etc.). 
We found an ontology on the fantom5 web page. [1]

First, we check, if all the ids from the column headers appear in the ontology. 

[1] http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Ontology/ff-phase2-140729.obo.txt

In [10]:
OBO_ID_REGEX = re.compile(r'CNhs\d+.(\w+)-(\w+)')

In [11]:
for info_line in sample_info:
    ff_id = "-".join(OBO_ID_REGEX.search(info_line).groups())
    res = !grep {ff_id} data/ff-phase2-140729.obo.txt 
    assert len(res) > 0

that seems to be the case...

#### Try out the Ontology Parser

In [12]:
obo = OBOOntology()
obo.load(open("data/ff-phase2-140729.obo.txt"))

In [13]:
print(obo.term("FF:1394-42H2").tags())

[('id', 'FF:1394-42H2', None, None), ('name', 'lung, neonate N30, rep1', None, None), ('namespace', 'FANTOM5', None, None), ('subset', 'phase1', None, None), ('subset', 'phase2', None, None), ('subset', 'update022', None, None), ('is_a', 'EFO:0002091', None, 'biological replicate'), ('is_a', 'FF:0011489', None, 'mouse lung- neonate N30 sample')]


In [15]:
obo.term("FF:1394-42H2").name

'lung, neonate N30, rep1'

#### Parse the ontology. 

at least, we don't run into massive comma-parsing trouble again. There remain issues, though:
* sometimes, the time/donor/replicate is not annotated using the ontology but only appears in the 'name'. In that case parse using regex and check if it is consistent with the ongology annotation, if available. 
* I need to figure out a way how to determine the cell type from the name. Is it enough to rely on the 'derives_from' annotation? 

In [33]:
OBO_ID_REGEX = re.compile(r'CNhs\d+.(\w+)-(\w+)')
LIB_ID_REGEX = re.compile(r'CNhs(\d+)')
TIME_REGEX = re.compile(r'(\d+)(hr|min)|day(\d+)')
DONOR_REGEX = re.compile(r'donor(\d+)')
REPLICATE_REGEX = re.compile(r'((biol|tech)_)?rep(\d+)')
def get_rex_value(regex, str): 
    try: 
        return regex.search(str).group()
    except AttributeError:
        return None

# Here we want the exception to throw 
def get_obo_id(str): 
    return "FF:" + "-".join(OBO_ID_REGEX.search(str).groups())
def get_lib_id(str):
    return LIB_ID_REGEX.search(str).group()

# Here, we don't
def get_time(str):
    return get_rex_value(TIME_REGEX, str)
def get_donor(str):
    return get_rex_value(DONOR_REGEX, str)
def get_replicate(str):
    return get_rex_value(REPLICATE_REGEX, str)

In [26]:
assert get_obo_id("tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4") == "13541-145H4"
assert get_lib_id("tpm of 293SLAM rinderpest infection, 00hr, biol_rep1.CNhs14406.13541-145H4") == "CNhs14406"
assert get_time("tpm of 293SLAM rinderpestday01 infectionday01, 00hr, biol_rep1.CNhs14406.13541-145H4") == "day01"
assert get_time("tpm of 293SLAM rinderpestdayd01 infectionddady01, 00hr, biol_rep1.CNhs14406.13541-145H4") == "00hr"
assert get_time("tpm of 293SLAM rinderpestdayd01 infectionddady01, 15min biol_rep1.CNhs14406.13541-145H4") == "15min"
assert get_donor("tpm of 293SLAM rinderpestdayd01 infection, 00hr, donor1, biol_rep1.CNhs14406.13541-145H4") == "donor1"
assert get_replicate("tpm of 293SLAM rinderpestdayd01 infection, 00hr, donor1, biol_rep1.CNhs14406.13541-145H4") == "biol_rep1"
assert get_replicate("tpm of 293SLAM rinderpestdayd01 infection, 00hr, donor1, rep1.CNhs14406.13541-145H4") == "rep1"
assert get_donor("tpm of 293SLAM rinderpestdayd01 infection, 00hr, dondor1, biol_rep1.CNhs14406.13541-145H4") is None

Get ontology structure for time/donor/replicate

In [27]:
def contains_term(term, tag_name, tag_value):
    """Check if a set of tags (from a certain term) contains a certain annotation (e.g. is_a: 24-hour-sample)"""
    return any(a == tag_name and b == tag_value for a, b in term.tags())

In [57]:
hour_terms = [term.id for term in obo.child_terms("UO:0000032")]
minute_terms = [term.id for term in obo.child_terms("UO:0000031")]
day_terms = [term.id for term in obo.child_terms("UO:0000033")]
tech_rep = "EFO:0002090"
biol_rep = "EFO:0002091"

In [56]:
NO_REGEX = re.compile(r'(\d+)')
def compare_time(string, obo_id): 
    """ return true if the two times are considered to be identical. """
    if 'day' in string: 
        if obo_term not in day_terms:
            return False
    if 'hr' in string:
        if obo_term not in hour_terms:
            return False
    if 'min' in string:
        if obo_term not in minute_terms:
            return False
    term_name = obo.term(obo_id).name
    if NO_REGEX.search(string).group() == NO_REGEX.search(term_name).group(): 
        return True
    else: 
        return False

In [58]:
def compare_replicates(string, obo_id): 
    """return true, if the replicate type is considered to be identical"""
    if "tech_rep" in string and obo_id == tech_rep:
        return True
    elif "biol_rep" in string and obo_id == biol_rep:
        return True
    else:
        return False

In [60]:
for info_line in sample_info[:1]: 
    obo_id = get_obo_id(info_line)
    obo_term = obo.term(obo_id)
    tags = obo_term.tags()
    name = obo_term.name
    annot = {}
    annot["name"] = name
    # values parsed from name
    donor_n = get_donor(name)
    time_n = get_time(name)
    replicate_n = get_replicate(name)
    # values parsed from ontology 
    replicate_o = None
    time_o = None
    for tag, tag_value, _, _ in tags: 
        if tag == "is_a": 
            if tag_value == tech_rep: 
                assert replicate_o is None, 'multiple matches'
                replicate_o = tech_rep
            if tag_value == biol_rep:
                assert replicate_o is None, 'multiple matches'
                replicate_o = biol_rep
            if tag_value in hour_terms:
                assert time_o is None, 'multiple matches'
                time_o = tag_value
            if tag_value in minute_terms:
                assert time_o is None, 'multiple matches'
                time_o = tag_value
            if tag_value in day_terms:
                assert time_o is None, 'multiple matches'
                time_o = tag_value
    if donor_n: 
        annot["donor"] = donor_n
    if time_n and time_o: 
        assert compare_time(time_n, time_o), "mismatching times: {0}, {1}".format(time_n, time_o)
    if replicate_n and replicate_o: 
        assert compare_replicates(replicate_n, replicate_o), "mismatching replicates: {0}, {1}".format(replicate_n, replicate_o)
    

AssertionError: mismatching times: 00hr, FF:0000357

In [61]:
compare_time("00hr", "FF:0000357")

False

In [55]:
time_o

'FF:0000357'

In [47]:
hour_terms.values()

dict_values([OBOObject(id='FF:0000357', name=0 hr sample, ...), OBOObject(id='FF:0000358', name=1 hr sample, ...), OBOObject(id='FF:0000657', name=66 hr sample, ...), OBOObject(id='FF:0000360', name=3 hr sample, ...), OBOObject(id='FF:0000361', name=4 hr sample, ...), OBOObject(id='FF:0000362', name=5 hr sample, ...), OBOObject(id='FF:0000363', name=6 hr sample, ...), OBOObject(id='FF:0000364', name=7 hr sample, ...), OBOObject(id='FF:0000376', name=72 hr sample, ...), OBOObject(id='FF:0000366', name=10 hr sample, ...), OBOObject(id='FF:0000367', name=12 hr sample, ...), OBOObject(id='FF:0000359', name=2 hr sample, ...), OBOObject(id='FF:0000368', name=14 hr sample, ...), OBOObject(id='FF:0000379', name=144 hr sample, ...), OBOObject(id='FF:0000370', name=18 hr sample, ...), OBOObject(id='FF:0000371', name=20 hr sample, ...), OBOObject(id='FF:0000372', name=22 hr sample, ...), OBOObject(id='FF:0000373', name=24 hr sample, ...), OBOObject(id='FF:0300028', name=28 hr sample, ...), OBOObj