In [3]:
import requests
import os
import itertools
import multiprocessing
import pickle

In [None]:
def request_synonyms(oid, iri):
    """Fetch the synonyms recorded for one ontology term from the EBI OLS API.

    Args:
        oid: Ontology identifier placed in the URL path (the caller passes
            the ``ontology_name`` field of a search hit).
        iri: IRI of the term to look up.

    Returns:
        The ``synonyms`` list of the first term in the response, or an empty
        list when the response has no ``_embedded`` section, no terms, or no
        synonyms.
    """
    # Hand the IRI to requests via ``params`` so it gets percent-encoded.
    # Pasted verbatim into the URL, an IRI containing '#', '&' or '?' would
    # truncate or corrupt the query string.
    response = requests.get(
        'http://www.ebi.ac.uk/ols/api/ontologies/' + oid + '/terms',
        params={'iri': iri},
    )
    payload = response.json()

    terms = payload.get('_embedded', {}).get('terms') or []
    if not terms:
        return []
    # Normalise a missing or null "synonyms" field to an empty list so the
    # function always returns something iterable.
    return terms[0].get('synonyms') or []

def search_others(query, doids):
    """Find DOID terms indirectly, via synonyms of hits in other ontologies.

    The query is searched in the EFO, HP, OGMS and ORDO ontologies. For every
    hit that carries an IRI the term's synonyms are fetched, lower-cased and
    stripped, and each distinct synonym is then looked up as an exact match
    in DOID.

    Args:
        query: Free-text search string.
        doids: Set of DOID ids already reported; it is passed on to
            ``search_doid``, which skips those ids and adds new ones.

    Returns:
        A list of ``search_doid`` results (one per synonym) restricted to
        those that produced at least one match.
    """
    # Let requests build the query string so that characters such as '&',
    # '#' or '+' inside the user query are percent-encoded instead of
    # silently changing the request.
    # NOTE(review): the ontology list keeps the original ", " separators;
    # confirm that OLS tolerates the spaces after the commas.
    response = requests.get(
        'http://www.ebi.ac.uk/ols/api/search',
        params={'q': query, 'ontology': 'efo, hp, ogms, ordo'},
    )
    docs = response.json()['response']['docs']

    synonyms = set()
    for item in docs:
        if 'iri' not in item:
            continue
        found = request_synonyms(item['ontology_name'], item['iri'])
        if found:
            synonyms.update(s.lower().strip() for s in found)

    # Iterate in sorted order: ``doids`` de-duplicates across calls, so the
    # synonym that gets credited with a match depends on lookup order, and
    # raw set order is not reproducible between runs.
    results = []
    for synonym in sorted(synonyms):
        hit = search_doid(synonym, True, doids)
        if hit['matches']:
            results.append(hit)

    return results

def search_doid(query, exact, doids):
    """Search the DOID ontology through the EBI OLS search endpoint.

    Args:
        query: Free-text search string.
        exact: Truthy to request exact matches only; sent to the API as the
            lower-cased string form (``'true'`` / ``'false'``).
        doids: Set of ``obo_id`` values already reported. Hits whose id is in
            the set are skipped and every newly reported id is added to it,
            so one set can be threaded through several calls to avoid
            duplicates. The set is mutated in place.

    Returns:
        ``{'query': query, 'matches': [...]}`` where each match is a dict
        with the keys ``id``, ``desc`` and ``label``.
    """
    # Build the query string through ``params`` so the user-supplied query is
    # percent-encoded; concatenating it raw lets '&' or '#' in the text alter
    # or truncate the request.
    response = requests.get(
        'http://www.ebi.ac.uk/ols/api/search',
        params={'q': query, 'ontology': 'doid', 'exact': str(exact).lower()},
    )
    docs = response.json()['response']['docs']

    matches = []
    for item in docs:
        if 'obo_id' not in item or item['obo_id'] in doids:
            continue
        matches.append({
            'id': item['obo_id'],
            # First description entry if any; the default keeps an empty
            # description list from raising StopIteration.
            'desc': next(iter(item.get('description') or []), ''),
            'label': item['label'],
        })
        doids.add(item['obo_id'])

    return {'query': query, 'matches': matches}

def search(query):
    """Look up one term directly in DOID and indirectly through synonyms.

    Args:
        query: Free-text search string.

    Returns:
        A dict with two keys: ``top_matches`` holds the non-exact DOID search
        result for the query itself, ``synonym_matches`` holds the list of
        results found through synonyms from other ontologies.
    """
    seen_ids = set()
    # The direct lookup has to run first: it fills ``seen_ids``, which the
    # synonym pass then uses to skip ids that were already reported.
    direct = search_doid(query, False, seen_ids)
    via_synonyms = search_others(query, seen_ids)
    return dict(top_matches=direct, synonym_matches=via_synonyms)
                     
def parse(filename):
    """Extract the search terms from one record file.

    The text after the first ':' on the first line is taken as one term; the
    text after the first ':' on the fourth line is split on commas into
    further terms. Every term is stripped of surrounding whitespace and
    blank terms are dropped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(filename, terms)`` tuple, ``terms`` being a list of strings.

    Raises:
        IndexError: If the file has fewer than four lines, or the first or
            fourth line contains no ':'.
    """
    def value_of(line):
        # Split on the first ':' only, so a value that itself contains a
        # colon is kept whole instead of being cut at the second colon.
        return line.split(':', 1)[1]

    with open(filename, 'rt') as f:
        lines = f.readlines()

    candidates = [value_of(lines[0])] + value_of(lines[3]).split(',')
    stripped = [term.strip() for term in candidates]
    # An empty field or a trailing comma yields '' here; searching for an
    # empty string is meaningless, so such entries are filtered out.
    return filename, [term for term in stripped if term]

def mapper(file, data_dir="/Users/elvirakinzina/src/Medhack/data"):
    """Parse one record file and run the ontology search for each of its terms.

    Args:
        file: Bare file name (not a path) inside ``data_dir``.
        data_dir: Directory containing the record files. Defaults to the
            location this notebook was originally run against; pass another
            directory to run elsewhere.

    Returns:
        ``{'filename': <name>, 'terms': [<search result>, ...]}`` for a
        ``.txt`` file, or ``None`` for any other file name.
    """
    if not file.endswith(".txt"):
        # Non-record directory entries are mapped to None explicitly, so
        # consumers of the mapped list must be prepared to skip None.
        return None

    _, terms = parse(os.path.join(data_dir, file))
    return {'filename': str(file), 'terms': [search(term) for term in terms]}


def parseFiles():
    """Run ``mapper`` over the data directory using a pool of 20 processes.

    At most the first 10000 entries returned by ``os.listdir`` are processed.

    Returns:
        A list with one ``mapper`` result per directory entry, in listing
        order. Entries that are not ``.txt`` files appear as ``None``.
    """
    files = itertools.islice(os.listdir("/Users/elvirakinzina/src/Medhack/data"), 10000)

    pool = multiprocessing.Pool(20)
    try:
        # ``map`` blocks until every task has finished, so no work is still
        # pending when the pool is torn down below.
        return pool.map(mapper, files)
    finally:
        # Always release the worker processes, also when a worker raised;
        # without this every call leaves 20 idle processes behind.
        pool.terminate()
        pool.join()

# Run the lookup for every record file (slow: one or more HTTP requests per term).
parsed = parseFiles()

# Persist the results so later cells or sessions can reuse them without
# re-querying the API. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open("lookup.p", "wb") as out_file:
    pickle.dump(parsed, out_file)

In [93]:
# Display the complete list returned by parseFiles().
# NOTE(review): the saved output starts with a None entry (mapper returns
# None for names not ending in ".txt") and shows an 'exact_matches' key that
# the current search() does not produce -- the output looks stale; confirm by
# re-running on a fresh kernel.
parsed

[None,
 {'filename': 's0002_re.txt',
  'terms': [{'exact_matches': {'matches': [{'desc': '',
       'id': u'DOID:8805',
       'label': u'intermediate coronary syndrome'},
      {'desc': '', 'id': u'DOID:4558', 'label': u"Ludwig's angina"},
      {'desc': '', 'id': u'HP:0001681', 'label': u'Angina pectoris'},
      {'desc': '',
       'id': u'DOID:13924',
       'label': u'necrotizing ulcerative gingivitis'},
      {'desc': u'A viral infectious disease that results_in inflammation, located_in pharynx, has_material_basis_in Human herpesvirus 4 and has_symptom fever, has_symptom fatigue, has_symptom lymphadenopathy, and has_symptom splenomegaly.',
       'id': u'DOID:8568',
       'label': u'infectious mononucleosis'},
      {'desc': u'An eye disease that is characterized by an unstable or a sustained increase in the intraocular pressure which the eye cannot withstand without damage to its structure or impairment of its function.',
       'id': u'DOID:1686',
       'label': u'glaucoma'}]

In [52]:
# Show only the first element of the results.
# NOTE(review): the saved output has 'matches'/'query' directly under
# 'terms', whereas the current search() returns 'top_matches' and
# 'synonym_matches' -- presumably produced by an earlier version of the code;
# confirm by re-running.
next(iter(parsed))

{'filename': 's0002_re.txt',
 'terms': [{'matches': [{'desc': '',
     'id': u'DOID:8805',
     'label': u'intermediate coronary syndrome'},
    {'desc': '', 'id': u'DOID:4558', 'label': u"Ludwig's angina"},
    {'desc': '', 'id': u'HP:0001681', 'label': u'Angina pectoris'},
    {'desc': '',
     'id': u'DOID:13924',
     'label': u'necrotizing ulcerative gingivitis'},
    {'desc': u'A viral infectious disease that results_in inflammation, located_in pharynx, has_material_basis_in Human herpesvirus 4 and has_symptom fever, has_symptom fatigue, has_symptom lymphadenopathy, and has_symptom splenomegaly.',
     'id': u'DOID:8568',
     'label': u'infectious mononucleosis'},
    {'desc': u'An eye disease that is characterized by an unstable or a sustained increase in the intraocular pressure which the eye cannot withstand without damage to its structure or impairment of its function.',
     'id': u'DOID:1686',
     'label': u'glaucoma'}],
   'query': 'Unstable angina'},
  {'matches': [{'de