In [70]:
import pycurl
import json
from subprocess import check_output
from io import BytesIO


In [71]:
gcp_token = check_output(["gcloud", "auth", "print-access-token"], encoding='UTF-8').strip()

curl -X POST     -H "Authorization: Bearer $(gcloud auth print-access-token)"     -H "Content-Type: application/json"     -d @request.json  "https://healthcare.googleapis.com/v1/projects/neo4j-dashboard/locations/us-central1/services/nlp:analyzeEntities"

In [72]:
offset_text = []


begin = 0
with open('conditions.txt') as f:
    for line in f.readlines():
        end = begin + len(line)
        offset_text.append((begin, end, line.strip()))

        begin += len(line)

In [73]:
offset_text

[(0, 5, 'AIDS'),
 (5, 25, 'acellular pertussis'),
 (25, 39, 'acne vulgaris'),
 (39, 63, 'acute coronary syndrome'),
 (63, 86, 'acute myeloid leukemia'),
 (86, 111, 'adrenocortical carcinoma'),
 (111, 134, 'advanced breast cancer'),
 (134, 145, 'alcoholism'),
 (145, 169, 'allergic conjunctivitis'),
 (169, 178, 'alopecia'),
 (178, 198, "alzheimer's disease"),
 (198, 210, 'amyloidosis'),
 (210, 218, 'anaemia'),
 (218, 228, 'analgesia'),
 (228, 251, 'ankylosing spondylitis'),
 (251, 269, 'anxiety disorders'),
 (269, 289, 'arrhythmia, cardiac'),
 (289, 311, 'arthritis, rheumatoid'),
 (311, 318, 'asthma'),
 (318, 338, 'asthma and rhinitis'),
 (338, 354, 'atherosclerosis'),
 (354, 374, 'atrophy, geographic'),
 (374, 392, 'atrophy, muscular'),
 (392, 440, 'attention deficit hyperactivity disorder (adhd)'),
 (440, 460, 'autoimmune diseases'),
 (460,
  590,
  'b-cell acute lymphoblastic leukemia, relapsed b-cell acute lymphoblastic leukemia, refractory b-cell acute lymphoblastic leukemia'),
 (59

In [74]:
def get_healthcare_json(text: str, gcp_token: str) -> str:
    data = {'documentContent': f'{text}'}
    post_data = json.dumps(data)
    headers = ['Content-Type: application/json', f'Authorization: Bearer {gcp_token}']
    buffer = BytesIO()

    c = pycurl.Curl()
    c.setopt(c.URL, 'https://healthcare.googleapis.com/v1/projects/neo4j-dashboard/locations/us-central1/services/nlp:analyzeEntities')
    c.setopt(c.POSTFIELDS, post_data)
    c.setopt(c.HTTPHEADER, headers)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()

    response = buffer.getvalue().decode('utf-8')

    return response

In [75]:
def get_raw_problem_tsv(json_form: str, offset_text) -> str:
    content = ""

    linkedEntities = {entity["entityId"]: {"preferredTerm": entity["preferredTerm"], "vocabularyCodes": entity["vocabularyCodes"]} 
                  for entity in json_form["entities"]}
    
    print (linkedEntities)
    temp_raw_ner = {}
    for entity in json_form["entityMentions"]:
        
        if entity["type"] == "PROBLEM" and entity["confidence"] > 0.4 and "linkedEntities" in entity:
            #print (entity["text"], entity["linkedEntities"], entity["confidence"])
            begin_offset = entity["text"]["beginOffset"]


            for i, iterator in enumerate(offset_text):
                if iterator[0] <= begin_offset and begin_offset < iterator[1]:
                    if i not in temp_raw_ner:
                        temp_raw_ner[i] = []
                    temp_raw_ner[i].append((entity["text"], entity["linkedEntities"], entity["confidence"]))
                    break

    print (temp_raw_ner)
    #dereplacate
    raw_ner = {}
    for i in temp_raw_ner:
        current_confidence = 0
        current_linkedEntities = None
        
        olinkedEntities = temp_raw_ner[i][0][1]
        confidence = temp_raw_ner[i][0][2]

        if confidence > current_confidence:
            current_confidence = confidence
            current_linkedEntities = olinkedEntities

        raw_ner[i] = (temp_raw_ner[i][0][0]["content"], current_linkedEntities, current_confidence)

    print (raw_ner)

    for i, entity in enumerate(offset_text):
        if i in raw_ner:
            entityids = [j['entityId'] for j in raw_ner[i][1]]

            preferredTerms = [linkedEntities[e]["preferredTerm"] for e in entityids]

            HPO = []
            MSH = []

            for e in entityids:
                for id in linkedEntities[e]["vocabularyCodes"]:
                    if id.startswith("HPO"):
                        HPO.append(id)
                    elif id.startswith("MSH"):
                        MSH.append(id)

            content += f"{entity[2]}\t{';'.join(preferredTerms)}\t{';'.join(entityids)}\t{raw_ner[i][2]}\t{';'.join(HPO)}\t{';'.join(MSH)}\n"
        else:
            content += f"{entity[2]}\t\t\t\t\t\n"

    return content

#content = get_raw_problem_tsv(json_form, offset_text)

In [76]:
gcp_token = check_output(["gcloud", "auth", "print-access-token"], encoding='UTF-8').strip()
#test_text = ";".join([x[2] for x in offset_text])

test_text = ""
content = ""

for x in offset_text:

    if len(test_text) + len(x[2]) < 10000:

        test_text += x[2] + ";"

    else:
        test_text = test_text[:-1]

        result = get_healthcare_json(test_text, gcp_token)

        json_form = json.loads(result)
        
        content += get_raw_problem_tsv(json_form, offset_text)

        test_text = x[2] + ";"


if test_text != "":
    test_text = test_text[:-1]

    result = get_healthcare_json(test_text, gcp_token)

    json_form = json.loads(result)

    content += get_raw_problem_tsv(json_form, offset_text)

with open('condition_gcp_ner.tsv', 'w') as f:
    f.write(content)


{'UMLS/C0001144': {'preferredTerm': 'Acne Vulgaris', 'vocabularyCodes': ['MSH/D000152', 'MTH/NOCODE', 'NCI/C27195']}, 'UMLS/C0001175': {'preferredTerm': 'Acquired Immunodeficiency Syndrome', 'vocabularyCodes': ['LNC/LA10430-9', 'MEDLINEPLUS/1', 'MSH/D000163', 'MTH/NOCODE', 'NCI/C2851']}, 'UMLS/C0001519': {'preferredTerm': 'Adie Syndrome', 'vocabularyCodes': ['MSH/D000270', 'MTH/NOCODE', 'NCI/C34357', 'OMIM/103100']}, 'UMLS/C0001529': {'preferredTerm': 'Adiposis Dolorosa', 'vocabularyCodes': ['MSH/D000274', 'NCI/C84540', 'OMIM/103200']}, 'UMLS/C0001617': {'preferredTerm': 'Adrenal Cortex Hormones', 'vocabularyCodes': ['LNC/LP31653-6', 'MEDLINEPLUS/4557', 'MSH/D000305', 'MTH/NOCODE', 'NCI/C211', 'NCI/C2322', 'VANDF/4021625']}, 'UMLS/C0001683': {'preferredTerm': 'Advance Directives', 'vocabularyCodes': ['LNC/LP133261-0', 'LNC/LP74455-4', 'LNC/MTHU021127', 'LNC/MTHU047657', 'MEDLINEPLUS/4151', 'MSH/D016223', 'MTH/NOCODE', 'NCI/C93142']}, 'UMLS/C0001815': {'preferredTerm': 'Primary Myelofib