In [202]:
import pycurl
import json
from subprocess import check_output
from io import BytesIO


In [203]:
# Run `gcloud auth print-access-token` and keep its stripped stdout; the value
# is later sent as the Bearer token in the Authorization header.
gcp_token = check_output(["gcloud", "auth", "print-access-token"], encoding='UTF-8').strip()

curl -X POST     -H "Authorization: Bearer $(gcloud auth print-access-token)"     -H "Content-Type: application/json"     -d @request.json  "https://healthcare.googleapis.com/v1/projects/neo4j-dashboard/locations/us-central1/services/nlp:analyzeEntities"

In [205]:
# Build (begin, end, text) triples, one per line of the input file.
#
# The offsets must index into the document that is actually sent to the API:
# the *stripped* lines joined by a single-character ';' separator. Deriving
# them from the raw line length only works when every line carries exactly one
# stripped character (a lone '\n'); trailing spaces or '\r\n' endings would
# shift every later offset. They are therefore computed from the stripped text.
offset_text = []

with open('test_condition.txt') as f:
    stripped_lines = [line.strip() for line in f]

begin = 0
last_index = len(stripped_lines) - 1
for index, text in enumerate(stripped_lines):
    # Every entry except the last is followed by one separator character,
    # which is counted as part of that entry's half-open [begin, end) span.
    separator_width = 0 if index == last_index else 1
    end = begin + len(text) + separator_width
    offset_text.append((begin, end, text))

    begin = end

In [206]:
offset_text

[(0, 20, 'autoimmune diseases'),
 (20,
  150,
  'b-cell acute lymphoblastic leukemia, relapsed b-cell acute lymphoblastic leukemia, refractory b-cell acute lymphoblastic leukemia'),
 (150, 156, 'hello'),
 (156, 172, 'bipolar disorder')]

In [207]:
def get_healthcare_json(
    text: str,
    gcp_token: str,
    url: str = 'https://healthcare.googleapis.com/v1/projects/neo4j-dashboard/locations/us-central1/services/nlp:analyzeEntities',
) -> str:
    """POST `text` to the analyzeEntities endpoint and return the raw response body.

    Args:
        text: Document content to analyse; sent as the ``documentContent`` field.
        gcp_token: Access token placed in the ``Authorization: Bearer`` header.
        url: Endpoint to call. Defaults to the project/location originally
            hard-coded in this notebook.

    Returns:
        The response body decoded as UTF-8. The HTTP status code is not
        inspected, so an error payload from the service is returned as-is.

    Raises:
        pycurl.error: propagated from pycurl if the transfer itself fails.
    """
    post_data = json.dumps({'documentContent': str(text)})
    headers = ['Content-Type: application/json', f'Authorization: Bearer {gcp_token}']
    buffer = BytesIO()

    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        # Setting POSTFIELDS makes the request a POST with this body.
        c.setopt(c.POSTFIELDS, post_data)
        c.setopt(c.HTTPHEADER, headers)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # Release the curl handle even when setopt/perform raises.
        c.close()

    return buffer.getvalue().decode('utf-8')

In [208]:
# Obtain an access token from the gcloud CLI (stripped stdout of the command).
gcp_token = check_output(
    ["gcloud", "auth", "print-access-token"],
    encoding='UTF-8',
).strip()

# Concatenate the text component of every (begin, end, text) triple into one
# ';'-separated document.
test_text = ";".join(entry[2] for entry in offset_text)

In [209]:
test_text


'autoimmune diseases;b-cell acute lymphoblastic leukemia, relapsed b-cell acute lymphoblastic leukemia, refractory b-cell acute lymphoblastic leukemia;hello;bipolar disorder'

In [210]:
# Send the joined document to the service; `result` holds the raw response text.
result = get_healthcare_json(test_text, gcp_token)

In [211]:
# Parse the raw response text into Python objects for the extraction step below.
json_form = json.loads(result)

In [213]:
def get_raw_problem_tsv(json_form: dict, offset_text) -> str:
    """Render one TSV row per input line from an analyzeEntities response.

    For every line in `offset_text`, the PROBLEM mention with the highest
    confidence (strictly above 0.5) whose ``beginOffset`` falls inside that
    line's half-open ``[begin, end)`` span is selected. On equal confidence
    the mention that appears first in the response wins.

    Args:
        json_form: Parsed response (a dict). The keys read are ``entities``
            and ``entityMentions``; both are treated as optional.
        offset_text: Sequence of ``(begin, end, text)`` entries, one per line.

    Returns:
        A string with one newline-terminated row per entry of `offset_text`.
        Columns, tab-separated: line text, preferred terms, entity ids,
        confidence, codes starting with "HPO", codes starting with "MSH".
        Multi-valued columns are ';'-joined. Lines without a qualifying
        mention get the text followed by five empty columns.
    """
    # entityId -> the details needed for the output columns.
    linkedEntities = {
        entity["entityId"]: {
            "preferredTerm": entity.get("preferredTerm", ""),
            "vocabularyCodes": entity.get("vocabularyCodes", []),
        }
        for entity in json_form.get("entities", [])
    }

    # Line index -> (linked entities, confidence) of the best mention so far.
    best_mention = {}
    for mention in json_form.get("entityMentions", []):
        confidence = mention.get("confidence", 0)
        if mention.get("type") != "PROBLEM" or confidence <= 0.5:
            continue

        # NOTE(review): a missing beginOffset is treated as 0 on the assumption
        # that the service may omit zero-valued fields -- confirm against the API.
        begin_offset = mention["text"].get("beginOffset", 0)

        for i, span in enumerate(offset_text):
            if span[0] <= begin_offset < span[1]:
                # Strict '>' keeps the earliest mention when confidences tie.
                if i not in best_mention or confidence > best_mention[i][1]:
                    best_mention[i] = (mention.get("linkedEntities", []), confidence)
                break

    rows = []
    for i, entry in enumerate(offset_text):
        if i not in best_mention:
            rows.append(f"{entry[2]}\t\t\t\t\t\n")
            continue

        linked, confidence = best_mention[i]
        entityids = [link["entityId"] for link in linked]

        # An id absent from the `entities` table yields an empty term and no
        # codes, keeping the term column positionally aligned with the ids.
        details = [linkedEntities.get(entity_id, {}) for entity_id in entityids]
        preferredTerms = [detail.get("preferredTerm", "") for detail in details]

        HPO = []
        MSH = []
        for detail in details:
            for code in detail.get("vocabularyCodes", []):
                if code.startswith("HPO"):
                    HPO.append(code)
                elif code.startswith("MSH"):
                    MSH.append(code)

        rows.append(
            f"{entry[2]}\t{';'.join(preferredTerms)}\t{';'.join(entityids)}\t{confidence}\t{';'.join(HPO)}\t{';'.join(MSH)}\n"
        )

    return "".join(rows)

# Show the TSV for the parsed response; print() is used so the tab and newline
# characters are rendered rather than shown as escape sequences.
print (get_raw_problem_tsv(json_form, offset_text))

autoimmune diseases	Autoimmune Diseases	UMLS/C0004364	0.8982720375061035	HPO/HP:0002960	MSH/D001327
b-cell acute lymphoblastic leukemia, relapsed b-cell acute lymphoblastic leukemia, refractory b-cell acute lymphoblastic leukemia	Acute lymphocytic leukemia;Precursor Cell Lymphoblastic Leukemia Lymphoma	UMLS/C0023449;UMLS/C1961102	0.687578558921814	HPO/HP:0006721	MSH/D054198
hello					
bipolar disorder	Bipolar Disorder	UMLS/C0005586	0.9830496907234192	HPO/HP:0007302	MSH/D001714

