In [None]:
# Run once per environment: change into the project directory, install the
# packages listed in requirements.txt, then clear the interactive namespace.
# NOTE(review): the project path below is absolute — adjust it if the notebook
# lives somewhere else.
%cd /notebooks/cerulean-notebook
# %pip (rather than !pip) installs into the environment the kernel is running in
%pip install -r requirements.txt
# -f resets the namespace without asking for confirmation
%reset -f

In [None]:
from dotenv import load_dotenv
import pandas as pd
from azure.ai.textanalytics import TextAnalyticsClient, HealthcareEntityRelation
from azure.core.credentials import AzureKeyCredential

# Set up environment variables (load_dotenv comes from python-dotenv)
load_dotenv()

# Set flags
# When True, later cells swap the sampled notes for a single synthetic note
# and write the first result to debug/result_0.json.
DEBUG = False

# Read the data location and the Azure credentials from the environment
data_dir = %env DATA_DIR
azure_key = %env AZURE_KEY
azure_endpoint = %env AZURE_ENDPOINT

# Load SQL extension
%load_ext sql
# Return query results as pandas DataFrames
%config SqlMagic.autopandas = True
# Do not print row-count feedback after statements
%config SqlMagic.feedback = False
# Do not echo the connection string after each statement
%config SqlMagic.displaycon = False


# Have DuckDB use in-memory storage (comment out to use the $DATABASE_URL file)
%sql duckdb:///:memory:

In [None]:
# Load data from parquet

# Load schema metadata from ${data_dir}physionet_schema.csv
tables = pd.read_csv(f'{data_dir}/physionet_schema.csv', delimiter='\t', usecols=['schema', 'table']).drop_duplicates().reset_index(drop=True)

# Filter to mimiciv_note schema for now
tables = tables[tables['schema'] == 'mimiciv_note']

# Load data from parquet
for (schema, table) in tables.values:
    print(f'Loading {schema}.{table}')
    %sql DROP TABLE IF EXISTS {{schema}}.{{table}}
    %sql CREATE SCHEMA IF NOT EXISTS {{schema}}
    %sql CREATE TABLE {{schema}}.{{table}} AS SELECT * FROM read_parquet('{{data_dir}}/parquet/{{schema}}/{{table}}.parquet')

In [None]:
# Convenience function for client auth using key and endpoint
def azure_auth_client():
    """Build a TextAnalyticsClient from the notebook's Azure settings.

    Reads the module-level ``azure_key`` and ``azure_endpoint`` values.

    Returns:
        TextAnalyticsClient: client created with an AzureKeyCredential
        wrapping ``azure_key`` and pointed at ``azure_endpoint``.
    """
    key_credential = AzureKeyCredential(azure_key)
    return TextAnalyticsClient(endpoint=azure_endpoint, credential=key_credential)

# Create the client that later cells pass to azure_health()
azure_client = azure_auth_client()

In [None]:
# Convenience function for running Azure NLP for healthcare
def azure_health(client, documents):
    """Submit ``documents`` for healthcare entity analysis and return the outcome.

    Args:
        client: object exposing ``begin_analyze_healthcare_entities``
            (the notebook passes a TextAnalyticsClient).
        documents: the documents to analyze; forwarded unchanged.

    Returns:
        Whatever the poller's ``result()`` returns. No conversion is applied.
    """
    # TODO: translate the result into a dataframe of entities (not implemented yet)
    return client.begin_analyze_healthcare_entities(documents).result()

In [None]:
# Select a random sample of 10 patient notes
# NOTE(review): no seed is given, so presumably each run draws a different
# sample — confirm whether a repeatable sample is wanted.
sample = %sql SELECT text, subject_id, charttime FROM mimiciv_note.discharge USING SAMPLE 10

In [None]:
# Run the sampled notes through Azure NLP for healthcare
documents = sample['text'].tolist()

if DEBUG:
    # Replace the real notes with a single synthetic one
    documents = [
        "Patient has a history of hypertension and type 2 diabetes. He is currently taking metformin and lisinopril."
    ]

# Materialize the results into a list so they can be inspected more than once;
# the value returned by the service is consumed as it is iterated.
results = list(azure_health(azure_client, documents))

# Store one result object per row, matched by position. In DEBUG mode there is
# only one result, so only the first row is filled and the rest stay missing.
sample['az_result'] = pd.Series(results, index=sample.index[:len(results)], dtype=object)

In [None]:
# Print the first result to a file for inspection
# NOTE(review): the file holds the object's printed form; confirm whether real
# JSON is expected given the .json extension.
if DEBUG:
    # Context manager so the file handle is flushed and closed
    with open("debug/result_0.json", "w") as debug_file:
        print(sample['az_result'][0], file=debug_file)

In [None]:
# Data sources attached to the first entity of the first result
ds = sample['az_result'][0].entities[0].data_sources

# Map each data source name to its entity id, then keep only the 'UMLS' entry
# (so ds_dict ends up holding that single entity id, not a dict)
ds_dict = dict((source.name, source.entity_id) for source in ds)['UMLS']
display(ds_dict)

In [None]:
# Create a new column sample['entity'] with the (normalized text, category) pair of each entity in sample['az_result'].
# Rows whose value has no usable `entities` (a per-document error returned by the
# service, or a row with no result) get an empty list instead of raising.
sample['entity'] = sample['az_result'].apply(
    lambda x: [(e.normalized_text, e.category) for e in (getattr(x, 'entities', None) or [])]
)

In [None]:
# Display the data sources held in `ds` (first entity of the first result)
ds