In [1]:
# This is a helper notebook to find and prepare the neighbours of each entry of the test set
# Ideally the neighbours should be resolved online, but having a precomputed set of neighbours
#   related to each entry of the test set speeds up experimentation

In [2]:
import json
import requests
import pandas as pd
import numpy as np
import re
from pathlib import Path

from gsicbr.mincbr import CBR

In [3]:
## Project root path
pjpath = ''

# Hacky way of finding the project's root path. Do not rely on this, set your own pjpath!
for p in Path.cwd().parents:
    if p.stem == 'medical-cbr':
        pjpath = p
        break

print(f'> Project path is {pjpath}')

> Project path is /home/daucco/ownCloud-UPM/CBR/medical-cbr


In [4]:
# Relevant paths
mimicpath = pjpath / 'datasets/mimiciv'

In [None]:
# Globals

SEED = 42

# Controls which data to load
samp_size = 20000
balanced_data = True

# Embeddings (precomputed best for k=1)
emb_modname, emb_mod_truncation = ('alldistilrobertav1', 'middle')

summarizer_model = 'iv_ll3_summarizer'  # Needs to be available

# Slice long input: Just keep up to max_words of each text
max_chars = 25000
subsamp_size = 1000  # 100 Number of entries to test model with

# LLM parameters
# NOTE TODO: Fixed if just one, if using multiple, deifne a list of dictionaries each one containing the specific info for each model, the run in batch to generate summaries
input_type = 'S'    # (R)eport, (R)eport and (C)hart data as json

# This is the collection of columns that contains the relevant patient info that will be provided to the LLM with the text report
# Remapping some column names to make them more significant in the prompt
pdc_remap = {
    'age': 'AGE',
    'gender': 'GENDER',
    'marital_status': 'MARITAL STATUS',
    'race': 'RACE',
    'diagnose_group_description': 'BROAD DIAGNOSIS',
    'diagnose_group_mortality': 'MORTALITY RISK',
    'insurance': 'INSURANCE',
    #'text': 'REPORT'
}

n_ctx = 32   # Context length (x 1024)
m_span = 'amonth' # How to ask the LLM for mortality (count days vs month)
temp = 0.1 # Temperature option for the LLM. The greater, the more creative the answer (def 0.1)
top_k = 20
top_p = 0.5

In [6]:
# Load precomputed dataframe. Keeps only hadm_id and delta_days_dod (to find patients that died after n days discharge)
# Transform to boolean (patient died within 30 days after discharge)
#df = pd.read_csv(mimicpath / f'mimiciv_4_mortality_S{samp_size}.csv.gz')[['hadm_id', 'delta_days_dod']]
df = pd.read_csv(mimicpath / f'mimiciv_4_mortality_S{samp_size}{'_balanced' if balanced_data else ''}.csv.gz')
df['DIES'] = df['delta_days_dod'].apply(lambda x: x > 0 and x <= 30)

# Load precomputed splits
with open(mimicpath / f'hadmid_splits_S{samp_size}{'_balanced' if balanced_data else ''}.json', 'r') as ifile:
    splits_hadmids = json.load(ifile)

# Load sorted hadm_ids from disk. This is required to be able to locate the appropriate embeddings for each entry
with open(mimicpath / f'hadmid_sorted_S{samp_size}{'_balanced' if balanced_data else ''}.json', 'r') as ifile:
    emb_hadmids = json.load(ifile)['HADM_ID']

In [7]:
# Embedding load
emb_modname = re.sub('[^a-zA-Z0-9]+', '', emb_modname)
emb_mod_fname = f'embeddings_{emb_modname}_S{samp_size}_T{emb_mod_truncation}{'_balanced' if balanced_data else ''}.npy'
print(f'> Loading embeddings from {emb_mod_fname}...')
embeddings = np.load(mimicpath / emb_mod_fname)

# Locates data (case definitions (X) + solutions (y))
hadm2idx = {hadm: i for i, hadm in enumerate(emb_hadmids)}

# Entries to be used as CB
X_cb = embeddings[[hadm2idx[hadm] for hadm in splits_hadmids['cb']]]
df_cb = df.set_index('hadm_id').loc[splits_hadmids['cb']]
y_cb = df_cb['DIES'].values[:, np.newaxis]

# Train data for CBR model
X_train = embeddings[[hadm2idx[hadm] for hadm in splits_hadmids['train']]]
df_train = df.set_index('hadm_id').loc[splits_hadmids['train']]
y_train = df_train['DIES'].values[:, np.newaxis]

# Entries to be used as test
df_test = df.set_index('hadm_id').loc[splits_hadmids['test']]
# Gets test subsample
if subsamp_size:
    print(f'> Subsampling test data to {subsamp_size}...')
    df_test = df_test.sample(subsamp_size, random_state=SEED)

y_test = df_test['DIES'].values[:, np.newaxis]
X_test = embeddings[[hadm2idx[hadm] for hadm in df_test.index]]

> Loading embeddings from embeddings_alldistilrobertav1_S20000_Tmiddle_balanced.npy...
> Subsampling test data to 100...


In [8]:
# Initialize CBR module
# Fixed params
agg_strat = 'vot'
# Recovery params
rec_kwargs = {
    'retrieve_algo': 'brute',
    'retrieve_metric': 'cosine'
}
cbr = CBR(X_cb, y_cb, agg_strat=agg_strat, rec_kwargs=rec_kwargs, seed=SEED)

# Fits CBR components
cbr.fit(X_train, y_train, k=1, weighted=True)

In [9]:
instance = 'http://localhost:11434/api/generate'
auth_cookie = ''

model = summarizer_model

responses = {}
ii=1

# Prepends additional patient data to text report before feeding summarizer
df_test['text'] = df_test.apply(lambda x: ''.join([f'{p_cremap}: {str(x[p_cname]).replace('_', ' ')}\n' for p_cname, p_cremap in pdc_remap.items()]) + x['text'], axis=1)

for i, itrow in enumerate(df_test.iterrows()):
    print(f'>> Processing row {ii} out of {len(df_test)}', end='\r')

    # Get text from entry
    #text = row['text']

    # Gets neigbhour from cb using index
    hadm_idx, case = itrow
    sims, neigh_idx = cbr.cb.get_neighbours(X_test[i][np.newaxis, :], k=1, get_solutions=False)
    neigh_idx = neigh_idx[0, 0]
    similar_case = df_cb.iloc[neigh_idx]

    # We only want the text and mortality from the similar case
    sc_text = similar_case['text']
    sc_dies = similar_case['DIES']

    # Apply prepro to remove clutter
    # <Nothing to do>

    # Truncate middle if resulting text is longer than max_chars
    if len(sc_text) > max_chars:
        print(f'(i) Text exceeds the max char limit ({len(sc_text)}) in entry {similar_case.index}. Middle-truncating to {max_chars}...')
        sc_text = sc_text[:(max_chars//2)] + sc_text[-(max_chars//2):]
        print(f'... Result truncate: {len(sc_text)}')

    # Wrap into appropriate format
    formatted_input = json.dumps({'REPORT': sc_text})

    # Prepare query
    data = {'model': model,  # Explicit model to use
        'options': {
            'num_ctx': n_ctx * 1024,
            'temperature': temp,
            'seed': SEED,
            'top_k': top_k,
            'top_p': top_p
            },
        'keep-alive': -1,  # Keep connection open
        'prompt': formatted_input,
        'stream': False,  # Wait and return all the result at once
        'format': {    
        'type': 'object',
        'properties': {
            'SUMMARY': {
                'type': 'string'
            }
        },
        'required': [
            'SUMMARY'
        ]
        }
    }
    # Prepares query
    data = json.dumps(data)
    cookies = {
        '_oauth2_proxy': auth_cookie}
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    response = requests.post(instance, cookies=cookies, headers=headers, data=data)
    json_response = json.loads(response.text)['response']
    dict_response = json.loads(json_response)
    dict_response['DIES'] = 'YES' if sc_dies else 'NO' # Mortality outcome of neighbour
    responses[hadm_idx] = dict_response # Keeps the dictionary version of the json response
    ii+=1

(i) Text exceeds the max char limit (37342) in entry Index(['note_id', 'subject_id', 'charttime', 'text', 'gender', 'dod',
       'anchor_age', 'anchor_year', 'admittime', 'admission_type', 'insurance',
       'marital_status', 'race', 'diagnose_group_description', 'drg_mortality',
       'diagnose_group_mortality', 'age', 'delta_days_dod', 'DIES'],
      dtype='object'). Middle-truncating to 25000...
... Result truncate: 25000
>> Processing row 100 out of 100

In [10]:
# Export summarizations
df_responses = pd.DataFrame(responses).T
summary_id = f'neighbour_summary_{max_chars}mc_ss{subsamp_size}'
df_responses.to_csv(f'{mimicpath}/summaries/{summary_id}.csv')