<a href="https://colab.research.google.com/github/fhsu4976/CS598/blob/main/CS598_finalproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab Notebook for Query-Focused EHR Extractive Summarization

This notebook demonstrates an end-to-end pipeline that uses a small subset of the preprocessed
MIMIC-III `.data` files to reproduce key experiments from
"Query-Focused Extractive Summarization of Electronic Health Records" (arXiv:2004.04645).


In [1]:
# ## 1. Setup
# Mount Google Drive, clone the official repo, install dependencies, and download spaCy model.

from google.colab import drive
import os, sys


# Force remount Google Drive to ensure fresh mount
drive.mount('/content/drive', force_remount=True)  # Force remount to avoid mountpoint errors

# Debug: List top-level of mounted Drive to verify folders
print("/content/drive contents:", os.listdir('/content/drive'))
print("/content/drive/MyDrive contents:", os.listdir('/content/drive/MyDrive'))

# Change to working directory
os.chdir('/content')

# Clone repository
target_repo = '/content/ehr-extraction-models'
!rm -rf {target_repo}
!git clone https://github.com/dmcinerney/ehr-extraction-models.git {target_repo}

# Replace the default requirements.txt with your custom one from Drive
# (assumes custom file at /content/drive/MyDrive/CS598/DLH/requirements.txt)
custom_req = '/content/drive/MyDrive/CS598/DLH/requirements.txt'
!cp {custom_req} {target_repo}/requirements.txt

# Install dependencies
!pip install -r {target_repo}/requirements.txt
!pip install -e git+https://github.com/dmcinerney/pytt.git@4a15322f696fe85a264dd4854fcdb82c9e801c06#egg=pytt

# Download spaCy English model
!python -m spacy download en_core_web_sm


Mounted at /content/drive
/content/drive contents: ['MyDrive', '.shortcut-targets-by-id', '.file-revisions-by-id', 'Shareddrives', '.Trash-0']
/content/drive/MyDrive contents: ['Colab Notebooks', 'Chat history for CS598.txt', 'IMG_3594.png', 'CS598']
Cloning into '/content/ehr-extraction-models'...
remote: Enumerating objects: 844, done.[K
remote: Counting objects: 100% (124/124), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 844 (delta 68), reused 111 (delta 60), pack-reused 720 (from 1)[K
Receiving objects: 100% (844/844), 3.00 MiB | 20.31 MiB/s, done.
Resolving deltas: 100% (492/492), done.
Collecting setuptools==68.0.0 (from -r /content/ehr-extraction-models/requirements.txt (line 1))
  Downloading setuptools-68.0.0-py3-none-any.whl.metadata (6.4 kB)
Collecting asgiref==3.2.3 (from -r /content/ehr-extraction-models/requirements.txt (line 3))
  Downloading asgiref-3.2.3-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting boto3==1.11.15 (from -r /content/

In [2]:
# ## 2. Data Preparation
# Create a small subset (e.g., 1,000 samples per split) of the preprocessed MIMIC-III .data files.

# %%
import gzip
import json
import os

# Locations on the mounted Google Drive (editable through the Colab form fields)
DATA_DIR = '/content/drive/MyDrive/CS598/DLH'      # @param {type:"string"}
SUBSET_DIR = '/content/drive/MyDrive/CS598/mimic_subset'       # @param {type:"string"}
MAX_SAMPLES = 1000                                      # @param {type:"integer"}

os.makedirs(SUBSET_DIR, exist_ok=True)

# For every split, copy at most MAX_SAMPLES lines of the gzip-compressed source file
# into an uncompressed text file of the same name inside SUBSET_DIR.
for split in ('train', 'val', 'test'):
    split_file = f"{split}.data"
    source_path = os.path.join(DATA_DIR, split_file)
    subset_path = os.path.join(SUBSET_DIR, split_file)
    with gzip.open(source_path, 'rt', encoding='utf-8') as reader, \
         open(subset_path, 'w', encoding='utf-8') as writer:
        # zip() stops as soon as the range is exhausted, which truncates the copy
        writer.writelines(row for _, row in zip(range(MAX_SAMPLES), reader))

print(f"Created subset files in {SUBSET_DIR}")


Created subset files in /content/drive/MyDrive/CS598/mimic_subset


In [46]:
%%bash
# Wrap the queue.put line in IteratorQueueWrapper.__next__ in try/except so a failing
# deepcopy no longer crashes iteration.
#
# The patch is guarded by a marker comment: without the guard a second run of this cell
# would match the already-inserted `self.iterators.put(...)` line and nest another
# `try:` around it, leaving the Python file syntactically invalid.
set -euo pipefail   # abort (and fail the cell) if the file is missing or sed errors out

FILE=/content/src/pytt/pytt/batching/standard_batch_iterator.py
MARKER='# colab-patch: ignore deepcopy failure'

if grep -qF "$MARKER" "$FILE"; then
    echo "IteratorQueueWrapper already patched; nothing to do."
    exit 0
fi

# `c\` replaces the whole matching line with the text that follows
# (the 8/12-space indentation is the same as in the previous version of this patch).
sed -i '/self\.iterators\.put(copy\.deepcopy(self\.last_iterator))/c\
        try:  # colab-patch: ignore deepcopy failure\
            self.iterators.put(copy.deepcopy(self.last_iterator))\
        except Exception:\
            pass' "$FILE"

echo "Patched IteratorQueueWrapper to skip deepcopy errors."


Patched IteratorQueueWrapper to skip deepcopy errors.


In [44]:
# Patch SubbatchIndicesIterator to avoid list fallback (use original iterator instead)
!sed -i "s/self.indices_iterator_lookahead = \[\]/self.indices_iterator_lookahead = self.indices_iterator/g" /content/src/pytt/pytt/batching/standard_batch_iterator.py


In [19]:
# ## 1.x Generate `vocab.txt` for ClinicalBERT
# If your ClinicalBERT weights directory is missing `vocab.txt`, you can download it directly from the Hugging Face Hub or extract it via the Transformers library.

# %%
from transformers import BertTokenizer

# Directory where your ClinicalBERT weights are stored
dir_path = '/content/drive/MyDrive/CS598/DLH/clinical-bert-weights/ClinicalBERT_pretraining_pytorch_checkpoint'  # update if needed

# Option 1: Download directly from the HF Hub
!wget -qO {dir_path}/vocab.txt \
    https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT/resolve/main/vocab.txt
print('Downloaded vocab.txt to', dir_path)

# Now `vocab.txt` should exist alongside your model weights.


Downloaded vocab.txt to /content/drive/MyDrive/CS598/DLH/clinical-bert-weights/ClinicalBERT_pretraining_pytorch_checkpoint


In [109]:
%%bash
# Turn every foo.data in the subset directory into a gzip stream that keeps the
# .data extension.
set -euo pipefail   # stop if the directory is missing or any step fails

DATA_DIR="/content/drive/MyDrive/CS598/mimic_subset"
cd "$DATA_DIR"

for f in *.data; do
  [ -e "$f" ] || continue          # the glob matched nothing

  # Skip files that already are valid gzip streams; compressing them again (e.g. when
  # this cell is re-run) would produce a doubly-compressed file. The file is tested
  # through stdin because it has no .gz suffix.
  if gzip -t < "$f" 2>/dev/null; then
    continue
  fi

  # 1. gzip-compress contents, foo.data → foo.data.gz
  gzip -f --no-name "$f"
  # 2. rename foo.data.gz → foo.data so extension is unchanged
  mv "$f.gz" "$f"
done

echo "Files in $DATA_DIR are now gzipped streams with a .data extension:"
ls -lh *.data | sed 's/^/  /'


# Show the first two bytes of one of the files (1f8b is the gzip magic number)
head -c 2 "$DATA_DIR/val.data" | xxd



Files in /content/drive/MyDrive/CS598/mimic_subset are now gzipped streams with a .data extension:
  -rw------- 1 root root 25K May  5 22:06 test.data
  -rw------- 1 root root 15K May  5 22:06 train.data
  -rw------- 1 root root 14K May  5 22:06 val.data
00000000: 1f8b                                     ..


In [118]:
%%bash
# Overwrite the repository's processing/dataset.py with a version that loads the
# gzipped JSON-lines .data files. The heredoc delimiter is quoted, so the Python
# source below is written verbatim (no shell expansion).
cat > /content/ehr-extraction-models/processing/dataset.py << 'EOF'
import pandas as pd
from pytt.preprocessing.raw_dataset import RawDataset


class Dataset(RawDataset):
    """RawDataset backed by a DataFrame whose columns hold serialized Python objects."""

    def __init__(self, df):
        self.df = df

    def __getitem__(self, i):
        """Return item ``i`` with 'reports', 'targets' and 'labels' decoded.

        The item is obtained from RawDataset.__getitem__ (presumably a dict built
        from row ``i`` of ``self.df`` -- confirm against pytt).
        """
        dictionary = super(Dataset, self).__getitem__(i)
        # SECURITY NOTE: eval() executes whatever expression the field contains.
        # Only load .data files from a trusted source. Switching to
        # ast.literal_eval would be safer but is left as a follow-up because it
        # rejects non-literal values that may occur in the serialized fields.
        # reconstruct the reports DataFrame
        dictionary['reports'] = pd.DataFrame(eval(dictionary['reports']))
        dictionary['reports']['date'] = pd.to_datetime(dictionary['reports']['date'])
        # parse list fields
        dictionary['targets'] = eval(dictionary['targets'])
        dictionary['labels'] = eval(dictionary['labels'])
        return dictionary


def init_dataset(filename, limit_rows=None):
    """
    Load the dataset from a gzipped JSON-lines file (the .data format).

    filename:   path of the gzip-compressed JSON-lines file.
    limit_rows: when not None, keep only the first ``limit_rows`` rows.
    Returns a Dataset wrapping the loaded DataFrame.
    """
    df = pd.read_json(filename, lines=True, compression='gzip')
    if limit_rows is not None:
        df = df.head(limit_rows)
    return Dataset(df)


def split_dataset(filename, split=0.9, random_state=None):
    """
    Load the dataset from a gzipped JSON-lines file, shuffle, and split.

    filename:     path of the gzip-compressed JSON-lines file.
    split:        fraction of the rows that goes into the first dataset.
    random_state: forwarded to DataFrame.sample; pass a seed for a reproducible
                  split. The default (None) keeps the previous unseeded shuffle.
    Returns (first, second): two Dataset objects holding round(split * n) rows
    and the remaining rows respectively.
    """
    df = pd.read_json(filename, lines=True, compression='gzip')
    df = df.sample(frac=1, random_state=random_state)
    n = int(round(split * len(df)))
    return Dataset(df.iloc[:n]), Dataset(df.iloc[n:])
EOF

echo "✅ dataset.py rewritten to read JSON‑lines instead of CSV."


✅ dataset.py rewritten to read JSON‑lines instead of CSV.


In [119]:
%%bash
# Raise Python's CSV field-size limit inside dataset.py by prepending two lines to the
# very top of the file (i.e. before its first line).
set -euo pipefail   # fail the cell if dataset.py is missing or sed errors out

FILE=/content/ehr-extraction-models/processing/dataset.py

# Guarded so that re-running the cell does not prepend the same lines again.
if grep -qF 'csv.field_size_limit(sys.maxsize)' "$FILE"; then
    echo "✅ CSV field_size_limit already increased in dataset.py"
else
    sed -i "1s|^|import sys, csv\ncsv.field_size_limit(sys.maxsize)\n|" "$FILE"
    echo "✅ Increased CSV field_size_limit in dataset.py"
fi


✅ Increased CSV field_size_limit in dataset.py


In [120]:
# ## 3. Training
# Train the sentence-attention model on the small subset.

# Path to your ICD code graph pickle (adjust if necessary)
# CODE_GRAPH_FILE = "'/content/synthetic_hierarchy_dict.pkl'"
CODE_GRAPH_FILE = "'/content/drive/MyDrive/CS598/DLH/code_graph.pkl'"


!mkdir -p /content/supervised
!cp /content/drive/MyDrive/CS598/mimic_subset/val.data /content/supervised/supervised.data

In [124]:
!python {target_repo}/train.py \
  --data_dir {DATA_DIR} \
  --code_graph_file {CODE_GRAPH_FILE} \
  --supervised_data_dir /content/supervised \
  --device cpu \
  code_supervision

2025-05-06 03:22:25.017766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746501745.062664   78810 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746501745.076120   78810 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-06 03:22:25.131340: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
^C


In [None]:
# ## 4. Evaluation
# Evaluate on the validation split and parse the resulting metrics.

!python test.py \
  --data_dir /content/drive/MyDrive/mimic_subset \
  sentence_attention checkpoints/sentence_attention_small

# Parse scores.txt
metrics = {}
with open('checkpoints/sentence_attention_small/scores.txt') as f:
    for line in f:
        if ':' in line:
            k, v = line.strip().split(':')
            try:
                metrics[k] = float(v)
            except:
                pass
print("Validation metrics:", metrics)

In [None]:
# ## 5. Unsupervised Baselines
# Implement TF-IDF and Clinical BERT similarity baselines on the same subset.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score

# Load validation data.
# The subset files are first written as plain text and later re-compressed in place
# (gzip stream, unchanged .data extension), so the format is detected from the gzip
# magic number instead of being assumed.
val_path = os.path.join(SUBSET_DIR, 'val.data')
with open(val_path, 'rb') as probe:
    is_gzip = probe.read(2) == b'\x1f\x8b'
open_text = gzip.open if is_gzip else open

val_items = []
with open_text(val_path, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= MAX_SAMPLES:
            break
        val_items.append(json.loads(line))

# Example: Extract sentences and query text for TF-IDF
# NOTE(review): the 'sentences' and 'queries' keys are assumed; dataset.py in this
# notebook only reads 'reports', 'targets' and 'labels' -- confirm the record schema.
sentences = [ ' '.join(item['sentences']) for item in val_items ]
# Queries are taken from the first record only.
queries = [ q_text for q_text in val_items[0]['queries'].values() ]

# TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(sentences)
Q = vectorizer.transform(queries)

# Compute cosine similarities and evaluation metrics (placeholder)
# ... your code here ...
print("TF-IDF baseline evaluation placeholder")

In [None]:
# ## 6. Interactive Demo
# Use `interface.py` to load a trained checkpoint and visualize attention weights for sample queries.

# interface.py is part of the cloned repository; the working directory is /content, so
# the repository has to be put on sys.path before the import can succeed.
if target_repo not in sys.path:
    sys.path.insert(0, target_repo)

# NOTE(review): the QueryInterface constructor arguments and summarize() signature are
# assumed here -- confirm against interface.py in the repository.
from interface import QueryInterface

# Initialize interface
qi = QueryInterface(
    model_type='sentence_attention',
    checkpoint_path='checkpoints/sentence_attention_small'
)

# Example EHR text and query (first validation record, first query key)
ehr_text = val_items[0]['sentences']
query_key = list(val_items[0]['queries'].keys())[0]
summary, attention = qi.summarize(ehr_text, query_key)
print("Query:", query_key)
print("Extracted summary:", summary)
print("Attention scores:", attention)
