In [None]:
# Authenticate this Colab session with a Google Cloud account (an interactive
# prompt will pop up). The BigQuery and Cloud Storage clients created in later
# cells presumably rely on these credentials — confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Install BigQuery and GCS dependencies
# NOTE(review): versions are not pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was developed against.
!pip install --quiet google-cloud-bigquery google-cloud-storage pandas

# Import required libraries
from google.cloud import bigquery, storage
import pandas as pd
import os

# GCP project and BigQuery dataset referenced by the clients and the
# discharge-notes query below
PROJECT_ID = "ai-healthcare-463514"
BQ_DATASET = "mimic_data"

In [None]:
# Initialize BigQuery client bound to the configured project
bq_client = bigquery.Client(project=PROJECT_ID)

# Sample query for discharge notes: pulls the patient id, admission id and
# note text for five rows of the `discharge` table.
# NOTE(review): there is no ORDER BY, so which five rows come back is
# presumably not guaranteed to be stable between runs — confirm if it matters.
query = f"""
SELECT subject_id, hadm_id, text
FROM `{PROJECT_ID}.{BQ_DATASET}.discharge`
LIMIT 5
"""

# Run the query and materialize the result as a pandas DataFrame for preview
discharge_df = bq_client.query(query).to_dataframe()
discharge_df.head()

Unnamed: 0,subject_id,hadm_id,text
0,10003299,27373340,\nName: ___ Unit No: ___\n...
1,10010231,28743978,\nName: ___ Unit No: ___\n...
2,10025268,24915440,\nName: ___ Unit No: ___\n ...
3,10047700,27410784,\nName: ___ Unit No: ___\...
4,10067539,27995837,\nName: ___ Unit No: ___...


In [None]:
# Cloud Storage location of the Synthea export and the CSVs to fetch from it
bucket_name = "synthea-gestrada"
folder_path = "synthea"
csv_files = ["patients.csv", "conditions.csv", "medications.csv"]

# Storage client for the project (nothing is mounted; objects are copied locally)
gcs_client = storage.Client(project=PROJECT_ID)
synthea_bucket = gcs_client.bucket(bucket_name)

# Download each object into the working directory under its own file name
for csv_file in csv_files:
    blob = synthea_bucket.blob(f"{folder_path}/{csv_file}")
    blob.download_to_filename(csv_file)

# Preview the patients table
patients_df = pd.read_csv("patients.csv")
patients_df.head()

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,e2087a14-e3ec-d556-05a0-06956cbe5902,2007-04-03,,999-59-8852,S99965203,,Ms.,Eve68,Cristen212,Donnelly343,...,Palmer,Massachusetts,Hampden County,25013.0,1095,42.253058,-72.345153,57934.77,0.0,123160
1,0932dc47-79f1-39c4-b60b-7ef33d23bec2,2016-11-17,,999-47-6417,,,,Miki234,Shandra823,Davis923,...,Pittsfield,Massachusetts,Berkshire County,25003.0,1201,42.423793,-73.313008,1812.82,21266.58,1371
2,c53132d1-e22c-df16-3451-0de8b0244e69,1981-09-18,,999-41-6400,S99921597,X72834363X,Ms.,Tatum703,Amee396,Dietrich576,...,Danvers,Massachusetts,Essex County,25009.0,1937,42.621877,-70.981753,291411.7,503097.69,156004
3,632dd60e-2e30-cdea-9915-be3092d5b53f,1968-10-14,,999-97-2664,S99922027,X13469899X,Mrs.,Deidre679,Meggan475,O'Connell601,...,Dighton,Massachusetts,Bristol County,,0,41.833599,-71.147184,512838.4,287825.94,102324
4,302bcf9d-f7fa-ab7c-d239-dd1197429942,2019-03-06,,999-86-8041,,,,Elton404,Neville893,Goldner995,...,Blackstone,Massachusetts,Worcester County,,0,42.029054,-71.577584,14338.47,680.31,41452


In [None]:
# Load the conditions CSV that was downloaded from GCS and preview its first rows
conditions_df = pd.read_csv("conditions.csv")
conditions_df.head()


Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,SYSTEM,CODE,DESCRIPTION
0,2015-03-31,2017-04-11,e2087a14-e3ec-d556-05a0-06956cbe5902,e3c2b2d1-e696-0677-53f8-b70205e588eb,http://snomed.info/sct,314529007,Medication review due (situation)
1,2016-09-09,2016-10-02,e2087a14-e3ec-d556-05a0-06956cbe5902,2c78bd75-2de7-7ecc-4872-1963731e8ff8,http://snomed.info/sct,312608009,Laceration - injury (disorder)
2,2016-09-09,2016-10-02,e2087a14-e3ec-d556-05a0-06956cbe5902,2c78bd75-2de7-7ecc-4872-1963731e8ff8,http://snomed.info/sct,284551006,Laceration of foot (disorder)
3,2016-11-17,2016-11-17,0932dc47-79f1-39c4-b60b-7ef33d23bec2,badfd8da-534b-a128-3133-c787dc561adb,http://snomed.info/sct,314529007,Medication review due (situation)
4,2016-10-14,2016-10-29,e2087a14-e3ec-d556-05a0-06956cbe5902,b845b65e-e6fc-8b7f-3cbc-e8e051269201,http://snomed.info/sct,10509002,Acute bronchitis (disorder)


In [None]:

# Load the medications CSV that was downloaded from GCS and preview its first rows
medications_df = pd.read_csv("medications.csv")
medications_df.head()

Unnamed: 0,START,STOP,PATIENT,PAYER,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,PAYER_COVERAGE,DISPENSES,TOTALCOST,REASONCODE,REASONDESCRIPTION
0,2016-09-09T11:10:14Z,2016-10-02T11:10:14Z,e2087a14-e3ec-d556-05a0-06956cbe5902,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,2c78bd75-2de7-7ecc-4872-1963731e8ff8,313820,Acetaminophen 160 MG Chewable Tablet,45.86,0.0,1,45.86,,
1,2016-10-14T17:27:02Z,2016-10-29T17:27:02Z,e2087a14-e3ec-d556-05a0-06956cbe5902,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,b845b65e-e6fc-8b7f-3cbc-e8e051269201,313782,Acetaminophen 325 MG Oral Tablet,85.07,0.0,1,85.07,10509002.0,Acute bronchitis (disorder)
2,2020-05-05T10:07:37Z,2020-05-14T13:07:37Z,e2087a14-e3ec-d556-05a0-06956cbe5902,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,28502f0d-e9c6-7303-a8a9-f11ebdc4fde3,834061,Penicillin V Potassium 250 MG Oral Tablet,227.0,0.0,1,227.0,43878008.0,Streptococcal sore throat (disorder)
3,2021-05-18T12:19:13Z,2021-05-18T12:19:13Z,e2087a14-e3ec-d556-05a0-06956cbe5902,26aab0cd-6aba-3e1b-ac5b-05c8867e762c,743955f0-6883-1a4d-6840-bb79074575ae,1535362,sodium fluoride 0.0272 MG/MG Oral Gel,129.94,0.0,1,129.94,103697008.0,Patient referral for dental care (procedure)
4,2018-10-08T21:59:16Z,2018-10-22T21:59:16Z,0932dc47-79f1-39c4-b60b-7ef33d23bec2,df166300-5a78-3502-a46a-832842197811,a923f3e7-f8ee-d846-e5c4-049970034690,308192,Amoxicillin 500 MG Oral Tablet,61.01,11.01,1,61.01,,


In [None]:
# Print the column names of both frames; the merge further down joins
# conditions_df["PATIENT"] against patients_df["Id"], so both must be present.
print(conditions_df.columns)
print(patients_df.columns)

Index(['START', 'STOP', 'PATIENT', 'ENCOUNTER', 'SYSTEM', 'CODE',
       'DESCRIPTION'],
      dtype='object')
Index(['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
       'FIRST', 'MIDDLE', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE',
       'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE',
       'COUNTY', 'FIPS', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES',
       'HEALTHCARE_COVERAGE', 'INCOME'],
      dtype='object')


In [None]:
# Example: build a handful of diagnosis QA pairs from Synthea conditions + patients

# Attach the patient record to every condition row (conditions.PATIENT == patients.Id)
merged_df = conditions_df.merge(patients_df, left_on="PATIENT", right_on="Id", how="inner")


def _diagnosis_example(record):
    """Turn one merged condition/patient row into an extractive-QA dict."""
    diagnosis = record['DESCRIPTION']
    passage = f"Patient {record['Id']} was diagnosed with {diagnosis}."
    return {
        "context": passage,
        "question": "What diagnosis did the patient have?",
        "answer_text": diagnosis,
        # Character offset of the answer inside the context (-1 if absent)
        "answer_start": passage.find(diagnosis),
    }


# Only the first five merged rows are used for this preview
qa_examples = [_diagnosis_example(record) for _, record in merged_df.head(5).iterrows()]

qa_df = pd.DataFrame(qa_examples)
qa_df.head()

Unnamed: 0,context,question,answer_text,answer_start
0,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Medication review due (situation),64
1,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Laceration - injury (disorder),64
2,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Laceration of foot (disorder),64
3,Patient 0932dc47-79f1-39c4-b60b-7ef33d23bec2 w...,What diagnosis did the patient have?,Medication review due (situation),64
4,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Acute bronchitis (disorder),64


In [None]:
# Save to a local JSONL file: one JSON object per QA record, one record per line
qa_df.to_json("synthea_qa.jsonl", orient="records", lines=True)

# Upload to GCS: target bucket and object path for the QA dataset
output_bucket = "mimic-data-jgestrada"
destination_blob = "qa_datasets/synthea_qa.jsonl"

# Reuses the storage client created when the Synthea CSVs were downloaded
blob = gcs_client.bucket(output_bucket).blob(destination_blob)
blob.upload_from_filename("synthea_qa.jsonl")

print(f"Uploaded QA dataset to gs://{output_bucket}/{destination_blob}")

✅ Uploaded QA dataset to gs://mimic-data-jgestrada/qa_datasets/synthea_qa.jsonl


In [None]:
import pandas as pd
from datetime import datetime

# Load the CSVs (already downloaded from GCS)
patients_df = pd.read_csv("patients.csv")
conditions_df = pd.read_csv("conditions.csv")
medications_df = pd.read_csv("medications.csv")


def _make_qa_example(prefix, answer_text, suffix, question):
    """Build one extractive-QA record whose context is prefix + answer + suffix.

    Args:
        prefix: Context text that precedes the answer.
        answer_text: The answer span; converted to ``str`` before use.
        suffix: Context text that follows the answer.
        question: The question paired with the context.

    Returns:
        dict with keys ``context``, ``question``, ``answer_text`` and
        ``answer_start``. ``answer_start`` is ``len(prefix)``, i.e. the exact
        character offset at which the answer was inserted, so it cannot point
        at an accidental earlier match the way ``str.find`` could.
    """
    answer_text = str(answer_text)
    return {
        "context": f"{prefix}{answer_text}{suffix}",
        "question": question,
        "answer_text": answer_text,
        "answer_start": len(prefix),
    }


# Initialize final QA list
qa_examples = []

# ========== 1. DIAGNOSIS QA ========== #
# Inner join keeps only conditions whose PATIENT matches a known patient Id.
merged_diag = conditions_df.merge(patients_df, left_on="PATIENT", right_on="Id", how="inner")

# Rows without a description cannot yield an answer span, so they are skipped.
diag_rows = merged_diag.dropna(subset=["DESCRIPTION"])
for patient_id, description in zip(diag_rows["Id"], diag_rows["DESCRIPTION"]):
    qa_examples.append(_make_qa_example(
        f"Patient {patient_id} was diagnosed with ",
        description,
        ".",
        "What diagnosis did the patient have?",
    ))

# ========== 2. MEDICATION QA ========== #
# The join is used only to drop medications that reference an unknown patient.
merged_meds = medications_df.merge(patients_df, left_on="PATIENT", right_on="Id", how="inner")

med_rows = merged_meds.dropna(subset=["DESCRIPTION"])
for description in med_rows["DESCRIPTION"]:
    qa_examples.append(_make_qa_example(
        "The patient was prescribed ",
        description,
        ".",
        "What medication was the patient prescribed?",
    ))

# ========== 3. AGE QA ========== #
# Take "today" once so every patient is aged against the same reference date.
# NOTE(review): the age is relative to the run date even for patients that
# have a DEATHDATE, and it changes from run to run — confirm this is intended.
reference_date = pd.Timestamp('today')

for raw_birthdate in patients_df['BIRTHDATE']:
    birthdate = pd.to_datetime(raw_birthdate, errors='coerce')
    # Missing or unparseable birthdates come back as NaT; skip them explicitly
    # instead of relying on a swallowed exception.
    if pd.isna(birthdate):
        continue
    age = int((reference_date - birthdate).days / 365.25)
    qa_examples.append(_make_qa_example(
        "The patient is ",
        age,
        " years old.",
        "How old is the patient?",
    ))

In [None]:
# Save to JSONL: collect all generated QA records into a DataFrame and write
# them as one JSON object per line, replacing any earlier synthea_qa.jsonl
qa_df = pd.DataFrame(qa_examples)
qa_df.to_json("synthea_qa.jsonl", orient="records", lines=True)
print(f"Saved {len(qa_df)} QA pairs to synthea_qa.jsonl")

Saved 5944 QA pairs to synthea_qa.jsonl


In [None]:
# Upload to GCS (terminal or ! in Colab): copies the local JSONL file into the
# qa_datasets/ prefix of the mimic-data-jgestrada bucket using gsutil
!gsutil cp synthea_qa.jsonl gs://mimic-data-jgestrada/qa_datasets/

Copying file://synthea_qa.jsonl [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/1.3 MiB.                                      


## EPIC 2

In [None]:
# Install the Hugging Face libraries used for fine-tuning.
# NOTE(review): nothing is pinned here, while later cells reinstall
# transformers==4.41.1 / datasets==2.19.0 / evaluate==0.4.1 — consider pinning
# once, in this cell, to avoid the mid-notebook reinstall.
!pip install -q transformers datasets evaluate accelerate

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import pandas as pd

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m960.3 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Load your .jsonl file (assumes already downloaded from GCS or in local Colab).
# lines=True reads one JSON record per line, matching how the file was written.
qa_df = pd.read_json("synthea_qa.jsonl", lines=True)
qa_df.head()

Unnamed: 0,context,question,answer_text,answer_start
0,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Medication review due (situation),64
1,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Laceration - injury (disorder),64
2,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Laceration of foot (disorder),64
3,Patient 0932dc47-79f1-39c4-b60b-7ef33d23bec2 w...,What diagnosis did the patient have?,Medication review due (situation),64
4,Patient e2087a14-e3ec-d556-05a0-06956cbe5902 w...,What diagnosis did the patient have?,Acute bronchitis (disorder),64


In [None]:
# Wrap the QA DataFrame as a Hugging Face Dataset and hold out 10% of the
# examples for evaluation. The fixed seed makes the shuffle, and therefore the
# train/test membership, identical on every run.
dataset = Dataset.from_pandas(qa_df)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
# Load ClinicalBERT tokenizer and model.
# The recorded output warns that `qa_outputs.weight` / `qa_outputs.bias` are
# newly initialized, i.e. the question-answering head is untrained until the
# model is fine-tuned.
model_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Maximum tokens per feature (question + context window) and the number of
# tokens shared between consecutive windows of a long context.
max_length = 384
doc_stride = 128


def _locate_answer_tokens(offsets, sequence_ids, start_char, end_char):
    """Map a character span of the context onto token indices of one window.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for this window.
        sequence_ids: Per-token sequence id for this window; context tokens
            are the ones equal to ``1``.
        start_char: Inclusive character offset where the answer begins.
        end_char: Exclusive character offset where the answer ends.

    Returns:
        ``(start_token, end_token)`` when the whole answer lies inside the
        window's context tokens, otherwise ``None``.
    """
    context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return None
    first, last = context_tokens[0], context_tokens[-1]

    # The answer must be fully contained in this window; a partial overlap
    # cannot be labelled with both a start and an end token.
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return None

    start_token = None
    end_token = None
    for idx in range(first, last + 1):
        tok_start, tok_end = offsets[idx]
        if start_token is None and tok_start <= start_char < tok_end:
            start_token = idx
        if tok_start < end_char <= tok_end:
            end_token = idx
            break

    # Either boundary may fall between tokens (e.g. on whitespace); treat that
    # as "not found" rather than emitting half a label.
    if start_token is None or end_token is None:
        return None
    return start_token, end_token


def prepare_qa_features(examples):
    """Tokenize a batch of QA examples and attach answer start/end token labels.

    Long contexts are split into overlapping windows, so one input example can
    produce several features. Exactly one ``(start, end)`` label pair is
    emitted per feature; when the answer is not fully inside a window (or its
    boundaries cannot be aligned to tokens) both labels point at the CLS token.

    Args:
        examples: Batch mapping with list-valued ``question``, ``context``,
            ``answer_text`` and ``answer_start`` columns.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and ``start_positions`` /
        ``end_positions`` added.
    """
    # Tokenize questions and contexts; only the context (second sequence) is
    # truncated, and every window is padded to max_length.
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # Which original example each window came from, and per-token char offsets.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        cls_index = tokenized["input_ids"][i].index(tokenizer.cls_token_id)

        sample_idx = sample_mapping[i]
        answer_text = examples["answer_text"][sample_idx]
        start_char = examples["answer_start"][sample_idx]
        end_char = start_char + len(answer_text)

        span = _locate_answer_tokens(offsets, tokenized.sequence_ids(i), start_char, end_char)
        if span is None:
            # No usable answer in this window: label it with the CLS token.
            span = (cls_index, cls_index)

        start_positions.append(span[0])
        end_positions.append(span[1])

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

# Run prepare_qa_features over the dataset in batches and drop the original
# columns (taken from the train split) so only the function's outputs remain.
tokenized_dataset = dataset.map(prepare_qa_features, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/5349 [00:00<?, ? examples/s]

Map:   0%|          | 0/595 [00:00<?, ? examples/s]

In [None]:
# Remove the installed transformers package without prompting (-y), ahead of
# the pinned reinstall in the following cells
!pip uninstall -y transformers

Found existing installation: transformers 4.41.1
Uninstalling transformers-4.41.1:
  Successfully uninstalled transformers-4.41.1


In [None]:
# Delete any transformers directory left behind in the system dist-packages
# so the reinstall starts from a clean location
!rm -rf /usr/local/lib/python*/dist-packages/transformers

In [None]:
# Install pinned versions of transformers, datasets and evaluate (quiet mode)
!pip install transformers==4.41.1 datasets==2.19.0 evaluate==0.4.1 -q

In [None]:
# Same pinned install as the previous cell, with -U so pip also moves already
# installed packages to exactly these versions.
# NOTE(review): the recorded output reports dependency conflicts after this step.
!pip install -U "transformers==4.41.1" "datasets==2.19.0" "evaluate==0.4.1" -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.3.1 which is incompatible.
torch 

In [None]:
# Send signal 9 to the current Python process to terminate it; presumably used
# to force the Colab runtime to restart so the reinstalled packages are loaded.
# All in-memory variables are lost, so earlier cells must be re-run afterwards.
import os
os.kill(os.getpid(), 9)

In [None]:
# Confirm which transformers version the restarted kernel actually imports
import transformers
print(transformers.__version__)  # should be 4.41.1

4.41.1


In [None]:
# Import the training API.
# NOTE(review): in the recorded run this import fails with
# "cannot import name 'EncoderDecoderCache' from 'transformers'"; presumably
# another installed package expects a newer transformers than the pinned
# 4.41.1 — confirm and align the versions before relying on this cell.
from transformers import Trainer, TrainingArguments

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'EncoderDecoderCache' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [None]:
from transformers import Trainer
from transformers.training_args import TrainingArguments
import evaluate
import numpy as np

# Single source of truth for where checkpoints and logs are written.
output_dir = "clinicalbert-qa-synthea"

# Fine-tuning configuration: evaluate once per epoch, keep only the most
# recent checkpoint, log every 50 steps and disable external reporting.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    logging_dir=f"{output_dir}/logs",
    logging_steps=50,
    report_to="none"
)

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
cannot import name 'EncoderDecoderCache' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)