# Create datasets with question templates
Load a Hugging Face dataset and, with help from Gemini, build a new dataset of multiple-choice questions from a question template, to be used for training the nano VLM.

In [4]:
!git clone https://huggingface.co/datasets/itsanmolgupta/mimic-cxr-dataset

Cloning into 'mimic-cxr-dataset'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 12 (delta 1), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (12/12), 3.62 KiB | 1.21 MiB/s, done.
Filtering content: 100% (2/2), 756.16 MiB | 44.54 MiB/s, done.


In [None]:
import pandas as pd
from google import genai
import getpass

In [6]:
# Read the parquet data from the cloned repository's data directory into one
# DataFrame; later cells read its `image`, `impression` and `findings` columns.
df = pd.read_parquet(path="mimic-cxr-dataset/data")

In [25]:
# Ask for the API key interactively (input is hidden) so it is never written
# into the notebook source. The explicit prompt replaces getpass's generic
# "Password: " so it is clear which secret is expected.
client = genai.Client(api_key=getpass.getpass("Gemini API key: "))

··········


In [92]:
def _extract_fenced_payload(raw_text):
  """Return the content of the first Markdown code fence found in ``raw_text``.

  A fence tagged ``json`` is preferred; otherwise the first bare fence is used.
  A missing closing fence is tolerated (everything after the opening fence is
  returned). When there is no fence at all, ``raw_text`` is returned unchanged.
  """
  json_fence = "```json"
  bare_fence = "```"

  start = raw_text.find(json_fence)
  if start != -1:
    start += len(json_fence)
  else:
    start = raw_text.find(bare_fence)
    if start == -1:
      # No fence: hand the reply back untouched.
      return raw_text
    start += len(bare_fence)

  end = raw_text.find(bare_fence, start)
  payload = raw_text[start:] if end == -1 else raw_text[start:end]
  return payload.strip()


def call_gemini(source_type, text, model="gemini-2.0-flash"):
  """Ask Gemini for a multiple-choice question about a radiology report excerpt.

  Uses the module-level ``client`` created in an earlier cell.

  Args:
    source_type: Name of the report section the text comes from; it is
      interpolated into the prompt (the notebook passes "impression" or
      "findings").
    text: The report text the question must be based on.
    model: Gemini model name passed to ``generate_content``.

  Returns:
    A string. If the reply contains a Markdown code fence, the stripped fence
    content (expected to be a JSON object with "question" and "answer" keys,
    as requested by the prompt; it is not parsed or validated here). If the
    reply has no fence, the reply text unchanged. If the response carries no
    text, an empty string.
  """
  # The example output is wrapped in a *closed* ```json fence so the format the
  # model is shown matches what the extraction step looks for.
  prompt_template = '''
  You are a board-certified radiologist helping to train medical students in interpreting chest X-rays.

  Given the following {source_type} text from a radiology report:

  "{text}"

  Generate a **multiple-choice question** related to the key medical insights or diagnoses implied by this text. The question should test clinical understanding or interpretation.

  **Instructions:**
  - Ask one clear, focused question based on the text
  - Provide **4 distinct answer choices** and include them inside the question string of the JSON and separate them in an A), B), C), D) format
  - Clearly mark the **correct answer**
  - Ensure the question makes sense without needing the image, but reflects the medical content of the findings
  - The information needed to conclude the right answer NEEDS to be inside the given text, do not assume anything that is not inside the text

  **Return your output in the following JSON format:**

  ```json
  {{
    "question": "...",
    "answer": "..."
  }}
  ```
  '''

  filled_prompt = prompt_template.format(
    source_type = source_type,
    text = text
  )

  response = client.models.generate_content(
    model=model, contents=filled_prompt
  )

  # Keep the reply in its own name instead of overwriting the `text` argument.
  reply = response.text
  if reply is None:
    # Presumably happens when the response has no text part (e.g. a blocked
    # reply) - confirm against the SDK; avoid a TypeError in that case.
    return ""

  return _extract_fenced_payload(reply)


In [94]:
# Build one record per (image, report section): the image is paired with the
# question/answer string Gemini generates from that section's text.
dataset = []

# Only the first rows returned by df.head() are processed here.
for row in df.head().itertuples():
  # Same order as before: the impression question first, then the findings one.
  for source_type in ("impression", "findings"):
    report_text = getattr(row, source_type)

    # Skip missing or blank sections: otherwise the prompt would be filled with
    # "nan" or an empty quote and the model would have no source text to base
    # the question on.
    if pd.isna(report_text) or not str(report_text).strip():
      continue

    dataset.append({
      "image": row.image,
      "texts": call_gemini(source_type, report_text)
    })

final_df = pd.DataFrame(dataset)

In [107]:
# Write the generated image/question records to a parquet file in the working directory.
final_df.to_parquet(path="data.parquet")