In [1]:
from datasets import load_dataset

ds = load_dataset("eltorio/ROCOv2-radiology")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torchvision import models, transforms


In [3]:
from PIL import Image

In [4]:
import torch

In [5]:
# Inspect the dataset structure
print(ds)
print(ds['train'][0])  # Print the first example in the training set

DatasetDict({
    train: Dataset({
        features: ['image', 'image_id', 'caption', 'cui'],
        num_rows: 59962
    })
    validation: Dataset({
        features: ['image', 'image_id', 'caption', 'cui'],
        num_rows: 9904
    })
    test: Dataset({
        features: ['image', 'image_id', 'caption', 'cui'],
        num_rows: 9927
    })
})
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=682x748 at 0x7F2198845F00>, 'image_id': 'ROCOv2_2023_train_000001', 'caption': 'Head CT demonstrating left parotiditis.', 'cui': ['C0040405']}


In [6]:
# Extract images and captions from the dataset
images = [example["image"] for example in ds["train"]]
captions = [example["caption"] for example in ds["train"]]

# Print the first 5 captions to verify
print(captions[:5])

['Head CT demonstrating left parotiditis.', 'Acquired renal cysts in end-stage renal failure: 16-year-old girl with Alport syndrome and peritoneal dialysis from the age of 2\xa0years', 'Computed tomography of the chest showing the right breast nodule with irregular margins', 'Lateral view of the sacrum showing the low contrast between bone and soft tissue.', 'Thoracic CT scan showing perihilar pulmonary lymphadenomegaly']


In [7]:
import wandb
import psutil
import GPUtil
print(GPUtil.getGPUs())


[<GPUtil.GPUtil.GPU object at 0x7f2198845ea0>, <GPUtil.GPUtil.GPU object at 0x7f1e86e59480>, <GPUtil.GPUtil.GPU object at 0x7f2198845de0>, <GPUtil.GPUtil.GPU object at 0x7f1e86cbc910>, <GPUtil.GPUtil.GPU object at 0x7f1e86cbc490>, <GPUtil.GPUtil.GPU object at 0x7f1e86cbc4f0>]


In [8]:

# Get all available GPUs
gpus = GPUtil.getGPUs()

# Print details for each GPU
for gpu in gpus:
    print(f"GPU ID: {gpu.id}")
    print(f"Name: {gpu.name}")
    print(f"Load: {gpu.load * 100:.2f}%")
    print(f"Memory Free: {gpu.memoryFree}MB")
    print(f"Memory Used: {gpu.memoryUsed}MB")
    print(f"Memory Total: {gpu.memoryTotal}MB")
    print(f"Temperature: {gpu.temperature}°C")
    print("-" * 40)

GPU ID: 0
Name: Tesla P100-PCIE-12GB
Load: 0.00%
Memory Free: 13.0MB
Memory Used: 12180.0MB
Memory Total: 12288.0MB
Temperature: 34.0°C
----------------------------------------
GPU ID: 1
Name: Tesla P100-PCIE-16GB
Load: 0.00%
Memory Free: 16274.0MB
Memory Used: 2.0MB
Memory Total: 16384.0MB
Temperature: 38.0°C
----------------------------------------
GPU ID: 2
Name: Tesla P100-PCIE-12GB
Load: 0.00%
Memory Free: 12191.0MB
Memory Used: 2.0MB
Memory Total: 12288.0MB
Temperature: 36.0°C
----------------------------------------
GPU ID: 3
Name: Tesla P100-PCIE-12GB
Load: 0.00%
Memory Free: 12191.0MB
Memory Used: 2.0MB
Memory Total: 12288.0MB
Temperature: 31.0°C
----------------------------------------
GPU ID: 4
Name: Tesla P100-PCIE-16GB
Load: 0.00%
Memory Free: 16274.0MB
Memory Used: 2.0MB
Memory Total: 16384.0MB
Temperature: 31.0°C
----------------------------------------
GPU ID: 5
Name: Tesla P100-PCIE-16GB
Load: 0.00%
Memory Free: 16274.0MB
Memory Used: 2.0MB
Memory Total: 16384.0MB
Temp

In [9]:
import os

# Set GPU ID 4 as the visible device
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [10]:
import numpy as np
import json

In [11]:
# Initialize wandb
wandb.init(project="medical-image-captioning", name="feature-extraction_testrun2")

# Log configuration
wandb.config.update({
    "model": "ResNet50",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "image_size": (224, 224),
    "normalization_mean": [0.485, 0.456, 0.406],
    "normalization_std": [0.229, 0.224, 0.225]
})

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load a pre-trained ResNet model and move it to the GPU
resnet = models.resnet50(pretrained=True).to(device)
resnet.eval()

# Define a transformation pipeline for the images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to extract features from an image
def extract_features(image):
    # Ensure the image is in RGB format
    image = image.convert("RGB")
    
    # Apply transformations
    image_tensor = transform(image).unsqueeze(0).to(device)  # Add batch dimension and move to GPU
    
    # Extract features using the model
    with torch.no_grad():
        features = resnet(image_tensor)
    
    return features.squeeze().cpu().numpy()  # Move back to CPU for further processing

# Initialize lists to store features and captions
all_features = []
all_captions = []

# Extract features for all images and log to wandb
for idx, (image, caption) in enumerate(zip(images, captions)):
    features = extract_features(image)
    all_features.append(features)
    all_captions.append(caption)
    
    # Log features and captions to wandb
    wandb.log({
        "image_index": idx,
        "caption": caption,
        "features": features.tolist(),  # Log features as a list
        "image": wandb.Image(image, caption=caption)
    })

# Save features and captions to local files
np.save("features.npy", np.array(all_features))  # Save features as a NumPy array
with open("captions.json", "w") as f:
    json.dump(all_captions, f)  # Save captions as a JSON file

print("Features and captions saved locally.")

# Finish wandb run
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmedtekki[0m ([33mmedtekki-university-of-stavanger[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using device: cuda




Features and captions saved locally.


0,1
image_index,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇████

0,1
caption,Right shoulder of a ...
image_index,59961


In [12]:
import os
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report


In [13]:
api = wandb.Api()
runs = api.runs("medtekki-university-of-stavanger/medical-image-captioning")
for run in runs:
    print(f"Run Name: {run.name}, Run ID: {run.id}")

Run Name: feature-extraction, Run ID: stdmmc54
Run Name: feature-extraction2, Run ID: htt1q4t7
Run Name: feature-extraction3, Run ID: pon7o8gv
Run Name: feature-extraction_ichi, Run ID: sgqh7c5b
Run Name: feature-extraction_ni, Run ID: tcm61qyd
Run Name: feature-extraction_ni, Run ID: zp4419tw
Run Name: feature-extraction_san, Run ID: so58vuk3
Run Name: feature-extraction_ni, Run ID: y8i2dwf9
Run Name: feature-extraction_chi, Run ID: lity27dr
Run Name: feature-extraction_go, Run ID: ompi5td7
Run Name: feature-extraction_testrun, Run ID: 6lunze22
Run Name: feature-extraction_testrun2, Run ID: pr9gob5i
Run Name: decision-tree-mps, Run ID: k1a5rxte
Run Name: decision-tree-mps_ichi, Run ID: s3yvrg1s
Run Name: decision-tree-mps_ni, Run ID: y2dojsyl


In [14]:
run = api.run(f"medtekki-university-of-stavanger/medical-image-captioning/sgqh7c5b")
for log in run.history():
    print(log)

_runtime
features
_timestamp
image_index
_step
image
caption


In [17]:
print(features.shape)

(1000,)


In [None]:
import wandb
import numpy as np
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib

# Initialize W&B
wandb.init(project="medical-image-captioning", name="decision-tree-classifier_go4")

# Load the saved features and captions
features = np.load("features.npy")  # Load features from the .npy file
with open("captions.json", "r") as f:
    captions = json.load(f)  # Load captions from the .json file

# Step 1: Convert captions to numerical labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(captions)

# Log label classes
wandb.config.label_classes = label_encoder.classes_.tolist()

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Step 3: Train the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Step 4: Evaluate the Classifier
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)

# Log metrics to W&B
wandb.log({"accuracy": accuracy, "classification_report": report})

# Step 5: Log confusion matrix
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=label_encoder.classes_, cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
wandb.log({"confusion_matrix": wandb.Image(plt)})
plt.close()

# Optional: Save and log the trained model
joblib.dump(clf, "decision_tree_model.pkl")
wandb.save("decision_tree_model.pkl")

# Finish W&B run
wandb.finish()



## TF-IDF + KMeans Clustering

In [3]:
import json
import numpy as np
# Load the saved features and captions
features = np.load("features.npy")  # Load features from the .npy file
with open("captions.json", "r") as f:
    captions = json.load(f)  # Load captions from the .json file

# Verify the loaded captions
print(f"Number of captions loaded: {len(captions)}")
print("First 5 captions:", captions[:5])

Number of captions loaded: 59962
First 5 captions: ['Head CT demonstrating left parotiditis.', 'Acquired renal cysts in end-stage renal failure: 16-year-old girl with Alport syndrome and peritoneal dialysis from the age of 2\xa0years', 'Computed tomography of the chest showing the right breast nodule with irregular margins', 'Lateral view of the sacrum showing the low contrast between bone and soft tissue.', 'Thoracic CT scan showing perihilar pulmonary lymphadenomegaly']


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Convert captions to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(captions)

# Cluster captions into k groups
k = 30  # Choose the number of clusters (adjust based on your dataset)
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

# Map captions to their clusters
clustered_captions = {caption: cluster for caption, cluster in zip(captions, clusters)}

# Replace captions with cluster labels
labels = clusters
print(f"Number of clusters: {k}")

Number of clusters: 30


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import joblib

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train a Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate the classifier
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Plot confusion matrix
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

# Save the trained model
joblib.dump(clf, "decision_tree_model.pkl")
print("Model saved as decision_tree_model.pkl")

Step 3: Set Up LLaVA
LLaVA requires a model checkpoint. You can use the ollama library to download and run the model locally.

Download the LLaVA Model:

In [None]:
ollama pull llava

Start the LLaVA Server: Run the following command to start the LLaVA server:

In [None]:
ollama serve

Step 4: Predict Captions for Images
Use the ollama Python API to send image data to the LLaVA model and get captions.