# X-Ray Report Generator with Vision-Language Model in Colab (Enhanced)

In [None]:
!pip install transformers datasets torchvision evaluate -q

In [None]:
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image, ImageOps
import matplotlib.pyplot as plt
import os

In [None]:
# One identifier shared by the model, the image processor and the tokenizer,
# so the three components always come from the same checkpoint.
checkpoint_id = "nlpconnect/vit-gpt2-image-captioning"

# Use the GPU when torch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(checkpoint_id)
feature_extractor = ViTImageProcessor.from_pretrained(checkpoint_id)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint_id)

# Move the weights to the selected device, then switch to evaluation mode
# because the notebook only runs inference.
model.to(device)
model.eval()

In [None]:
def preprocess_image(image_path, to_grayscale=False, resize=(224, 224)):
    """Load an image from disk and return it as an RGB PIL image.

    Args:
        image_path: Path of the image file to open.
        to_grayscale: When true, collapse the image to grayscale and convert
            it back to mode "RGB", so the result still has three channels.
        resize: Size tuple handed to ``Image.resize`` (width, height). Any
            falsy value (``None``, ``()``) keeps the original dimensions.

    Returns:
        A new ``PIL.Image.Image`` in mode "RGB".

    Raises:
        OSError: Propagated from ``Image.open`` when the file is missing or
            cannot be read as an image.
    """
    # Image.open is lazy and may keep the file handle open. convert() reads
    # the pixel data and returns a separate copy, so the handle can be
    # released as soon as the conversion is done.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    if to_grayscale:
        image = ImageOps.grayscale(image).convert("RGB")
    if resize:
        image = image.resize(resize)
    return image

In [None]:
def generate_report(image):
    """Caption *image* with the loaded model and wrap the text in a report.

    The caption is produced by the module-level ``model`` using beam search
    (4 beams, at most 64 tokens) and decoded with ``tokenizer``. The
    "Impression" and "Confidence" sections are fixed boilerplate; no
    confidence score is computed.

    Args:
        image: Image passed to ``feature_extractor`` as ``images=``
            (presumably the PIL image returned by ``preprocess_image`` -
            confirm against the caller).

    Returns:
        The report as a multi-line string with no leading or trailing
        whitespace and no per-line indentation.
    """
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
    output_ids = model.generate(pixel_values, max_length=64, num_beams=4)
    # Strip the caption so stray whitespace from decoding does not end up
    # inside the report body.
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

    # Build the report from explicit lines. A triple-quoted f-string indented
    # with the function body would leak that indentation into every line
    # after the first once the outer whitespace is stripped.
    report_lines = [
        "===== RADIOLOGY REPORT =====",
        "Findings:",
        text,
        "",
        "Impression:",
        "Based on visual features, automated findings suggest above observations.",
        "",
        "Confidence: N/A (Not computed)",
        "=============================",
    ]
    return "\n".join(report_lines)

In [None]:
# Batch driver: caption every image in ``input_dir``, show it inline, print
# the generated report and append it to ``output_file``.
input_dir = "/content/xray_samples"  # Directory containing images
output_file = "batch_generated_reports.txt"

# Leading dots matter: without them a name such as "notesjpg" would match.
image_extensions = (".jpg", ".jpeg", ".png")

os.makedirs(input_dir, exist_ok=True)
# os.listdir returns entries in arbitrary order; sort so the report file is
# reproducible, and skip directories that happen to carry an image suffix.
image_files = sorted(
    name
    for name in os.listdir(input_dir)
    if name.lower().endswith(image_extensions)
    and os.path.isfile(os.path.join(input_dir, name))
)

if not image_files:
    print(f"No image files {image_extensions} found in {input_dir}")

# Explicit encoding keeps the output independent of the platform default.
with open(output_file, "w", encoding="utf-8") as report_file:
    for image_name in image_files:
        image_path = os.path.join(input_dir, image_name)
        try:
            image = preprocess_image(image_path, to_grayscale=True)
        except OSError as err:
            # A single unreadable or corrupt file should not abort the batch.
            print(f"Skipping {image_name}: {err}")
            continue

        plt.imshow(image)
        plt.title(image_name)
        plt.axis('off')
        plt.show()

        report = generate_report(image)
        print("\n\033[1mGenerated Report for:\033[0m", image_name)
        print(report)

        report_file.write(f"Report for {image_name}:\n{report}\n\n")

## Project Attribution and License

This project uses the pretrained model `nlpconnect/vit-gpt2-image-captioning` from Hugging Face, licensed under the MIT License.
No real patient data is used in this example. The sample images and generated outputs are for educational and research purposes only.
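This notebook is intended as a starting point for developing real-world AI applications in medical imaging; it has not been validated for clinical use. The underlying checkpoint is a general-purpose image-captioning model, so the generated text must not be read as a medical finding.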

**Author**: chiiinmay  
Parts of this project were developed with assistance from OpenAI’s ChatGPT to accelerate prototyping and documentation.

---

### Reuse & Attribution
This project is licensed under the MIT License. If you use this code or parts of it, please provide credit by linking to:  
👉 https://github.com/chiiinmay/xray-report-generator

---

### 📚 Citation
If you use this project or build upon it, please cite it as:

```
@misc{chiiinmay_xrayreport_2025,
  author       = {Chiiinmay},
  title        = {X-Ray Report Generator using Vision-Language Models},
  year         = {2025},
  howpublished = {\url{https://github.com/chiiinmay/xray-report-generator}},
  note         = {Developed using Hugging Face Transformers and assisted by OpenAI's ChatGPT.}
}
```