In [None]:
# Install the dependencies of the required versions

!pip install accelerate --q
!pip install -U "transformers>=4.39.0" --q
!pip install peft bitsandbytes --q
!pip install -U "trl>=0.8.3" --q

In [None]:
# Import the model in the required quantization configuration from Hugging Face

import torch
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig


# Hugging Face Hub checkpoint that is loaded here and fine-tuned below.
model_id = "llava-hf/llava-1.5-7b-hf"

# bitsandbytes settings: load the weights in 4-bit precision.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
)

# Instantiate LLaVA from `model_id` with the quantization config above and
# float16 as the requested torch dtype.
model = LlavaForConditionalGeneration.from_pretrained(model_id,
                                                      quantization_config=quantization_config,
                                                      torch_dtype=torch.float16)

In [None]:
# Define a chat template to fine-tune the Llava model
#
# Jinja template applied by the tokenizer: a fixed system preamble, then each
# message rendered as "USER: ..." or "ASSISTANT: ...". Text items are emitted
# verbatim, image items become the literal "<image>" placeholder, and every
# assistant turn is terminated with the EOS token.
# NOTE(review): each trailing backslash joins the following source line into
# the string, so that line's leading indentation is part of the rendered prompt.

LLAVA_CHAT_TEMPLATE = """A chat between a curious user and an artificial intelligence assistant. \
                        The assistant gives helpful, detailed, and polite answers to the user's questions. \
                        {% for message in messages %}{% if message['role'] == 'user' %}\
                        USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}\
                        {% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}"""

In [None]:
# Load the tokenizer for `model_id` and replace its chat template with
# LLAVA_CHAT_TEMPLATE.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
# Load the matching processor and make it use the tokenizer configured above,
# so both objects share the same chat template.
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = tokenizer

In [None]:
# Define the data collator function to collate the images and text inputs together in the required format

class LLavaDataCollator:
    """Collate chat-formatted examples with images into one training batch.

    Each example is a dict with a ``"messages"`` list (rendered to text through
    the tokenizer's chat template) and an ``"images"`` list, of which only the
    first image is used.
    """

    def __init__(self, processor):
        # The processor supplies both the tokenizer (for templating) and the
        # callable that builds the padded tensor batch.
        self.processor = processor

    def __call__(self, examples):
        """Return the processor's batch for ``examples`` with a ``labels`` entry added.

        ``labels`` is a copy of ``input_ids`` in which padding positions are set
        to -100.
        """
        texts = []
        images = []
        for example in examples:
            text = self.processor.tokenizer.apply_chat_template(
                example["messages"], tokenize=False, add_generation_prompt=False
            )
            # Collect inside the loop so that every example contributes to the
            # batch; appending after the loop kept only the last example.
            texts.append(text)
            images.append(example["images"][0])

        batch = self.processor(texts, images, return_tensors="pt", padding=True)
        # NOTE(review): this runs on every call, after the batch has already
        # been tokenized, and nothing here resizes the model embeddings —
        # confirm whether it is still needed.
        self.processor.tokenizer.add_tokens(["<image>", "<pad>"], special_tokens=True)

        labels = batch["input_ids"].clone()
        pad_token_id = self.processor.tokenizer.pad_token_id
        if pad_token_id is not None:
            # Mask padding positions in the labels.
            labels[labels == pad_token_id] = -100
        batch["labels"] = labels

        return batch

# Collator instance passed to the trainer; it wraps the processor configured above.
data_collator = LLavaDataCollator(processor)

In [None]:
import torch.nn as nn

def get_all_supported_modules(model):
    """Return the qualified names of all Linear/Embedding/Conv1d/Conv2d submodules.

    Names are reported in the order produced by ``model.named_modules()``; the
    root module itself is included (as the empty string) when it is one of
    these layer types.
    """
    wanted_layer_types = (nn.Linear, nn.Embedding, nn.Conv2d, nn.Conv1d)
    return [
        qualified_name
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, wanted_layer_types)
    ]

# Names of every Linear/Embedding/Conv layer in the loaded model; passed to
# LoraConfig(target_modules=...) further down. The bare expression displays the list.
target_modules=get_all_supported_modules(model)
target_modules

In [7]:
# Define Training Arguments
# NOTE(review): output_dir is an absolute, machine-specific path — adjust it
# before running on another machine.

training_args = TrainingArguments(
    output_dir="/mnt/DATA/dhanvin/logs/",
    #report_to="tensorboard",
    learning_rate=1.4e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=5,
    num_train_epochs=1,
    #push_to_hub=True,
    gradient_checkpointing=True,
    # The custom collator reads the 'messages' and 'images' fields of each
    # example, so dataset columns must not be dropped.
    remove_unused_columns=False,
    fp16=True,
    bf16=False
)

In [8]:
# Define the configuration for the Low Rank Adaptation (LoRA)
# The adapters are attached to every module name collected in `target_modules`.

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=target_modules
)

In [None]:
# Define the parameters of the Supervised Fine-tuning Trainer
# No datasets are passed here: train/eval datasets are attached to the trainer
# in a later cell (trainer.train_dataset / trainer.eval_dataset), and dataset
# preparation is skipped because the custom data collator builds the batches.

trainer = SFTTrainer(
    model=model,
    args=training_args,
    #train_dataset=train_dataset_1,
    #eval_dataset=eval_dataset_1,
    peft_config=lora_config,
    dataset_text_field="text",  # need a dummy field
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_kwargs={"skip_prepare_dataset": True},
)

In [4]:
import pandas as pd
import os
from PIL import Image
# Load the filtered train/test tables from the extracted CheXpert chunk; the
# cells below read their 'report' and 'path_to_image' columns.
df_train=pd.read_csv("chexpert_chunk_4_extracted/filtered_train.csv",low_memory=False)
df_test=pd.read_csv("chexpert_chunk_4_extracted/filtered_test.csv",low_memory=False)

In [None]:
# Display the training table.
df_train

In [44]:
from PIL import Image
# Resolve the image of the first training row: keep the part of the stored path
# before its first '.', append '.png', and look it up under the extracted chunk folder.
path_to_image = df_train['path_to_image'][0].split('.')[0]+'.png'
image_path = os.path.join('chexpert_chunk_4_extracted', path_to_image)

# Hand-written sample in the structure the data collator consumes: alternating
# user/assistant messages with the image placeholder in the first user turn,
# plus one RGB image resized to (480, 640).
# NOTE(review): the first question starts with a stray '"' and the last answer
# ends with one — presumably left over from a quoted response; confirm whether
# they are intended.
samp_dict={'messages': [{'content': [{'index': None,
     'text': '"What does the interval extubation and the new findings suggest?',
     'type': 'text'},
    {'index': 0, 'text': None, 'type': 'image'}],
   'role': 'user'},
  {'content': [{'index': None,
     'text': 'The interval extubation and subsequent development of mild-to-moderate pulmonary edema and increasing small bilateral pleural effusions suggest that the removal of the breathing tube might have led to fluid accumulation in the lungs and the space around them.',
     'type': 'text'}],
   'role': 'assistant'},
  {'content': [{'index': None,
     'text': 'What conditions might be indicated by bibasilar opacification?',
     'type': 'text'}],
   'role': 'user'},
  {'content': [{'index': None,
     'text': 'Bibasilar opacification is consistent with either atelectasis, where parts of the lungs collapse or do not inflate properly, or aspiration, where foreign material such as food or liquid enters the lungs.',
     'type': 'text'}],
   'role': 'assistant'},
  {'content': [{'index': None,
     'text': 'What is indicated by the prominent perihilar opacification and reticular prominence?',
     'type': 'text'}],
   'role': 'user'},
  {'content': [{'index': None,
     'text': 'The prominent perihilar opacification and reticular prominence are suggestive of superimposed mild pulmonary edema, indicating fluid accumulation in and around the lung tissue.',
     'type': 'text'}],
   'role': 'assistant'},
  {'content': [{'index': None,
     'text': "Was there any significant change in the patient's condition between the two radiographs?",
     'type': 'text'}],
   'role': 'user'},
  {'content': [{'index': None,
     'text': 'No, there was no significant interval change between the two radiographs besides a mild interval increase in pulmonary edema.',
     'type': 'text'}],
   'role': 'assistant'},
  {'content': [{'index': None,
     'text': 'What remained unchanged in the radiographic findings?',
     'type': 'text'}],
   'role': 'user'},
  {'content': [{'index': None,
     'text': 'The cardiomediastinal silhouette and the presence of a right paratracheal mass remained unchanged in the radiographic findings."',
     'type': 'text'}],
   'role': 'assistant'}],
 'images': [Image.open(image_path).convert("RGB").resize((480, 640))]}

In [None]:
# Manual dry run of the collator's steps on the single sample above: render the
# messages through the chat template, then let the processor build a padded
# tensor batch from the text and the image.
texts=[]
images=[]
messages = samp_dict["messages"]
text = processor.tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=False
)
texts.append(text)
images.append(samp_dict["images"][0])

batch = processor(texts, images, return_tensors="pt", padding=True)


In [None]:
# `!export ...` only changes a throw-away subshell, so the value never reached
# the kernel. Setting os.environ makes it visible to Python code in this
# notebook. Replace the placeholder locally; never commit the real value.
import os

os.environ["MY_API_KEY"] = "xxxxxx"

In [53]:
import requests
def generate_vqa_train(index, url=None, max_retries=5, timeout=None):
    """Build one chat-style training conversation from a report in ``df_train``.

    The report at positional row ``index`` is embedded in a prompt and POSTed
    to a text-generation endpoint. The reply is split into question/answer
    pairs, which become alternating user/assistant messages; the image
    placeholder is attached to the first user turn only.

    Args:
        index: Positional row of ``df_train`` used for both report and image.
        url: Endpoint to POST the prompt to. When omitted, a notebook-level
            ``MY_API_KEY`` variable is used if one exists, otherwise the
            ``MY_API_KEY`` environment variable.
        max_retries: Maximum number of POST attempts while the reply is empty.
        timeout: Optional ``requests`` timeout in seconds; ``None`` waits
            indefinitely.

    Returns:
        dict with ``'messages'`` (empty when no pair could be parsed) and
        ``'images'`` (one RGB image resized to (480, 640)).

    Raises:
        RuntimeError: If no endpoint is configured, or every attempt returned
            an empty body.
    """
    report = df_train['report'].iloc[index]
    
    prompt = f"""
    Based on the following medical report, generate upto 5 questions and their corresponding answers. 
    The questions and answers should be reflective of what caused the doctor to give this diagnosis. 
    Omit irrelevant information like age and other demographics.
    Also omit the date and time when the x-rays were taken in your responses.
    Focus only on the impressions. 
    The questions and answers should be seperated just by a single newline character:
    
    Medical Report:
    {report}
    
    Questions and Answers:
    """

    if url is None:
        # MY_API_KEY was referenced as a bare Python name that no cell defines;
        # honour it if it exists, otherwise read the environment variable.
        url = globals().get('MY_API_KEY') or os.environ.get('MY_API_KEY')
    if not url:
        raise RuntimeError(
            "No endpoint configured: pass url=... or set MY_API_KEY."
        )
    payload = {'text': prompt}

    # Retry while the endpoint answers with an empty body, but give up after
    # max_retries attempts instead of looping forever.
    response_text = ''
    for _ in range(max_retries):
        response = requests.post(url, json=payload, timeout=timeout)
        response_text = str(response.text)
        if response_text != '':
            break
    else:
        raise RuntimeError(
            f"Endpoint returned an empty response {max_retries} times for row {index}."
        )

    # The raw body is split on the literal two-character sequence backslash + n
    # (presumably the endpoint returns a JSON-escaped string — TODO confirm):
    # a doubled sequence separates pairs, a single one separates Q from A.
    qa_pairs = []
    for chunk in response_text.strip().split('\\n\\n'):
        parts = chunk.split("\\n")
        if len(parts) == 2:
            qa_pairs.append((parts[0].strip(), parts[1].strip()))

    conversation = {'messages': [], 'images': []}
    for pair_number, (question, answer) in enumerate(qa_pairs):
        user_content = [{'index': None, 'text': question, 'type': 'text'}]
        if pair_number == 0:
            # Only the first user turn carries the image placeholder.
            user_content.append({'index': 0, 'text': None, 'type': 'image'})
        conversation['messages'].append({'content': user_content, 'role': 'user'})
        conversation['messages'].append({
            'content': [{'index': None, 'text': answer, 'type': 'text'}],
            'role': 'assistant'
        })

    # Positional lookup, consistent with how the report row is selected above.
    path_to_image = df_train['path_to_image'].iloc[index].split('.')[0]+'.png'
    image_path = os.path.join('chexpert_chunk_4_extracted', path_to_image)
    with Image.open(image_path) as raw_image:
        conversation['images'] = [raw_image.convert("RGB").resize((480, 640))]

    return conversation

In [None]:
# Smoke test: build (and display) the conversation for the first training row.
generate_vqa_train(0)

In [60]:
def generate_vqa_test(index, url='http://172.21.23.19:6666/process_text', max_retries=5, timeout=None):
    """Build one chat-style evaluation conversation from a report in ``df_test``.

    The report at positional row ``index`` is embedded in a prompt and POSTed
    to a text-generation endpoint. The reply is split into question/answer
    pairs, which become alternating user/assistant messages; the image
    placeholder is attached to the first user turn only.

    Args:
        index: Positional row of ``df_test`` used for both report and image.
        url: Endpoint to POST the prompt to (defaults to the address that was
            previously hard-coded in the body).
        max_retries: Maximum number of POST attempts while the reply is empty.
        timeout: Optional ``requests`` timeout in seconds; ``None`` waits
            indefinitely.

    Returns:
        dict with ``'messages'`` (empty when no pair could be parsed) and
        ``'images'`` (one RGB image resized to (480, 640)).

    Raises:
        RuntimeError: If every attempt returned an empty body.
    """
    report = df_test['report'].iloc[index]
    
    prompt = f"""
    Based on the following medical report, generate upto 5 questions and their corresponding answers. 
    The questions and answers should be reflective of what caused the doctor to give this diagnosis. 
    Omit irrelevant information like age and other demographics.
    Also omit the date and time when the x-rays were taken in your responses.
    Focus only on the impressions. 
    The questions and answers should be seperated just by a single newline character:
    
    Medical Report:
    {report}
    
    Questions and Answers:
    """

    payload = {'text': prompt}

    # Retry while the endpoint answers with an empty body, but give up after
    # max_retries attempts instead of looping forever.
    response_text = ''
    for _ in range(max_retries):
        response = requests.post(url, json=payload, timeout=timeout)
        response_text = str(response.text)
        if response_text != '':
            break
    else:
        raise RuntimeError(
            f"Endpoint returned an empty response {max_retries} times for row {index}."
        )

    # The raw body is split on the literal two-character sequence backslash + n
    # (presumably the endpoint returns a JSON-escaped string — TODO confirm):
    # a doubled sequence separates pairs, a single one separates Q from A.
    qa_pairs = []
    for chunk in response_text.strip().split('\\n\\n'):
        parts = chunk.split("\\n")
        if len(parts) == 2:
            qa_pairs.append((parts[0].strip(), parts[1].strip()))

    conversation = {'messages': [], 'images': []}
    for pair_number, (question, answer) in enumerate(qa_pairs):
        user_content = [{'index': None, 'text': question, 'type': 'text'}]
        if pair_number == 0:
            # Only the first user turn carries the image placeholder.
            user_content.append({'index': 0, 'text': None, 'type': 'image'})
        conversation['messages'].append({'content': user_content, 'role': 'user'})
        conversation['messages'].append({
            'content': [{'index': None, 'text': answer, 'type': 'text'}],
            'role': 'assistant'
        })

    # The image must come from the same table as the report: this previously
    # read df_train, pairing test reports with training images.
    path_to_image = df_test['path_to_image'].iloc[index].split('.')[0]+'.png'
    image_path = os.path.join('chexpert_chunk_4_extracted', path_to_image)
    with Image.open(image_path) as raw_image:
        conversation['images'] = [raw_image.convert("RGB").resize((480, 640))]

    return conversation

In [None]:
# Smoke test for the function defined in the previous cell (this cell used to
# call generate_vqa_train again, so generate_vqa_test was never exercised).
generate_vqa_test(0)

In [55]:
# Number of training conversations to generate (one endpoint request each).
batch_size=32
# Use the declared constant instead of repeating the literal.
train_list=[generate_vqa_train(j) for j in range(batch_size)]

In [None]:
# Sanity check: print the bare index of every test conversation whose
# 'messages' list is empty, and "okay<i>" for all others.
# NOTE(review): `test_list` is only assigned in a later cell of this notebook,
# so on a top-to-bottom run this cell executes before the name exists.
for i in range(len(test_list)):
    if test_list[i]['messages']==[]:
        print(i)
    else:
        print(f"okay{i}")

In [63]:
# Number of evaluation conversations to generate (one endpoint request each).
batch_size=32
# Use the declared constant instead of repeating the literal.
test_list=[generate_vqa_test(j) for j in range(batch_size)]

In [None]:
# Display the generated training conversations for inspection.
train_list

In [65]:
# Attach the generated conversation lists to the already-constructed trainer,
# which was created without any dataset.
trainer.train_dataset=train_list
trainer.eval_dataset=test_list

In [None]:
# Run fine-tuning with the arguments, LoRA config and datasets set up above.
trainer.train()

In [None]:
# Save the trainer's model, the tokenizer and the processor to one directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_output_dir='/mnt/DATA/dhanvin/llava_chexpert_model'
trainer.model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)
processor.save_pretrained(model_output_dir)


In [None]:
# Install sentencepiece before the reload cells below
# (presumably required by the tokenizer — TODO confirm).
pip install sentencepiece

In [None]:
from transformers import LlavaForConditionalGeneration,AutoProcessor,AutoTokenizer,AutoModelForVisualQuestionAnswering

# Reload model, tokenizer and processor from the directory that the save cell
# above writes to (the same path is repeated here as a literal).
load_directory = "/mnt/DATA/dhanvin/llava_chexpert_model"
model_ft = LlavaForConditionalGeneration.from_pretrained(load_directory)
tokenizer_ft=AutoTokenizer.from_pretrained(load_directory)
processor_ft=AutoProcessor.from_pretrained(load_directory)


In [None]:
# Display the reloaded processor's configuration.
processor_ft

In [None]:
from transformers import pipeline

# Build a Hugging Face pipeline for the "vqa" task from the saved directory.
# NOTE(review): confirm that the "vqa" task supports LLaVA checkpoints; the
# LLaVA model card demonstrates the "image-to-text" task instead.
pipe = pipeline("vqa",'/mnt/DATA/dhanvin/llava_chexpert_model')


In [None]:
# Preview the image at `image_path` (set by an earlier cell) as RGB, resized to (480, 640).
Image.open(image_path).convert("RGB").resize((480, 640))

In [None]:
import os
from PIL import Image
# Ask the pipeline to describe the test image stored under index label 3400.
# NOTE(review): the prompt contains no "<image>" placeholder — confirm whether
# this pipeline inserts one itself.
image_path=os.path.join('chexpert_chunk_4_extracted',df_test['path_to_image'][3400].split('.')[0]+".png")
pipe(question= "USER: \n" + "Describe this Image" + "\nASSISTANT:"
,image = Image.open(image_path).convert("RGB").resize((480, 640)))

In [50]:
# Define a custom pipeline function
def vision_language_pipeline(text, image_path, max_new_tokens=None):
    """Generate text for one prompt/image pair with the notebook-level model.

    Uses the notebook-level ``processor``, ``model`` and ``tokenizer``.

    Args:
        text: Prompt passed to the processor as-is; the caller is responsible
            for including the image placeholder the model expects.
        image_path: Path of the image to load; it is converted to RGB and
            resized to (480, 640).
        max_new_tokens: Optional cap on generated tokens; when ``None`` the
            model's own generation defaults apply.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Load and preprocess the image
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB").resize((480, 640))

    # Process the inputs, then put them on the model's device (casting the
    # floating-point tensors to the model's dtype) so generate() does not mix
    # devices or precisions.
    inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
    inputs = inputs.to(model.device, model.dtype)

    generate_kwargs = {}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    # Generate outputs; forwarding every processor output also passes the
    # attention mask, not just input_ids and pixel_values.
    with torch.no_grad():
        outputs = model.generate(**inputs, **generate_kwargs)

    # Decode the generated tokens
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Create a pipeline for your custom task
def custom_pipeline(text, image_path):
    """Forward both arguments to vision_language_pipeline and return its result."""
    return vision_language_pipeline(text, image_path)


In [None]:
# Run the custom pipeline on the sample prompt and image, then print the text.
# NOTE(review): `sample_text` is only assigned in a later cell of this notebook,
# so on a top-to-bottom run this cell executes before the name exists.
result = custom_pipeline(text=sample_text, image_path=image_path)
print(result)

In [None]:
from PIL import Image
import os
import torch

# Load a sample image and text
image_path = os.path.join('chexpert_chunk_4_extracted',df_test['path_to_image'][3400].split('.')[0]+".png")

sample_image = Image.open(image_path).convert("RGB").resize((480, 640))

# LLaVA-1.5 prompt format: the "<image>" placeholder marks where the image
# features are inserted; without it the model cannot match image to prompt.
sample_text = "USER: <image>\n" + "Describe this Image" + "\nASSISTANT:"

# Preprocess the inputs and place them on the same device as the model
inputs = processor_ft(text=sample_text, images=sample_image, return_tensors="pt", padding=True)
inputs = inputs.to(model_ft.device)

# Generate predictions; an explicit max_new_tokens keeps the answer from being
# cut off by the short default generation length.
with torch.no_grad():
    outputs = model_ft.generate(**inputs, max_new_tokens=200)

# Decode with the tokenizer reloaded alongside the fine-tuned model, so this
# cell does not depend on the training-time `tokenizer` variable.
predicted_text = tokenizer_ft.decode(outputs[0], skip_special_tokens=True)
print(predicted_text)
