In [None]:
!pip install transformers sentencepiece transformers[sentencepiece] accelerate datasets peft trl bitsandbytes

In [2]:
from huggingface_hub import login
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Login to Hugging Face
# login(token="YOUR_HF_TOKEN_HERE")  # Uncomment and add your token when running


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from PIL import Image
from torch.cuda.amp import GradScaler, autocast
import os
import pandas as pd
import numpy as np
import cv2
import copy
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer

import pandas as pd

2024-07-20 04:56:46.787619: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 04:56:46.787683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 04:56:46.789180: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [23]:


################################################################################
# QLoRA parameters
################################################################################

# LoRA rank (passed as `r` to LoraConfig below)
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 8

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype; resolved with
# getattr(torch, ...) when the BitsAndBytesConfig is built)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 20

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE: defined here but not passed to TrainingArguments in this notebook.
gradient_checkpointing = False

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 5e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
# NOTE: this rebinds the module-level name `optim`, which the imports cell
# bound to `torch.optim`; after this cell `optim` is a plain string.
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
# NOTE(review): presumably has no effect with the plain "constant" schedule
# selected above ("constant_with_warmup" is the variant with warmup) - confirm.
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

################################################################################
# SFT parameters
################################################################################
# NOTE: the two values below are not referenced by any other visible cell;
# training goes through a custom Trainer subclass rather than SFTTrainer.

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Build the quantization, LoRA and Trainer configuration objects.
# (Nothing is loaded here; the model and tokenizer are created in a later cell.)
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Hint that bf16 is available when running fp16 compute on a capable GPU.
# The capability query raises when no CUDA device is present, so it is only
# attempted when CUDA is actually available.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# LoRA adapters are attached to the four attention projections only.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj"]
)

# Training parameters; evaluation and logging both happen once per epoch and
# no external experiment tracker is used (report_to=[]).
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    eval_strategy="epoch",  # Evaluate at the end of every epoch
    logging_strategy="epoch",     # Log at the end of every epoch
    report_to=[]
)


In [5]:
from torchvision.models.densenet import DenseNet121_Weights

class AttentionModule(nn.Module):
    """Learned, input-independent attention pooling over image patches.

    One score per (disease, patch) pair is stored as a parameter. The scores
    are soft-maxed over the patch axis and used to form, for every disease, a
    weighted sum of the patch feature vectors.
    """

    def __init__(self, in_features, num_diseases, num_patches):
        # `in_features` is accepted but not used by this module.
        super().__init__()
        initial_scores = torch.randn(num_diseases, num_patches, 1)
        self.attention_weights = nn.Parameter(initial_scores)

    def forward(self, x):
        """Pool patch features per disease.

        Args:
            x: tensor indexed as (batch, patch, channel) by the einsum below.

        Returns:
            (pooled, patch_weights): pooled is indexed (batch, disease, channel);
            patch_weights is the soft-maxed parameter permuted to
            (patch, disease, 1).
        """
        # Normalise over patches, then move the patch axis first for the einsum.
        patch_weights = torch.softmax(self.attention_weights, dim=1).permute(1, 0, 2)
        # The trailing size-1 axis of the weights broadcasts over channels.
        pooled = torch.einsum('bpc,pdc->bdc', x, patch_weights)
        return pooled, patch_weights


class DenseNet121WithAttention(nn.Module):
    """DenseNet-121 feature extractor followed by per-disease attention pooling.

    The backbone's own classifier is replaced by an identity; the convolutional
    feature map is flattened into patch vectors, pooled once per disease by
    `AttentionModule`, and the concatenated pooled vectors are fed to a linear
    layer that produces `out_size` logits.
    """

    def __init__(self, out_size, num_diseases):
        super().__init__()
        backbone = torchvision.models.densenet121(weights=DenseNet121_Weights.IMAGENET1K_V1)
        feature_dim = backbone.classifier.in_features
        backbone.classifier = nn.Identity()
        self.densenet121 = backbone
        # Exposed so downstream code can size its projection layer.
        self.vision_feature_dim = feature_dim
        # The backbone is left trainable here; call freeze_densenet121() to freeze it.

        # 49 patches - presumably the 7x7 feature map of a 224x224 input; confirm.
        self.attention_module = AttentionModule(feature_dim, num_diseases, num_patches=49)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(num_diseases * feature_dim, out_size),  # logits
        )
        # Filled by save_gradients() when the hook registered in forward() fires.
        self.gradients = None

    def freeze_densenet121(self):
        """Disable gradient computation for every backbone parameter."""
        for backbone_param in self.densenet121.parameters():
            backbone_param.requires_grad = False

    def save_gradients(self, grad):
        """Tensor hook: remember the gradient that reached the patch features."""
        self.gradients = grad

    def forward(self, x):
        """Return (logits, attention_weights, attended_features) for a batch of images."""
        feature_map = self.densenet121.features(x)
        batch, channels = feature_map.size(0), feature_map.size(1)
        # (batch, channels, H, W) -> (batch, H*W, channels)
        features = feature_map.view(batch, channels, -1).permute(0, 2, 1)
        attended_features, attention_weights = self.attention_module(features)
        out = self.classifier(attended_features.reshape(batch, -1))

        # Only capture gradients while training and when they can actually flow.
        if self.training and features.requires_grad:
            features.register_hook(self.save_gradients)

        return out, attention_weights, attended_features
    
# Check if a GPU is available, otherwise use the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Build the vision model (14 outputs, 14 disease-specific attention heads)
vision_model = DenseNet121WithAttention(out_size=14, num_diseases=14)

# Load the saved model parameters.
# map_location keeps the CPU fallback above usable: without it, a checkpoint
# saved from GPU tensors cannot be deserialised on a CPU-only machine.
vision_model.load_state_dict(
    torch.load('/kaggle/input/x-ray-dataset-new/dense_net_121_d_e50.pth', map_location=device)
)

vision_model.to(device)

# The vision model is only used as a fixed feature extractor from here on
vision_model.eval()

visual_feature_dim = vision_model.vision_feature_dim


cuda:0


In [6]:
# Optional explicit placement of the whole model on GPU 0 (currently disabled):
# device_map = {"": 0}
llm_model_name = "meta-llama/Llama-2-7b-hf"

# Load the base causal LM using the 4-bit bitsandbytes configuration built earlier.
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    quantization_config=bnb_config
#     device_map=device_map  # This will automatically assign layers to GPUs if available
)
# Settings applied on the model config before fine-tuning.
llm_model.config.use_cache = False
llm_model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(llm_model_name, trust_remote_code=True)
# Register an explicit pad token; needed for fine-tuning and batched inference.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})

# Resize the embedding matrix to the tokenizer's new vocabulary size
# (the cell output shows the resulting embedding: 32001 x 4096).
llm_model.resize_token_embeddings(len(tokenizer))

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding(32001, 4096)

In [7]:
# Wrap the quantized base model with the LoRA adapters described by peft_config.
# NOTE: the name `llm_model` is rebound to the wrapped model, so re-running this
# cell would wrap the already-wrapped model - reload the base model first.
llm_model = get_peft_model(llm_model, peft_config)

In [25]:
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

num_diseases = 14


# Custom dataset class
class PreCarDiv_Dataset(torch.utils.data.Dataset):
    """Pairs each X-ray's visual tokens with its tokenized prompt and report.

    Every item is laid out as one sequence:

        [visual tokens] [prompt tokens] [report tokens] [EOS]

    Only the report tokens and the final EOS are supervised; every other
    position carries the ignore index (-100).

    Labels are aligned one-to-one with the inputs (labels[t] is the token at
    position t). Hugging Face causal-LM heads perform the next-token shift
    themselves, so shifting here as well would train the model to predict two
    tokens ahead.

    Args:
        csv_data: sequence of records with the keys 'image_path',
            'diagnostic_prompt' and 'target_report'.
        tokenizer: callable tokenizer exposing `eos_token_id`.
        vision_model: module returning a 3-tuple whose last element holds the
            visual tokens for the image batch.
        transform: callable turning a loaded image into a (C, H, W) tensor.
        image_root: directory that 'image_path' is relative to; defaults to
            DEFAULT_IMAGE_ROOT.
    """

    DEFAULT_IMAGE_ROOT = '/kaggle/input/x-ray-dataset-new/archive/Chexpertplus_Images/preprocessed_images2'
    IGNORE_INDEX = -100

    def __init__(self, csv_data, tokenizer, vision_model, transform, image_root=None):
        self.csv_data = csv_data
        self.tokenizer = tokenizer
        self.vision_model = vision_model
        self.transform = transform
        self.image_root = self.DEFAULT_IMAGE_ROOT if image_root is None else image_root

    def __len__(self):
        return len(self.csv_data)

    def __getitem__(self, idx):
        """Return visual_features, input_ids, attention_mask and labels for one sample.

        `attention_mask` and `labels` span the visual tokens plus the text
        tokens, whereas `input_ids` holds the text tokens only.
        """
        record = self.csv_data[idx]
        image_path = os.path.join(self.image_root, record['image_path'])

        image = self.transform(self.load_image(image_path)).unsqueeze(0)

        # Run the extractor on whatever device it lives on, without building an
        # autograd graph: the features are plain inputs to the language model.
        vision_device = next(self.vision_model.parameters()).device
        with torch.no_grad():
            _, _, visual_features = self.vision_model(image.to(vision_device))

        # (1, num_visual_tokens, feature_dim) -> (num_visual_tokens, feature_dim)
        visual_features = visual_features.reshape(-1, visual_features.size(-1)).cpu()

        # The prompt keeps the tokenizer's default special tokens.
        prompt_ids = self.tokenizer(
            record['diagnostic_prompt'], return_tensors='pt'
        ).input_ids.squeeze(0).long()
        # The report continues the same sequence, so it must not start with a
        # second BOS token in the middle of the input.
        report_ids = self.tokenizer(
            record['target_report'], return_tensors='pt', add_special_tokens=False
        ).input_ids.squeeze(0).long()
        eos = torch.tensor([self.tokenizer.eos_token_id], dtype=torch.long)

        combined_input_id = torch.cat((prompt_ids, report_ids, eos), dim=0)

        visual_tokens_len = visual_features.shape[0]
        prompt_len = prompt_ids.shape[0]
        total_len = visual_tokens_len + combined_input_id.shape[0]

        # Mask everything, then supervise report + EOS at their own positions.
        labels = torch.full((total_len,), self.IGNORE_INDEX, dtype=torch.long)
        target_start_idx = visual_tokens_len + prompt_len
        labels[target_start_idx:] = combined_input_id[prompt_len:]

        return {
            'visual_features': visual_features,
            'input_ids': combined_input_id,
            'attention_mask': torch.ones(total_len, dtype=torch.long),
            'labels': labels
        }

    def load_image(self, image_path):
        """Open the image at `image_path` and convert it to RGB."""
        image = Image.open(image_path).convert('RGB')
        return image


def pad_sequence(sequences, padding_value):
    """Pad variable-length tensors to a common length.

    Thin wrapper around `torch.nn.utils.rnn.pad_sequence` with its default
    layout, so the sequence axis comes first and the batch axis second.
    This definition replaces the `pad_sequence` name imported at the top of
    the cell.
    """
    padded = torch.nn.utils.rnn.pad_sequence(sequences, padding_value=padding_value)
    return padded

def custom_collate_fn(batch):
    """Collate dataset items into one padded batch.

    Visual features are stacked as-is. The three variable-length fields are
    padded to the longest item in the batch and returned batch-first:
    `input_ids` with the tokenizer's pad id, `attention_mask` with 0 and
    `labels` with -100.
    """
    def _pad_field(key, fill_value):
        # pad_sequence puts the sequence axis first; swap to (batch, length).
        per_sample = [sample[key] for sample in batch]
        return pad_sequence(per_sample, padding_value=fill_value).transpose(0, 1)

    return {
        'visual_features': torch.stack([sample['visual_features'] for sample in batch]),
        'input_ids': _pad_field('input_ids', tokenizer.pad_token_id),
        'attention_mask': _pad_field('attention_mask', 0),
        'labels': _pad_field('labels', -100),
    }
# Read the first 100 rows of the report CSV and turn them into a list of records
csv_file_path = '/kaggle/input/x-ray-dataset-new/final_dataset_latest.csv'
df = pd.read_csv(csv_file_path, nrows=100)
csv_data = df.to_dict(orient='records')

# Image preprocessing: tensor conversion followed by per-channel normalisation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 60/20/20 train/validation/test split: hold out 20% for test first, then take
# 25% of the remaining 80% (= 20% overall) for validation.
train_val_data, test_data = train_test_split(csv_data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

# One dataset object per split, all sharing the tokenizer, vision model and transform
train_dataset, val_dataset, test_dataset = (
    PreCarDiv_Dataset(split_rows, tokenizer, vision_model, transform)
    for split_rows in (train_data, val_data, test_data)
)

In [9]:
class Model_PreCarDiv(nn.Module):
    """Language model conditioned on projected visual tokens.

    Visual feature vectors are linearly projected to the LLM's hidden size and
    prepended to the text token embeddings; the LLM is then driven through
    `inputs_embeds`.

    Args:
        visual_feature_dim: size of each incoming visual feature vector.
        llm_model: causal LM exposing `config.hidden_size`, `dtype`,
            `get_input_embeddings()` and `generate()`.
    """

    def __init__(self, visual_feature_dim, llm_model):
        super().__init__()
        self.llm_model = llm_model
        # The projection is kept in the LLM's dtype so the two embedding
        # streams can be concatenated directly.
        self.visual_projection = nn.Linear(visual_feature_dim, llm_model.config.hidden_size).to(llm_model.dtype)

    def forward(self, visual_features, input_ids, attention_mask, labels=None):
        """Run the LLM on [visual tokens ; text tokens] and return its outputs.

        `attention_mask` and `labels` must cover the visual tokens as well as
        the text tokens.
        """
        _, combined_embeddings = self.prepare_features(visual_features, input_ids)
        outputs = self.llm_model(inputs_embeds=combined_embeddings, attention_mask=attention_mask, labels=labels)
        return outputs

    def prepare_features(self, visual_features, input_ids):
        """Build the combined input embeddings.

        Args:
            visual_features: (batch, num_visual_tokens, visual_feature_dim).
            input_ids: (batch, seq_len) text token ids.

        Returns:
            (visual_features, combined_embeddings): the features cast to the
            LLM dtype, and embeddings of shape
            (batch, num_visual_tokens + seq_len, hidden_size).
        """
        # Convert visual_features to the same dtype as model parameters
        visual_features = visual_features.to(self.llm_model.dtype)

        # Unpacking enforces the expected 3-D layout.
        batch_size, num_visual_tokens, _ = visual_features.shape

        # Project every visual token to the text embedding dimension
        # (nn.Linear acts on the last axis, so no reshaping is needed).
        visual_embeddings = self.visual_projection(visual_features)

        # Text embeddings from the LLM's own embedding layer
        text_embeddings = self.llm_model.get_input_embeddings()(input_ids)

        # Visual tokens come first, followed by the text tokens
        combined_embeddings = torch.cat((visual_embeddings, text_embeddings), dim=1)

        return visual_features, combined_embeddings

    def generate(self, visual_features, input_ids, attention_mask, max_length=512, num_beams=5,
                 early_stopping=True, no_repeat_ngram_size=2, eos_token_id=None):
        """Generate token ids conditioned on the visual tokens and the text prompt.

        All decoding arguments are forwarded to the LLM's `generate`.
        `eos_token_id` defaults to the LLM config's value, so this method does
        not depend on a notebook-global tokenizer. The LLM is switched to eval
        mode as a side effect.
        """
        if eos_token_id is None:
            eos_token_id = getattr(self.llm_model.config, "eos_token_id", None)

        self.llm_model.eval()
        with torch.no_grad():
            _, combined_embeddings = self.prepare_features(visual_features, input_ids)
            generated_ids = self.llm_model.generate(
                inputs_embeds=combined_embeddings,
                attention_mask=attention_mask,
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=early_stopping,
                # Honour the caller's value (this used to be hard-coded to 2).
                no_repeat_ngram_size=no_repeat_ngram_size,
                eos_token_id=eos_token_id,
            )
        return generated_ids

    
# Combine the visual projection with the LoRA-wrapped LLM and move the wrapper to `device`.
multi_model = Model_PreCarDiv(visual_feature_dim,llm_model).to(device)

In [10]:
# Count the parameters that will actually be updated during training
def count_parameters(model):
    """Return the total number of elements across parameters with requires_grad=True."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report how many parameters of the combined model have requires_grad=True.
num_trainable_params = count_parameters(multi_model)
print(f"Number of trainable parameters: {num_trainable_params}")

Number of trainable parameters: 12587008


In [None]:
from transformers import Trainer

# Custom trainer
class CustomTrainer(Trainer):
    """Trainer whose loss comes from the multimodal model's own forward pass."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on a collated batch and return its language-modelling loss.

        Args:
            model: the multimodal model being trained.
            inputs: batch dict with 'visual_features', 'input_ids',
                'attention_mask' and 'labels'. The Trainer has already moved
                these tensors to the training device before calling this hook.
            return_outputs: when True, return (loss, outputs) instead of loss.
            num_items_in_batch: passed by recent transformers releases; accepted
                so the override stays compatible with them, otherwise unused.
        """
        outputs = model(
            visual_features=inputs['visual_features'],
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            labels=inputs['labels'],
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

# Wire the multimodal model, both datasets and the padding collate function
# into the custom trainer
trainer = CustomTrainer(
    model=multi_model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=custom_collate_fn,
)

# Launch fine-tuning
trainer.train()

In [12]:
def generate_predictions(data_loader, model, tokenizer, device, max_length=512):
    """Generate a report for every sample and collect the reference texts.

    Batches contain prompt + reference report + EOS, because they are built for
    teacher-forced training. Feeding that to `generate` would let the model
    read the answer, so each sample is cut back to its prompt first. The cut
    point is the first supervised label position (labels != -100), which marks
    where the report starts inside the combined visual + text sequence.
    Samples are generated one at a time so that right-padding of shorter
    samples never sits between the prompt and the generated tokens.

    Args:
        data_loader: iterable of batches with 'visual_features', 'input_ids',
            'attention_mask' and 'labels'.
        model: object with `eval()` and
            `generate(visual_features=, input_ids=, attention_mask=, ...)`.
        tokenizer: object with `decode(ids, skip_special_tokens=...)`.
        device: device the batch tensors are moved to.
        max_length: forwarded to `model.generate`.

    Returns:
        (predictions, references): a list of generated strings and a parallel
        list holding one single-element list per sample with its reference.
    """
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch in data_loader:
            visual_features = batch['visual_features'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # attention_mask / labels cover visual + text tokens; input_ids text only.
            num_visual_tokens = visual_features.shape[1]

            for i in range(input_ids.shape[0]):
                target_mask = labels[i] != -100
                target_positions = target_mask.nonzero(as_tuple=True)[0]
                if target_positions.numel() > 0:
                    prompt_end = int(target_positions[0])
                else:
                    # No supervised tokens: fall back to the unpadded length.
                    prompt_end = int(attention_mask[i].sum())
                text_prompt_len = max(prompt_end - num_visual_tokens, 0)
                prompt_end = num_visual_tokens + text_prompt_len

                # Generate from the visual tokens and the prompt only
                generated_ids = model.generate(
                    visual_features=visual_features[i:i + 1],
                    input_ids=input_ids[i:i + 1, :text_prompt_len],
                    attention_mask=attention_mask[i:i + 1, :prompt_end],
                    max_length=max_length,
                    num_beams=5,            # Use beam search with specified number of beams
                    early_stopping=True,    # Stop generating as soon as all beams are finished
                    no_repeat_ngram_size=2,  # Prevent repeating n-grams
                )

                # Decode the generated ids and the supervised label tokens to text
                predicted_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
                reference_text = tokenizer.decode(labels[i][target_mask], skip_special_tokens=True)
                print('Reference Report', reference_text)
                print('Predicted Report', predicted_text)
                predictions.append(predicted_text)
                references.append([reference_text])  # one single-reference list per prediction

    return predictions, references


In [None]:
from sacrebleu.metrics import BLEU
from torch.utils.data import DataLoader

bleu_metric = BLEU()

# Batch the validation split with the same padding collate function used for training
data_loader = DataLoader(
    val_dataset,
    batch_size=2,
    collate_fn=custom_collate_fn,
)

# Generate a report for every validation sample and keep the reference texts
predictions, references = generate_predictions(
    data_loader=data_loader,
    model=multi_model,
    tokenizer=tokenizer,
    device=device,
)



In [None]:
# Calculate the BLEU score.
# `references` holds one [reference] list per prediction, but sacreBLEU's
# corpus_score expects reference *streams*: a list in which each entry contains
# one reference for every prediction. Transpose before scoring.
reference_streams = [list(stream) for stream in zip(*references)]
bleu_score = bleu_metric.corpus_score(predictions, reference_streams)
print(f"BLEU score: {bleu_score.score}")
