In [1]:
!pip install torch --upgrade
!pip install transformers --upgrade
!pip install bitsandbytes --upgrade
!pip install accelerate --upgrade

# install dependancied for google colab 
!pip install peft datasets torchvision --upgrade
!pip install git+https://github.com/huggingface/peft.git
  

Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)

In [2]:
!pip install jinja2



In [3]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel, AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from PIL import Image
import os
import json

# Set device to MPS if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

def load_clip():
    # -- processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    # Load CLIP processor and model, saving locally if not already present
    if os.path.exists("data/processor_clip_embeddings_vit_base_patch32.pt"):
        processor = CLIPProcessor.from_pretrained("data/processor_clip_embeddings_vit_base_patch32.pt")
    else:
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        # save processor for later use
        processor.save_pretrained("data/processor_clip_embeddings_vit_base_patch32.pt")

    model_clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    model_clip = model_clip.to(device).to(torch.float32)
    return processor, model_clip

def generate_clip_embeddings(processor, model_clip, image_dir, embeddings_path):
    # Generate CLIP embeddings for images and save them
    image_files = [f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))]
    image_files = sorted(image_files)
    clip_embeddings = []
    image_ID = {}

    for i, image_file in enumerate(image_files):  # Limit to 100 images for this example
        image_path = os.path.join(image_dir, image_file)
        image = Image.open(image_path)
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            image_features = model_clip.get_image_features(**inputs)
        clip_embeddings.append(image_features.cpu())
        image_ID[image_file.split(".")[0]] = i

    torch.save(torch.cat(clip_embeddings, dim=0), embeddings_path)
    return image_ID

class ProjectionLayer(nn.Module):
    def __init__(self, clip_embedding_dim, phi_hidden_dim):
        super().__init__()
        self.linear = nn.Linear(clip_embedding_dim, phi_hidden_dim)

    def forward(self, image_embeddings):
        return self.linear(image_embeddings)

class MultimodalPhiWithAdapter(nn.Module):
    def __init__(self, language_model, projection_layer, freeze_language_model=True, freeze_projection_layer=False):
        super().__init__()
        self.language_model = language_model
        self.projection_layer = projection_layer
        self.config = language_model.config
        self.set_trainable_params(freeze_language_model, freeze_projection_layer)
        
        # Convert all parameters to float16
        self.to(torch.float16)

    def set_trainable_params(self, freeze_language_model, freeze_projection_layer):
        for param in self.language_model.parameters():
            param.requires_grad = not freeze_language_model
        for param in self.projection_layer.parameters():
            param.requires_grad = not freeze_projection_layer

    def forward(self, input_ids=None, attention_mask=None, image_embeddings=None, labels=None, inputs_embeds=None, **kwargs):
        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
        
        # Ensure all inputs are in float16
        if inputs_embeds is not None:
            inputs_embeds = inputs_embeds.to(torch.float16)
        if image_embeddings is not None:
            image_embeddings = image_embeddings.to(torch.float16)
        
        if inputs_embeds is None:
            if image_embeddings is not None:
                projected_embeddings = self.projection_layer(image_embeddings)
                input_embeds = self.language_model.get_input_embeddings()(input_ids)
                combined_embeds = torch.cat([projected_embeddings.unsqueeze(1), input_embeds], dim=1)
            else:
                combined_embeds = self.language_model.get_input_embeddings()(input_ids)
        else:
            combined_embeds = inputs_embeds

        if attention_mask is not None:
            if image_embeddings is not None:
                image_attention = torch.ones((batch_size, 1), dtype=torch.long, device=combined_embeds.device)
                combined_attention_mask = torch.cat([image_attention, attention_mask], dim=1)
            else:
                combined_attention_mask = attention_mask
        else:
            combined_attention_mask = None
        
        # Ensure combined_embeds is in float16
        combined_embeds = combined_embeds.to(torch.float16)

        if labels is not None and image_embeddings is not None:
            pad_labels = torch.full((batch_size, 1), -100, dtype=labels.dtype, device=labels.device)
            labels = torch.cat([pad_labels, labels], dim=1)

        outputs = self.language_model(
            inputs_embeds=combined_embeds,
            attention_mask=combined_attention_mask,
            labels=labels,
            **kwargs
        )
        
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        return self.language_model.prepare_inputs_for_generation(*args, **kwargs)
    
    def count_trainable_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
    
    def count_total_parameters(self):
        return sum(p.numel() for p in self.parameters())

def prepare_dataset(json_file, image_ID):
    with open(json_file, 'r') as f:
        data = json.load(f)
    
    prepared_data = []
    for item in data:
        if item['image'].split('.')[0] not in image_ID:
            continue
        
        conversations = item['conversations']
        prompt = ""
        response = ""
        
        for conv in conversations:
            if conv['from'] == 'human':
                if prompt:
                    prepared_data.append({
                        "id": item['id'],
                        "image": item['image'],
                        "prompt": prompt.strip(),
                        "response": response.strip()
                    })
                    response = ""
                prompt += conv['value'] + "\n"
            else:
                response += conv['value'] + "\n"
        
        if prompt and response:
            prepared_data.append({
                "id": item['id'],
                "image": item['image'],
                "prompt": prompt.strip(),
                "response": response.strip()
            })
    
    return prepared_data

class InstructDataset(torch.utils.data.Dataset):
    def __init__(self, instruct_data, clip_embeddings, tokenizer, image_ID, max_length=512):
        self.instruct_data = instruct_data
        self.clip_embeddings = clip_embeddings
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_ID = image_ID

    def __len__(self):
        return len(self.instruct_data)
    
    def __getitem__(self, idx):
        item = self.instruct_data[idx]
        
        full_text = ""
        for conv in item['conversations']:
            if conv['from'] == 'human':
                full_text += f"Human: {conv['value']}\n"
            else:
                full_text += f"Assistant: {conv['value']}\n"
        
        image_id = item["image"].split('.')[0]
        img_idx = self.image_ID.get(image_id, 0)
        image_embedding = self.clip_embeddings[img_idx]

        encoded = self.tokenizer(
            full_text, 
            return_tensors="pt", 
            truncation=True, 
            max_length=self.max_length,
            padding="max_length"
        )

        return {
            "input_ids": encoded.input_ids.squeeze().to(device),
            "attention_mask": encoded.attention_mask.squeeze().to(device),
            "image_embeddings": image_embedding.to(device),
            "labels": encoded.input_ids.squeeze().to(device),
        }
        

Using device: cuda


In [4]:
processor, model_clip = load_clip()
image_dir = "/kaggle/input/image-train2017/train2017" #"/kaggle/input/small-train/small_train" 
embeddings_path = "data/clip_embeddings.pt"
image_ID = generate_clip_embeddings(processor, model_clip, image_dir, embeddings_path)

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [5]:
print("hii")

hii


In [6]:

clip_embeddings = torch.load(embeddings_path)
clip_embedding_dim = clip_embeddings.shape[1]

  clip_embeddings = torch.load(embeddings_path)


In [7]:

# Load your Phi model with 4-bit quantization
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     bnb_8bit_use_double_quant=True,
#     bnb_8bit_quant_type="nf8",
#     bnb_8bit_compute_dtype=torch.float16
# )

# if os.path.exists("/kaggle/working/local_phi2_model"):
#     model_phi = AutoModelForCausalLM.from_pretrained(
#         "/kaggle/working/local_phi2_model",
#         quantization_config=bnb_config,
#         device_map="auto"
#     )
# else:
model_phi = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    quantization_config=bnb_config,
    device_map="auto"
)
model_phi.save_pretrained("/kaggle/working/local_phi2_model")

# -- tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
# if os.path.exists("/kaggle/working/local_phi2_model"):
#     tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/local_phi2_model")
#     print("tokenizer loaded from local")
# else:
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
tokenizer.save_pretrained("/kaggle/working/local_phi2_model")
print("tokenizer loaded from HF")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer loaded from HF


In [8]:

projection_layer = ProjectionLayer(clip_embedding_dim, model_phi.config.hidden_size).to(device)
projection_layer = projection_layer.half()  # Cast weights to float16 
multimodal_phi = MultimodalPhiWithAdapter(model_phi, projection_layer).to(device)
multimodal_phi.config = model_phi.config  # Ensure the config is set


In [9]:


# Prepare the model for k-bit training
model_phi = prepare_model_for_kbit_training(model_phi)

# Configure QLoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
multimodal_phi = get_peft_model(multimodal_phi, lora_config)

# After applying LoRA, if you want to unfreeze certain parts:
multimodal_phi.set_trainable_params(freeze_language_model=True, freeze_projection_layer=False)

# Ensure the model is on the GPU
multimodal_phi = multimodal_phi.to(device)


In [10]:

# Add this new code block
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

total_params = count_parameters(multimodal_phi)
trainable_params = count_trainable_parameters(multimodal_phi)
projection_params = count_parameters(projection_layer)
adapter_params = trainable_params  # Since only adapter layers are trainable

print(f"Total parameters: {total_params:,}")
print(f"Projection layer parameters: {projection_params:,}")
print(f"Adapter (trainable) parameters: {adapter_params:,}")
print(f"Percentage of trainable parameters: {(adapter_params / total_params) * 100:.2f}%")


Total parameters: 1,524,016,640
Projection layer parameters: 1,313,280
Adapter (trainable) parameters: 1,313,280
Percentage of trainable parameters: 0.09%


In [11]:
# !pip install --upgrade pyarrow datasets
# -- instruct_data = load_dataset("liuhaotian/LLaVA-Instruct-150K", split='train')
# Check for local copy first
# if os.path.exists("/kaggle/working/model_instruct150k"):
#     instruct_data = load_dataset("/kaggle/working/model_instruct150k", split='train')
#     print(instruct_data.column_names)
#     print(instruct_data[0])  # Check the first row
#     print("instruct data loaded from local")
# else:
#     print("loading instruct data from HF")
#     instruct_data = load_dataset("liuhaotian/LLaVA-Instruct-150K", split='train', streaming=True)
#     for example in instruct_data.take(1):
#         print(example)  # Print the first example
#     # Save the dataset locally for future use
#     # instruct_data.save_to_disk("/kaggle/working/model_instruct150k")
# print("instruct data loaded")

# instruct_data = instruct_data.filter(lambda x: x['id'] in image_ID.keys())
# train_dataset = InstructDataset(instruct_data, clip_embeddings, tokenizer, image_ID)

#--
import json

def load_instruct_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    processed_data = []
    for item in data:
        if isinstance(item['id'], int):
            item['id'] = str(item['id'])  # Convert int to string
        processed_data.append(item)
    
    return processed_data

# Load the data
json_path = "/kaggle/input/instruct-json-150k/llava_instruct_150k.json"
if os.path.exists(json_path):
    instruct_data = load_instruct_data(json_path)
    print("instruct data loaded from local")
else:
    print("Please download the JSON file manually and place it in 'model_instruct150k/train.json'")
    raise FileNotFoundError("train.json not found")

# Modify InstructDataset to handle IterableDataset
class ModifiedInstructDataset(InstructDataset):
    def __getitem__(self, idx):
        item = self.instruct_data[idx]
        
        full_text = ""
        for conv in item['conversations']:
            if conv['from'] == 'human':
                full_text += f"Human: {conv['value']}\n"
            else:
                full_text += f"Assistant: {conv['value']}\n"
        
        image_id = item["image"].split('.')[0]
        img_idx = self.image_ID.get(image_id, 0)
        image_embedding = self.clip_embeddings[img_idx].cpu()  # Ensure it's on CPU

        encoded = self.tokenizer(
            full_text, 
            return_tensors="pt", 
            truncation=True, 
            max_length=self.max_length,
            padding="max_length"
        )

#         return {
#             "input_ids": encoded.input_ids.squeeze().to(torch.long),
#             "attention_mask": encoded.attention_mask.squeeze().to(torch.long),
#             "image_embeddings": image_embedding.to(torch.float16),  # Change to float16
#             "labels": encoded.input_ids.squeeze().to(torch.long),
#         }
        return {
            "input_ids": encoded.input_ids.squeeze().to(torch.long),  # Cast to torch.long
            "attention_mask": encoded.attention_mask.squeeze().to(torch.float16), 
            "image_embeddings": image_embedding.to(torch.float16),
            "labels": encoded.input_ids.squeeze().to(torch.long),  # Ensure labels are integers (torch.long)
        }

# Filter the data
instruct_data = [item for item in instruct_data if item['id'] in image_ID.keys()]
train_dataset = ModifiedInstructDataset(instruct_data, clip_embeddings, tokenizer, image_ID)
#--



instruct data loaded from local


In [12]:
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device count: {torch.cuda.device_count()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")
print(f"Model device: {next(multimodal_phi.parameters()).device}")

Is CUDA available: True
Current device: 0
Device count: 1
Device name: Tesla P100-PCIE-16GB
Model device: cuda:0


In [13]:

from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import TrainerCallback

# Create the multimodal model
multimodal_phi = MultimodalPhiWithAdapter(model_phi, projection_layer)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
multimodal_phi = get_peft_model(multimodal_phi, lora_config)

# Ensure the model is on the correct device
multimodal_phi = multimodal_phi.to(device)

# Update the training arguments
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Reduce batch size if you encounter memory issues
    learning_rate=2e-5,
    save_steps=50,
    save_total_limit=4,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
    bf16=False,
    tf32=False,
    max_steps=120000,
    logging_steps=250,
    logging_first_step=True,
    dataloader_pin_memory=False,
    report_to="none",
    # Add these lines to ensure proper device usage
    no_cuda=False,
    local_rank=-1,
)

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, threshold=0.01):
        self.threshold = threshold
        self.previous_loss = float('inf')

    def on_log(self, args, state, control, logs=None, **kwargs):
        # This method is called every time the trainer logs (as specified by logging_steps)
        if state.global_step > 1:
            current_loss = logs.get("loss", None)
            if current_loss is not None:
                loss_diff = abs(self.previous_loss - current_loss)
                print(f"Current loss: {current_loss}, Previous loss: {self.previous_loss}, Difference: {loss_diff}")
                if loss_diff < self.threshold:
                    print(f"Stopping early. Loss difference ({loss_diff}) is less than threshold ({self.threshold}).")
                    control.should_training_stop = True
                self.previous_loss = current_loss

# Create the trainer
trainer = Trainer(
    model=multimodal_phi,
    args=training_args,
    train_dataset=train_dataset,
    callbacks=[EarlyStoppingCallback(threshold=0.001)]    
)

# Start training
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
    import traceback
    traceback.print_exc()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,6.4063
250,2.3135
500,0.6252
750,0.6023
1000,0.5785
1250,0.5709
1500,0.5936
1750,0.5726
2000,0.5786
2250,0.5848


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 2.3135, Previous loss: inf, Difference: inf


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.6252, Previous loss: 2.3135, Difference: 1.6883


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.6023, Previous loss: 0.6252, Difference: 0.02290000000000003


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5785, Previous loss: 0.6023, Difference: 0.023799999999999932


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5709, Previous loss: 0.5785, Difference: 0.007600000000000051


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5936, Previous loss: 0.5709, Difference: 0.022700000000000053


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5726, Previous loss: 0.5936, Difference: 0.02100000000000002


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5786, Previous loss: 0.5726, Difference: 0.006000000000000005


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5848, Previous loss: 0.5786, Difference: 0.006199999999999983


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5626, Previous loss: 0.5848, Difference: 0.022199999999999998


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5596, Previous loss: 0.5626, Difference: 0.0030000000000000027


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5584, Previous loss: 0.5596, Difference: 0.0011999999999999789


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.575, Previous loss: 0.5584, Difference: 0.016599999999999948


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5595, Previous loss: 0.575, Difference: 0.015499999999999958


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5783, Previous loss: 0.5595, Difference: 0.01880000000000004


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5694, Previous loss: 0.5783, Difference: 0.008900000000000019


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Current loss: 0.5703, Previous loss: 0.5694, Difference: 0.0009000000000000119
Stopping early. Loss difference (0.0009000000000000119) is less than threshold (0.001).


In [15]:
# Save only the LoRA state dict
multimodal_phi.save_pretrained("fine_tuned_phi_lora", state_dict=multimodal_phi.state_dict())
# save projection layer
torch.save(projection_layer.state_dict(), "projection_layer.pt")

In [16]:
!zip -r my_folder1.zip /kaggle/working/

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/fine_tuned_phi_lora/ (stored 0%)
  adding: kaggle/working/fine_tuned_phi_lora/README.md (deflated 66%)
  adding: kaggle/working/fine_tuned_phi_lora/adapter_config.json (deflated 53%)
  adding: kaggle/working/fine_tuned_phi_lora/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/local_phi2_model/ (stored 0%)
  adding: kaggle/working/local_phi2_model/added_tokens.json (deflated 84%)
  adding: kaggle/working/local_phi2_model/generation_config.json (deflated 24%)
  adding: kaggle/working/local_phi2_model/special_tokens_map.json (deflated 75%)
  adding: kaggle/working/local_phi2_model/vocab.json (deflated 59%)
  adding: kaggle/working/local_phi2_model/config.json (deflated 55%)
  adding: kaggle/working/local_phi2_model/merges.txt (deflated 53%)
  adding: kaggle/working/local_phi2_model/model.safetensors (deflated 5%)
  adding: kaggle/working/local_phi2_model/tokenizer_config.json (deflated 94%)
  addi