In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoProcessor, AutoModelForVision2Seq, AutoConfig
from datasets import load_dataset
from PIL import Image
import random
from tqdm import tqdm
import numpy as np
import gc
import math
import time
import os
import pandas as pd
import shutil

In [None]:
# quantized_model_dir = '/kaggle/working/quantized_model_8bit'
# if os.path.exists(quantized_model_dir):
#     shutil.rmtree(quantized_model_dir)
#     print(f"Removed directory: {quantized_model_dir}")

In [None]:
# Where the Flickr30k inputs live and where the sampled calibration images go.
base_path = '/kaggle/input/flickr30k'
image_folder_name = 'flickr30k_images'
captions_file_name = 'captions.txt'
calibration_output_dir = '/kaggle/working/flickr30k_calibration'

image_folder = os.path.join(base_path, image_folder_name)
captions_file = os.path.join(base_path, captions_file_name)

# The captions file is read with header=None into three named columns.
df_captions = pd.read_csv(
    captions_file,
    delimiter=',',
    header=None,
    names=['image_name', 'caption_index', 'caption'],
)
df_captions['caption'] = df_captions['caption'].astype(str).str.strip()

# image_name -> list of its captions
captions_dict = df_captions.groupby('image_name')['caption'].apply(list).to_dict()

# Keep only the images that are actually present on disk.
candidate_paths = {name: os.path.join(image_folder, name) for name in captions_dict}
image_paths = {name: path for name, path in candidate_paths.items() if os.path.exists(path)}

# Draw up to 500 images at random (without replacement).
available_image_names = list(image_paths)
sample_size = min(500, len(available_image_names))
sample_image_names = random.sample(available_image_names, sample_size)

# Copy the sampled images into the working directory used by later cells.
os.makedirs(calibration_output_dir, exist_ok=True)
for picked_name in sample_image_names:
    shutil.copy(image_paths[picked_name], os.path.join(calibration_output_dir, picked_name))

In [None]:
import sys
import os

# Directory added to the import search path so that the `gptq` and `quant`
# imports that follow can be resolved (presumably they live here — confirm
# against the attached Kaggle dataset).
module_dir = '/kaggle/input/gptypys/'

# Insert at the front of sys.path, but only once, so re-running this cell does
# not pile up duplicate entries.
if module_dir not in sys.path:
    sys.path.insert(0, module_dir)

import gptq
import quant

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
from PIL import Image
import os
import random

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Choose the device once: CUDA with float16 when available, otherwise CPU with float32.
_cuda_ok = torch.cuda.is_available()
device = "cuda" if _cuda_ok else "cpu"
torch_dtype = torch.float16 if _cuda_ok else torch.float32

# Model identifier and dataset locations used by the rest of this cell.
MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
base_path = '/kaggle/input/flickr30k'
captions_file_name = 'captions.txt'
calibration_output_dir = '/kaggle/working/flickr30k_calibration'
captions_file = os.path.join(base_path, captions_file_name)

# Prepare single input function
def prepare_single_input_phi3(img_path, caption_text, processor):
    """Build processor inputs for one image/caption pair.

    Args:
        img_path: Path of the image file to load.
        caption_text: Text placed after the image placeholder in the user turn;
            it is converted with ``str()`` and stripped.
        processor: Object exposing ``tokenizer.apply_chat_template`` and callable
            as ``processor(prompt, [image], return_tensors="pt")``.

    Returns:
        Whatever ``processor(...)`` returns, or ``None`` (after printing a
        warning) when ``img_path`` does not exist.
    """
    if os.path.exists(img_path):
        rgb_image = Image.open(img_path).convert("RGB")

        # Single user turn: image placeholder, newline, then the caption text.
        chat = [{"role": "user", "content": "<|image_1|>\n" + str(caption_text).strip()}]
        prompt_text = processor.tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=True,
        )
        return processor(prompt_text, [rgb_image], return_tensors="pt")

    print(f"Warning: Image path not found: {img_path}")
    return None

# Load the model and processor
print(f"Loading model: {MODEL_ID}")

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    device_map="auto",
    trust_remote_code=True,
    _attn_implementation='eager'
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True
)

model.eval()

# Load captions data (read with header=None into three named columns) and
# build an image_name -> [captions] lookup.
df_captions = pd.read_csv(captions_file, delimiter=',', header=None, names=['image_name', 'caption_index', 'caption'])
df_captions['caption'] = df_captions['caption'].astype(str).str.strip()
captions_dict = df_captions.groupby('image_name')['caption'].apply(list).to_dict()

# Select random image with corresponding caption
image_dir = calibration_output_dir
calib_image_names = [f for f in os.listdir(image_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

# Fail with a clear message instead of the IndexError random.choice would raise
# on an empty list (same guard as the post-quantization inference cell).
if not calib_image_names:
    raise ValueError(f"No images found in calibration directory: {image_dir}")

test_img_name = random.choice(calib_image_names)
test_img_path = os.path.join(image_dir, test_img_name)
# Fall back to a generic instruction when the image has no caption entry.
test_captions = captions_dict.get(test_img_name, ["Describe the image."])
test_caption = test_captions[0]

print(f"Selected sample: Image='{test_img_name}', Caption='{test_caption}'")

# Prepare input
inputs = prepare_single_input_phi3(test_img_path, test_caption, processor)
# prepare_single_input_phi3 returns None when the image file is missing;
# stop here rather than failing later on `None.items()`.
if inputs is None:
    raise ValueError("Input preparation failed.")
inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}

# Run inference (greedy decoding: do_sample=False)
generation_args = {
    "max_new_tokens": 500,
    "temperature": 0.0,
    "do_sample": False,
}

with torch.no_grad():
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        **generation_args
    )

    # Drop the prompt tokens so that only the newly generated part is decoded.
    input_token_len = inputs['input_ids'].shape[1]
    generate_ids = generate_ids[:, input_token_len:]

    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    print("\nInference successful!")
    print("\nDecoded Output:", response)

print("\n--- Single Instance Check Complete ---")


In [None]:
import os
import random
import shutil
import time
import math
import gc

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import transformers
from transformers import AutoModelForCausalLM, AutoProcessor

import quant
import gptq

# === Assume model, processor, device, torch_dtype are loaded ===
# `model` and `processor` come from the loading cell; `device` is re-derived
# from the device of the model's first parameter.
model.eval()
device = next(model.parameters()).device

# Quantizer settings passed to quant.Quantizer.configure further down.
QUANT_BITS = 8
QUANT_PERCHANNEL = True
QUANT_SYMMETRIC = True

calibration_output_dir = '/kaggle/working/flickr30k_calibration'
captions_file = '/kaggle/input/flickr30k/captions.txt'

# Both inputs must exist before any work is done.
if os.path.isdir(calibration_output_dir) is False:
    raise FileNotFoundError(f"Calibration directory not found: {calibration_output_dir}")
if os.path.isfile(captions_file) is False:
    raise FileNotFoundError(f"Captions file not found: {captions_file}")

# image_name -> list of captions, built from the captions file.
df_captions = pd.read_csv(
    captions_file,
    delimiter=',',
    header=None,
    names=['image_name', 'caption_index', 'caption'],
)
df_captions['caption'] = df_captions['caption'].astype(str).str.strip()
captions_dict = df_captions.groupby('image_name')['caption'].apply(list).to_dict()

# Collect the image files available for calibration.
calib_image_names = []
for entry in os.listdir(calibration_output_dir):
    if entry.lower().endswith(('.png', '.jpg', '.jpeg')):
        calib_image_names.append(entry)

if len(calib_image_names) == 0:
    raise ValueError(f"No images found in calibration directory: {calibration_output_dir}")

# Use at most 100 randomly chosen images for calibration.
n_calib_samples = min(100, len(calib_image_names))
calib_image_names = random.sample(calib_image_names, n_calib_samples)


def prepare_single_input(img_path, caption_text, processor):
    """Build processor inputs for one calibration image/caption pair.

    Args:
        img_path: Path of the image file to load.
        caption_text: Caption string placed after the image placeholder
            (``.strip()`` is called on it directly).
        processor: Object exposing ``tokenizer.apply_chat_template`` and callable
            as ``processor(prompt, [image], return_tensors="pt")``.

    Returns:
        The processor output, or ``None`` when the image is missing or when any
        exception is raised while preparing the input (a message is printed in
        both cases).
    """
    if os.path.exists(img_path):
        try:
            rgb_image = Image.open(img_path).convert("RGB")
            # Single user turn: image placeholder, newline, then the caption.
            chat = [{"role": "user", "content": "<|image_1|>\n" + caption_text.strip()}]
            prompt_text = processor.tokenizer.apply_chat_template(
                chat,
                tokenize=False,
                add_generation_prompt=True,
            )
            return processor(prompt_text, [rgb_image], return_tensors="pt")
        except Exception as err:
            # Best effort: a bad sample is reported and skipped by the caller.
            print(f"Error preparing input for {img_path}: {err}")
            return None

    print(f"Warning: {img_path} not found")
    return None


# Registries keyed by module object: module -> GPTQ wrapper, module -> dotted name.
gptq_modules = dict()
module_names = dict()

def is_target_module(module):
    """Return True for layers selected for quantization.

    A module qualifies when it is an ``nn.Linear`` instance or when its class
    is named exactly ``"Conv1D"``.
    """
    if isinstance(module, nn.Linear):
        return True
    return module.__class__.__name__ == "Conv1D"

print("Identifying target modules...")
for name, module in model.named_modules():
    if not is_target_module(module):
        continue

    # One GPTQ wrapper per target layer, each with its own configured quantizer.
    wrapper = gptq.GPTQ(module)
    wrapper.quantizer = quant.Quantizer()
    wrapper.quantizer.configure(
        bits=QUANT_BITS,
        perchannel=QUANT_PERCHANNEL,
        sym=QUANT_SYMMETRIC,
    )

    # Remember both the wrapper and the dotted module name for later reporting.
    gptq_modules[module] = wrapper
    module_names[module] = name

print(f"Found {len(gptq_modules)} target modules.")

# Handles of the forward hooks registered below, kept so they can be removed.
hook_handles = []
def make_hook(module, gptq_inst):
    """Create a forward hook that feeds layer activations to ``gptq_inst``.

    Args:
        module: The module the hook is built for (the hook itself uses the
            module object it is called with).
        gptq_inst: Object whose ``add_batch(input, output)`` is invoked on
            every forward call.

    Returns:
        A function with the ``(module, input, output)`` forward-hook signature.
        Exceptions raised by ``add_batch`` are printed, not propagated.
    """
    def _first_detached(value):
        # Hook arguments may arrive as tuples; only the first element is used.
        picked = value[0] if isinstance(value, tuple) else value
        return picked.detach()

    def hook(hooked_module, inp, out):
        layer_input = _first_detached(inp)
        layer_output = _first_detached(out)
        try:
            gptq_inst.add_batch(layer_input, layer_output)
        except Exception as err:
            print(f"Error in hook for {module_names.get(hooked_module, type(hooked_module).__name__)} during add_batch: {err}")

    return hook

def _weights_to_integer_codes(weight, scale, zero, maxq):
    """Map float weights onto integer quantization codes in ``[0, maxq]``.

    The codes are computed explicitly as ``clamp(round(w / scale) + zero, 0, maxq)``
    so that ``(code - zero) * scale`` reproduces the quantized weight.
    NOTE(review): the reference GPTQ ``quant.quantize`` returns the de-quantized
    float value rather than the integer code — confirm for the bundled module;
    computing the codes here avoids depending on that.

    Args:
        weight: Float weight tensor (output channels along dim 0).
        scale: Quantization scale, broadcastable against ``weight``.
        zero: Quantization zero point, broadcastable against ``weight``.
        maxq: Largest code value (Python number or 0-dim tensor).

    Returns:
        Tensor of codes with dtype ``torch.uint8`` when ``maxq <= 255``,
        otherwise ``torch.int32``.
    """
    max_code = int(maxq)
    scale = scale.to(weight.device)
    zero = zero.to(weight.device)
    codes = torch.clamp(torch.round(weight.float() / scale) + zero, 0, max_code)
    # Codes are unsigned: with 8 bits they reach 255, which int8 cannot hold.
    storage_dtype = torch.uint8 if max_code <= 255 else torch.int32
    return codes.to(storage_dtype)


print("Registering forward hooks...")
for module, gptq_inst in gptq_modules.items():
    handle = module.register_forward_hook(make_hook(module, gptq_inst))
    hook_handles.append(handle)

print(f"Starting calibration with {n_calib_samples} samples...")
model.eval()
processed_count = 0
try:
    with torch.no_grad():
        for i, img_name in enumerate(calib_image_names):
            # Images without a caption entry get a generic instruction.
            cap_list = captions_dict.get(img_name, ["Describe the image."])
            caption = random.choice(cap_list)
            img_path = os.path.join(calibration_output_dir, img_name)
            inputs = prepare_single_input(img_path, caption, processor)
            if inputs is None:
                continue

            try:
                inputs_on_device = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}
                # The forward pass is run only for its side effect on the hooks.
                _ = model(**inputs_on_device)
                processed_count += 1
                if (i + 1) % 20 == 0 or (i + 1) == n_calib_samples:
                    print(f"  Processed calibration sample {i+1}/{n_calib_samples}: {img_name}")
            except Exception as e:
                # Best effort: a failing sample is reported and skipped.
                print(f"Error during forward pass for calibration sample {img_name}: {e}")

    print(f"Calibration finished. Processed {processed_count} samples.")
finally:
    # Always detach the hooks, even if calibration is interrupted, so that a
    # re-run of this cell does not register a second set on the same modules.
    print("Removing forward hooks...")
    for handle in hook_handles:
        handle.remove()
    hook_handles.clear()

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Starting quantization and packing...")
quantized_layers_count = 0
failed_layers = []

for module, gptq_inst in gptq_modules.items():
    module_name = module_names.get(module, type(module).__name__)
    print(f"Quantizing and packing: {module_name}")
    try:
        # Layers that never saw a forward pass cannot be quantized.
        # (gptq_inst.free() is called once, by the finally clause below.)
        if getattr(gptq_inst, 'nsamples', None) == 0:
            print(f"  Skipping {module_name}: No calibration samples recorded.")
            failed_layers.append(module_name)
            continue

        gptq_inst.fasterquant(blocksize=128, percdamp=0.01, groupsize=-1, actorder=False)

        quantizer = getattr(gptq_inst, 'quantizer', None)
        if not (hasattr(quantizer, 'scale') and hasattr(quantizer, 'zero')):
            print(f"  Packing failed for {module_name}: Quantizer scale/zero not found after fasterquant.")
            failed_layers.append(module_name)
            continue

        scale = quantizer.scale
        zero = quantizer.zero
        maxq = getattr(quantizer, 'maxq', 2 ** QUANT_BITS - 1)

        weight = getattr(module, 'weight', None)
        if weight is None:
            print(f"  Packing failed for {module_name}: Cannot find weight attribute/parameter after fasterquant.")
            failed_layers.append(module_name)
            continue

        # Distinguish a registered parameter from a plain attribute; this
        # decides how the float weight is removed afterwards.
        weight_is_parameter = module._parameters.get('weight') is weight
        if not weight_is_parameter:
            print(f"  Warning: module.weight exists but is not registered parameter for {module_name}. Attempting packing.")

        # Conv1D weights are transposed for quantization and transposed back
        # afterwards, exactly as before.
        is_conv1d = isinstance(module, transformers.Conv1D)
        weight_to_quantize = weight.data.t() if is_conv1d else weight.data

        # Integer codes in [0, maxq]; stored unsigned so that 8-bit codes above
        # 127 are not corrupted (the previous int8 cast overflowed).
        qweight_int = _weights_to_integer_codes(weight_to_quantize, scale, zero, maxq)
        if is_conv1d:
            qweight_int = qweight_int.t()

        # Persistent buffers end up in model.state_dict() and are kept on CPU.
        module.register_buffer('qweight', qweight_int.clone().to('cpu'), persistent=True)
        module.register_buffer('scales', scale.clone().to('cpu'), persistent=True)
        module.register_buffer('qzeros', zero.clone().to('cpu'), persistent=True)

        module.quant_state = {
            'bits': QUANT_BITS,
            'groupsize': getattr(gptq_inst, 'groupsize', -1),
            'sym': QUANT_SYMMETRIC,
            'perchannel': QUANT_PERCHANNEL,
        }

        # Drop the float weight so only the packed representation remains.
        if weight_is_parameter:
            del module._parameters['weight']
            print(f"  Original weight parameter 'weight' deleted for {module_name}")
        else:
            try:
                del module.weight
                print(f"  Original weight attribute deleted for {module_name}")
            except AttributeError:
                print(f"  Warning: Could not delete weight attribute/parameter for {module_name}")

        quantized_layers_count += 1

    except Exception as e:
        print(f"  Error during fasterquant or packing for {module_name}: {e}")
        import traceback
        traceback.print_exc()
        failed_layers.append(module_name)
    finally:
        # Release the wrapper's calibration state whether or not packing succeeded.
        gptq_inst.free()

print("\nQuantization and packing complete.")
print(f"Successfully quantized and packed {quantized_layers_count} layers.")
if failed_layers:
    print(f"Failed or skipped quantization/packing for {len(failed_layers)} layers: {failed_layers}")

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("\nModel layers modified. Quantized layers now have 'qweight', 'scales', 'qzeros' registered as buffers.")


In [None]:
import os
import torch
import gc
import transformers

QUANT_BITS = 8
QUANT_PERCHANNEL = True
QUANT_SYMMETRIC = True

# --- Model weights: the full state dict goes into a single file ---
save_dir = '/kaggle/working/quantized_model_8bit'
os.makedirs(save_dir, exist_ok=True)
state_dict_path = os.path.join(save_dir, 'pytorch_model_quantized.bin')
state_dict = model.state_dict()
torch.save(state_dict, state_dict_path)
print(f"Quantized model state dictionary saved to: {state_dict_path}")

# --- Model config: best effort, problems are reported as warnings ---
try:
    if not hasattr(model, 'config'):
        print("Warning: 'model.config' not found. Configuration not saved automatically.")
    else:
        model.config.save_pretrained(save_dir)
        print(f"Model config saved to: {save_dir}")
except Exception as config_err:
    print(f"Warning: Error saving model config using save_pretrained: {config_err}")

# --- Processor: make sure the tokenizer has a chat template, then save ---
try:
    if processor is None:
        print("Warning: 'processor' is None. Processor files not saved.")
    else:
        default_chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}<|user|>\n{{ message['content'] }}<|end|>\n{% elif message['role'] == 'assistant' %}<|assistant|>\n{{ message['content'] }}<|end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}"
        has_template = False
        _tokenizer = getattr(processor, 'tokenizer', None)
        if _tokenizer is not None:
            if getattr(_tokenizer, 'chat_template', None) is not None:
                has_template = True
                print("Processor tokenizer already has a 'chat_template'.")
            else:
                # Only fill in the default when no template is present.
                print("Processor tokenizer missing 'chat_template', attempting to set default.")
                try:
                    _tokenizer.chat_template = default_chat_template
                    has_template = True
                except Exception as template_err:
                    print(f"  Warning: Failed to set default chat_template: {template_err}")

        processor.save_pretrained(save_dir)
        print(f"Processor files saved to: {save_dir}")
        if not has_template:
            print("  Warning: Processor was saved, but chat_template might still be missing/incorrect.")

except AttributeError as attr_err:
    print(f"Warning: AttributeError saving processor: {attr_err}")
    print("  Processor files might be incomplete in the save directory.")
except Exception as save_err:
    print(f"Warning: General error saving processor using save_pretrained: {save_err}")
    print("  Processor files might be incomplete in the save directory.")


# Release the state-dict reference and any cached GPU memory.
del state_dict
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("\nSaving process finished.")

In [None]:
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from PIL import Image
import transformers
from transformers import AutoModelForCausalLM, AutoProcessor
import warnings
import gc
import shutil

warnings.filterwarnings('ignore')

MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
LOAD_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
LOAD_DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
load_dir = '/kaggle/working/quantized_model_8bit'
# Must match the file name written by the saving cell
# ('pytorch_model_quantized.bin'); the previous 'pytorch_model.bin' never exists.
quantized_ckpt_filename = 'pytorch_model_quantized.bin'
state_dict_path = os.path.join(load_dir, quantized_ckpt_filename)

if not os.path.exists(state_dict_path):
    raise FileNotFoundError(f"Quantized checkpoint not found: {state_dict_path}")

print(f"Loading base model architecture from: {MODEL_ID}")
config = transformers.AutoConfig.from_pretrained(load_dir, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(load_dir, trust_remote_code=True)

# Load the base architecture
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=LOAD_DTYPE,
    _attn_implementation='eager'
)
print("Base model architecture loaded.")

print(f"Loading custom quantized state_dict from: {state_dict_path}")
quantized_state_dict = torch.load(state_dict_path, map_location='cpu')

# The freshly loaded base model has no 'qweight'/'scales'/'qzeros' buffers, so
# load_state_dict(strict=False) would silently drop them as unexpected keys and
# the later patching step would find nothing to patch. Register the buffers on
# the matching modules first.
_quant_buffer_names = ('qweight', 'scales', 'qzeros')
_qweight_suffix = '.qweight'
attached_module_count = 0
for _key in list(quantized_state_dict.keys()):
    if not _key.endswith(_qweight_suffix):
        continue
    _module_path = _key[:-len(_qweight_suffix)]
    try:
        _target_module = base_model.get_submodule(_module_path)
    except AttributeError:
        print(f"  Warning: checkpoint has quantized tensors for unknown module '{_module_path}'; skipped.")
        continue
    for _buffer_name in _quant_buffer_names:
        _buffer_key = f"{_module_path}.{_buffer_name}"
        if _buffer_key in quantized_state_dict:
            _target_module.register_buffer(
                _buffer_name, quantized_state_dict[_buffer_key].clone(), persistent=True
            )
    attached_module_count += 1
print(f"Registered quantized buffers on {attached_module_count} modules.")
# NOTE(review): the float weights loaded from MODEL_ID stay allocated on these
# modules; the patched forward ignores them. Free them if memory matters.

# Load the state dict containing qweight etc. using strict=False
load_result = base_model.load_state_dict(quantized_state_dict, strict=False)
print("State dict loading results:")
# The '.weight' keys of quantized layers are expected here: the checkpoint does not store them.
print(f"  Missing keys: {load_result.missing_keys}")
print(f"  Unexpected keys: {load_result.unexpected_keys}") # Should be empty now that the buffers are registered
del quantized_state_dict
gc.collect()

base_model.to(LOAD_DEVICE)
base_model.eval()
print(f"Model loaded onto device: {LOAD_DEVICE}")

# --- Necessary Dequantization and Patching Logic ---
def dequantize_weight(module, target_device, target_dtype=None):
    """Rebuild a float weight from a module's ``qweight``/``scales``/``qzeros`` buffers.

    The weight is computed as ``(qweight - qzeros) * scales``. Three layouts are
    accepted: scales with the same number of dims as ``qweight`` (plain
    broadcasting), 1-D scales/zeros with one entry per ``qweight`` row, and
    single-element scales/zeros.

    Args:
        module: Module carrying the three buffers.
        target_device: Device the buffers are moved to before the computation.
        target_dtype: dtype of the returned weight. When ``None`` (the previous
            behavior) the dtype of the global ``base_model`` is used.

    Returns:
        The de-quantized weight tensor in ``target_dtype``.

    Raises:
        AttributeError: If any of the three buffers is missing.
        ValueError: If the scale/zero shapes match none of the accepted layouts.
    """
    if not all(hasattr(module, attr) for attr in ('qweight', 'scales', 'qzeros')):
        raise AttributeError(f"Quantized module {module} is missing qweight, scales, or qzeros buffers.")

    qweight = module.qweight.to(target_device)
    scales = module.scales.to(target_device)
    qzeros = module.qzeros.to(target_device)

    codes_float = qweight.float()
    qzeros_float = qzeros.float()
    if scales.dim() == qweight.dim():
        dq_weight = (codes_float - qzeros_float) * scales
    elif scales.dim() == 1 and qzeros.dim() == 1 and scales.shape[0] == qweight.shape[0]:
        # One scale/zero per row: add a trailing axis so they broadcast over columns.
        dq_weight = (codes_float - qzeros_float.unsqueeze(1)) * scales.unsqueeze(1)
    elif scales.numel() == 1 and qzeros.numel() == 1:
        dq_weight = (codes_float - qzeros_float) * scales
    else:
        raise ValueError(f"Unhandled scale/zero shape for module {module}. Weight:{qweight.shape}, Scale:{scales.shape}, Zero:{qzeros.shape}.")

    if target_dtype is None:
        # Backward-compatible default: follow the dtype of the loaded model.
        target_dtype = base_model.dtype
    return dq_weight.to(target_dtype)

def quantized_layer_forward(module_instance, input_tensor, *args, **kwargs):
    """Linear forward pass that de-quantizes the weight on the fly.

    The weight (and bias, when present) are brought to the device and dtype of
    ``input_tensor`` so that ``F.linear`` always receives matching operands,
    independent of any global model object. Extra positional/keyword arguments
    are accepted and ignored.

    Args:
        module_instance: Module carrying the quantized buffers and optional ``bias``.
        input_tensor: Activations fed to the layer.

    Returns:
        ``F.linear(input_tensor, dequantized_weight, bias)``.
    """
    target_device = input_tensor.device
    dequantized_weight = dequantize_weight(module_instance, target_device, input_tensor.dtype)
    bias = getattr(module_instance, 'bias', None)
    if bias is not None:
        bias = bias.to(device=target_device, dtype=input_tensor.dtype)
    output = F.linear(input_tensor, dequantized_weight, bias)
    return output

def patch_model_for_inference(model_to_patch, missing_keys=None):
    """Replace ``forward`` on every module that carries quantized buffers.

    A module is patched when it has ``qweight``, ``scales`` and ``qzeros``
    attributes; its ``forward`` then delegates to ``quantized_layer_forward``.

    Args:
        model_to_patch: Model whose sub-modules are scanned.
        missing_keys: Keys reported missing when the state dict was loaded.
            Used only to choose the status message. When ``None`` (the previous
            behavior) the global ``load_result.missing_keys`` is used.
    """
    if missing_keys is None:
        missing_keys = load_result.missing_keys
    # Exact-match lookup: a prefix test would also match sibling modules whose
    # names merely start with the same characters.
    missing_weight_keys = {key for key in missing_keys if key.endswith('.weight')}

    patched_count = 0
    for name, module in model_to_patch.named_modules():
        if not all(hasattr(module, attr) for attr in ('qweight', 'scales', 'qzeros')):
            continue

        if f"{name}.weight" in missing_weight_keys:
            print(f"  Weight correctly missing for {name}. Patching forward method.")
        elif hasattr(module, 'weight'):
            print(f"  WARNING: Module {name} has qweight AND weight. Patching anyway.")
        else:
            print(f"  Patching forward method for: {name} (Weight status uncertain)")

        # Bind the current module through a default argument so that each
        # lambda keeps its own module rather than the loop's last one.
        module.forward = lambda *args, module=module, **kwargs: quantized_layer_forward(module, *args, **kwargs)
        patched_count += 1
    print(f"Patched {patched_count} quantized layers.")

# Apply the patch to the reloaded model: every module that carries
# qweight/scales/qzeros gets its forward replaced (see patch_model_for_inference).
print("\nPatching model for on-the-fly dequantization inference...")
patch_model_for_inference(base_model)
# --- End of Necessary Logic ---


def prepare_single_input_phi3(img_path, caption_text, processor):
    """Build processor inputs for one image/caption pair.

    Args:
        img_path: Path of the image file to load.
        caption_text: Text placed after the image placeholder; converted with
            ``str()`` and stripped.
        processor: Object exposing ``tokenizer.apply_chat_template`` and callable
            as ``processor(prompt, [image], return_tensors="pt")``.

    Returns:
        The processor output, or ``None`` (silently) when ``img_path`` does not exist.
    """
    if os.path.exists(img_path):
        rgb_image = Image.open(img_path).convert("RGB")
        # Single user turn: image placeholder, newline, then the caption text.
        chat = [{"role": "user", "content": "<|image_1|>\n" + str(caption_text).strip()}]
        prompt_text = processor.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        return processor(prompt_text, [rgb_image], return_tensors="pt")
    return None

base_path = '/kaggle/input/flickr30k'
captions_file = os.path.join(base_path, 'captions.txt')
calibration_output_dir = '/kaggle/working/flickr30k_calibration'

# End-to-end check on the patched model: pick one calibration image, generate,
# and print the decoded continuation. Failures are reported, not re-raised.
try:
    df_captions = pd.read_csv(
        captions_file,
        delimiter=',',
        header=None,
        names=['image_name', 'caption_index', 'caption'],
    )
    df_captions['caption'] = df_captions['caption'].astype(str).str.strip()
    captions_dict = df_captions.groupby('image_name')['caption'].apply(list).to_dict()

    # Regular files with an image extension only.
    calib_image_names = []
    for entry in os.listdir(calibration_output_dir):
        has_image_suffix = entry.lower().endswith(('.png', '.jpg', '.jpeg'))
        if has_image_suffix and os.path.isfile(os.path.join(calibration_output_dir, entry)):
            calib_image_names.append(entry)
    if len(calib_image_names) == 0:
        raise ValueError("No images found in calibration directory.")

    test_img_name = random.choice(calib_image_names)
    test_img_path = os.path.join(calibration_output_dir, test_img_name)
    # Fall back to a generic instruction when the image has no caption entry.
    test_captions = captions_dict.get(test_img_name, ["Describe the image."])
    test_caption = test_captions[0]

    print(f"\nRunning inference for sample: Image='{test_img_name}', Caption='{test_caption}'")

    inputs = prepare_single_input_phi3(test_img_path, test_caption, processor)
    if inputs is None:
        raise ValueError("Input preparation failed.")

    inputs = {key: tensor.to(LOAD_DEVICE, non_blocking=True) for key, tensor in inputs.items()}

    generation_args = dict(max_new_tokens=500, temperature=0.0, do_sample=False)

    print("Generating output...")
    with torch.no_grad():
        # The tokenizer may expose a list of EOS ids; only the first one is used.
        eos_token_id = processor.tokenizer.eos_token_id
        if isinstance(eos_token_id, list):
            eos_token_id = eos_token_id[0]

        generate_ids = base_model.generate( # Use the patched base_model
            **inputs,
            eos_token_id=eos_token_id,
            **generation_args
        )

        # Keep only the tokens produced after the prompt.
        input_token_len = inputs['input_ids'].shape[1]
        generate_ids = generate_ids[:, input_token_len:]
        response = processor.tokenizer.decode(
            generate_ids[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    print("\nInference successful!")
    print("\nDecoded Output:", response)

except FileNotFoundError as missing_err:
    print(f"Error: Required file or directory not found: {missing_err}")
except ValueError as value_err:
    print(f"Error: {value_err}")
except Exception as unexpected_err:
    print(f"\nAn unexpected error occurred during example inference: {unexpected_err}")
    import traceback
    traceback.print_exc()

print("\n--- Single Instance Check Complete ---")

In [None]:
from peft import get_peft_model_state_dict

# List every remaining registered parameter of the reloaded model with its dtype.
for param_name, tensor in base_model.named_parameters():
    if not hasattr(tensor, 'dtype'):
        continue
    print(f"{param_name}: {tensor.dtype}")
