In [33]:
!pip install -i https://test.pypi.org/simple/ bitsandbytes

Looking in indexes: https://test.pypi.org/simple/
Collecting bitsandbytes
  Downloading https://test-files.pythonhosted.org/packages/5c/e0/597d593ec3b6cf5ea7eb4894a545045bd95611de8a316a2a1eaa838a2459/bitsandbytes-0.39.0-py3-none-any.whl.metadata (9.8 kB)
Downloading https://test-files.pythonhosted.org/packages/5c/e0/597d593ec3b6cf5ea7eb4894a545045bd95611de8a316a2a1eaa838a2459/bitsandbytes-0.39.0-py3-none-any.whl (95.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.8/95.8 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.39.0


In [32]:
!pip install accelerate



In [3]:
!pip install torch torchvision torchaudio transformers requests pillow

Collecting torchaudio
  Using cached torchaudio-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting torch
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading typing_extensions-4.12.0-py3-none-any.whl.metadata (3.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Downloading nvidia_c

In [35]:
import torch
from transformers import PreTrainedModel
import torch.nn as nn
from loader.model_loader import load_vision_model, load_llm
from vision.projector import load_vision_projector
from vision.feature_select import feature_select
from vision.learned_encoding import load_learned_positional
from image_handling.padding import resize_with_padding, load_images
from image_handling.slice import split_image
from transformers import BitsAndBytesConfig
import math
import requests
from PIL import Image
from io import BytesIO

class LeMultiModalConfig:
    """Plain settings holder consumed by ``LeMultiModal``.

    Each named constructor argument is stored on the instance under the same
    attribute name. Any additional keyword arguments are accepted and ignored.
    """

    def __init__(self,
                 max_len=8,
                 device="cuda" if torch.cuda.is_available() else "cpu",
                 vision_model_path="openai/clip-vit-large-patch14-336",
                 llm_model_path="SweatyCrayfish/llama-3-8b-quantized",
                 positional_encoding_type="sinusoidal",  # Or "learned", "none"
                 **kwargs):
        # Note: the ``device`` default is evaluated once, when the class is defined.
        settings = {
            "max_len": max_len,
            "device": device,
            "vision_model_path": vision_model_path,
            "llm_model_path": llm_model_path,
            "positional_encoding_type": positional_encoding_type,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)

class LeMultiModal(nn.Module):
    """Glue module joining a vision encoder, a projector and a causal LLM.

    Images are encoded by ``self.vision_model``, mapped by ``self.vision_projector``
    and spliced into the LLM's input embedding sequence at the ``<image>``
    placeholder of the prompt.

    The vision model, image processor, LLM, tokenizer, projector and learned
    positional module all come from project loaders whose implementations are not
    visible here; comments about their outputs are marked as assumptions.
    """

    def __init__(self, config: "LeMultiModalConfig"):
        """Load all sub-models described by ``config`` and cache derived sizes."""
        super().__init__()
        self.config = config
        self.device = config.device
        self.max_len = config.max_len
        # NOTE(review): this config is built but never handed to ``load_llm`` (which
        # receives ``quantization_config=None``), and it combines ``load_in_8bit`` with
        # a 4-bit-only option. Confirm whether the LLM is meant to be loaded quantized.
        self.quantization_config = BitsAndBytesConfig(load_in_8bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
        self.vision_model, self.image_processor = load_vision_model(config.vision_model_path, device=self.device)
        self.llm, self.tokenizer = load_llm(config.llm_model_path, device=self.device, quantization_config=None)
        self.vision_projector = load_vision_projector()
        self.llm_dim = self.llm.config.hidden_size
        self.vision_dim = self.vision_model.config.hidden_size
        self.learned_positional = load_learned_positional(self.max_len, self.llm_dim)
        # Separator embeddings used when laying out image slices: index 0 is "\n"
        # (end of a row), index 1 is "," (between slices). Special tokens are disabled
        # so a tokenizer that prepends BOS does not leak it into the separators.
        self.uhd_sepparators = self.get_token_embeddings(["\n", ","], add_special_tokens=False)

    def get_token_embeddings(self, text, add_special_tokens=True):
        """Tokenize ``text`` and look the ids up in the LLM's input embedding table.

        Args:
            text: A string, or a list of strings that tokenize to equal lengths.
            add_special_tokens: Forwarded to the tokenizer. The default keeps the
                tokenizer's own behaviour.

        Returns:
            The embedding tensor for the token ids, computed without gradients.
        """
        input_ids = self.tokenizer(text, add_special_tokens=add_special_tokens).input_ids

        with torch.no_grad():  # Embedding lookup only; no gradients needed
            # Explicit long dtype so that an empty id list still yields a valid index tensor.
            ids = torch.tensor(input_ids, dtype=torch.long).to(self.device)
            embeddings = self.llm.get_input_embeddings()(ids)

        return embeddings

    @staticmethod
    def get_positional_encoding(max_seq_len, embedding_dim):
        """Build a sinusoidal positional-encoding table.

        Declared static because it uses no instance state and is invoked as
        ``self.get_positional_encoding(max_seq_len, embedding_dim)``.

        Args:
            max_seq_len: Number of positions (rows).
            embedding_dim: Encoding width (columns); odd widths are supported.

        Returns:
            A ``(max_seq_len, embedding_dim)`` tensor with sine values in even
            columns and cosine values in odd columns.
        """
        position_encoding = torch.zeros(max_seq_len, embedding_dim)
        position = torch.arange(0, max_seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2) * (-math.log(10000.0) / embedding_dim))
        angles = position * div_term
        position_encoding[:, 0::2] = torch.sin(angles)
        # For an odd width there is one fewer cosine column than sine columns.
        position_encoding[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])
        return position_encoding

    def _to_pixel_values(self, images):
        """Return model-ready pixel tensors on ``self.device``.

        Tensors are passed through unchanged apart from the device move; anything
        else is run through ``self.image_processor`` first.
        """
        if isinstance(images, torch.Tensor):
            return images.to(self.device)
        return self.image_processor.preprocess(images, return_tensors='pt')['pixel_values'].to(self.device)

    def _build_input_embeddings(self, image, text):
        """Assemble the 2-D embedding sequence for ``text``, splicing in ``image``."""
        if "<image>" not in text:
            return self.get_token_embeddings(text)

        # Supports just 1 image for now
        assert text.count("<image>") == 1, "exactly one <image> placeholder is supported"
        image_embeddings = self.encode_images_no_positional_encoding(image)
        # Collapse any leading batch dimension so the image tokens line up with the
        # 2-D text embeddings (assumes a single image per prompt).
        image_embeddings = image_embeddings.reshape(-1, image_embeddings.shape[-1])

        before, after = text.split("<image>")
        parts = []
        if len(before) > 0:
            parts.append(self.get_token_embeddings(before))
        parts.append(image_embeddings)
        if len(after) > 0:
            # No special tokens here: a BOS token must not appear mid-sequence.
            parts.append(self.get_token_embeddings(after, add_special_tokens=False))

        # Bring every part to the dtype/device of the LLM's embedding table before joining.
        embed_dtype = self.llm.get_input_embeddings().weight.dtype
        return torch.cat([part.to(device=self.device, dtype=embed_dtype) for part in parts], dim=0)

    def processs(self, image, text):
        """Alias of :meth:`forward`, kept (including its spelling) for existing callers."""
        return self.forward(image, text)

    def forward(self, image, text):
        """Run one LLM forward pass over the prompt and decode the greedy predictions.

        Args:
            image: Image for the ``<image>`` placeholder. Ignored when ``text``
                contains no placeholder.
            text: Prompt containing at most one ``<image>`` placeholder.

        Returns:
            The decoded string of the highest-scoring token at every input position.
        """
        new_embeddings = self._build_input_embeddings(image, text)

        # run the embeddings through the llm and return the result in clear text
        with torch.no_grad():
            # Embeddings must go in as ``inputs_embeds``; the first positional
            # argument of the causal LM is ``input_ids``.
            output = self.llm(inputs_embeds=new_embeddings.unsqueeze(0))
            # Logits are scores over the vocabulary; reduce to token ids before decoding.
            token_ids = output.logits.argmax(dim=-1)[0]
            return self.tokenizer.decode(token_ids)

    def encode_images_positional_encoding(self, images, padding = True, sinusoidal_encoding = True, learned_encoding = False):
        """Encode a batch of images, optionally adding positional encodings.

        Args:
            images: Images accepted by ``self.image_processor.preprocess``.
            padding: When true, pad the batch with all-zero images up to ``self.max_len``.
            sinusoidal_encoding: Add the sinusoidal table to the selected features.
            learned_encoding: Add the output of ``self.learned_positional``.

        Returns:
            The output of ``self.vision_projector`` on the encoded features.
        """
        image_tensors = self.image_processor.preprocess(images, return_tensors='pt')['pixel_values'].to(self.device)
        # for the case where there are fewer than max_len images, add empty tensors
        missing = self.max_len - image_tensors.shape[0]
        if padding and missing > 0:
            filler = image_tensors.new_zeros((missing, *image_tensors.shape[1:]))
            image_tensors = torch.cat((image_tensors, filler), dim=0)

        with torch.no_grad():
            batch_features = self.vision_model(image_tensors, output_hidden_states=True)
            image_features = batch_features.hidden_states[-1]
            image_features = feature_select(image_features, "patch")
            # Positional Encoding
            if sinusoidal_encoding:
                max_seq_len = image_features.shape[1]
                pos_encoding = self.get_positional_encoding(max_seq_len, image_features.shape[-1])
                image_features = image_features + pos_encoding.to(device=image_features.device, dtype=image_features.dtype)

        # Learned Positional Encoding
        if learned_encoding:
            # NOTE(review): ``self.learned_positional`` is built with ``llm_dim`` while
            # these features come straight from the vision model; confirm the module
            # accepts features of this width.
            image_features = image_features + self.learned_positional(image_features)

        return self.vision_projector(image_features)

    def images_uhd_positional_encoding(self, image):
        """Split ``image`` into slices and encode them with positional encodings."""
        # NOTE(review): the padded/resized overview image is computed but not used below.
        resized_image = resize_with_padding(image, 336)
        splits, h, w = split_image(image)
        return self.encode_images_positional_encoding(splits)

    def imaged_uhd_arranged(self, image):
        """Encode the slices of ``image`` and interleave them with separator embeddings.

        The slices are walked as an ``h`` by ``w`` grid in row-major order: each
        slice is followed by the "," separator and each row by the "\\n" separator.

        Returns:
            A list of embedding tensors in that order.
        """
        # NOTE(review): the padded/resized overview image is computed but not used below.
        resized_image = resize_with_padding(image, 336)
        splits, h, w = split_image(image)

        embeddings = self.encode_images_no_positional_encoding(splits)
        new_embeddings = []
        for i in range(h):
            for j in range(w):
                new_embeddings.append(embeddings[i*w+j])
                new_embeddings.append(self.uhd_sepparators[1])
            new_embeddings.append(self.uhd_sepparators[0])

        return new_embeddings

    def encode_images_no_positional_encoding(self, image_tensors):
        """Encode images with the vision model and project them, without positional encoding.

        Args:
            image_tensors: A pixel-value tensor, or raw images that
                ``self.image_processor`` can preprocess.

        Returns:
            The output of ``self.vision_projector`` on the selected patch features.
        """
        pixel_values = self._to_pixel_values(image_tensors)
        with torch.no_grad():
            batch_features = self.vision_model(pixel_values, output_hidden_states=True)
            image_features = batch_features.hidden_states[-1]
            image_features = feature_select(image_features, "patch")
        return self.vision_projector(image_features)

In [36]:
# Create configuration
config = LeMultiModalConfig()

# Initialize the model
model = LeMultiModal(config)

# NOTE: `img` is created by the `img = load_img(img_link)` cell further down the
# notebook; that cell has to be run before this one.
image = img

# Prepare text input (replace with your actual text input)
# This prompt has no "<image>" placeholder, so only the text is fed to the LLM.
text = "This is a "

# Get model output
# Call the module itself instead of .forward() so that nn.Module hooks are run.
output = model(image, text)
print(output)


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at SweatyCrayfish/llama-3-8b-quantized and are newly initialized: ['model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 

In [18]:
!pip install boto3



In [None]:
!pip install python-dotenv

In [19]:
import boto3
from dotenv import load_dotenv
import os
# Pull in variables from the .env file, then read the AWS credentials from the environment.
# Either value is None when its variable is not set.
load_dotenv()

aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Module-level S3 client used by the download helpers in this cell.
client = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
#Download the images from s3
def download_image(bucket_name, image_path, local_path='tmp.jpg'):
    """Download one object from S3 to a local file using the module-level ``client``.

    Args:
        bucket_name: Name of the S3 bucket.
        image_path: Key of the object inside the bucket.
        local_path: Destination file. Defaults to 'tmp.jpg', the location that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The local path the object was written to.
    """
    client.download_file(bucket_name, image_path, local_path)
    return local_path

def download_data(bucket_name, image_path, local_path='data.json'):
    """Download a dataset file from S3 to a local file using the module-level ``client``.

    Args:
        bucket_name: Name of the S3 bucket.
        image_path: Key of the object inside the bucket (name kept for existing callers).
        local_path: Destination file. Defaults to 'data.json', the location that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The local path the object was written to.
    """
    client.download_file(bucket_name, image_path, local_path)
    return local_path

# Fetch the dataset file from the bucket (runs every time this cell is executed).
download_data(
    bucket_name="multimodal-ai-dataset",
    image_path="sharegpt4v/sharegpt4v_instruct_gpt4-vision_cap100k.json",
)

def load_img(image_path):
    """Download ``image_path`` from the "multimodal-ai-dataset" bucket and load it.

    The object is fetched with ``download_image`` (which writes it to 'tmp.jpg'),
    then ["tmp.jpg"] is handed to ``load_images`` and the first element of its
    result is returned.
    """
    download_image("multimodal-ai-dataset", image_path)
    return load_images(["tmp.jpg"])[0]

In [21]:
import json

# Read the downloaded dataset. The encoding is set explicitly so the read does not
# depend on the platform's default text encoding.
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

In [28]:
# Image path recorded on the first entry of the dataset
img_link = data[0]["image"]
print(img_link)
# Keep the third '/'-separated component and re-root it under 'coco/train/'
img_link = f"coco/train/{img_link.split('/')[2]}"
print(img_link)

coco/train2017/000000000009.jpg
coco/train/000000000009.jpg


In [30]:
# Download and load the image selected above; `img` is what the model demo cell reads.
img = load_img(image_path=img_link)