<a href="https://colab.research.google.com/github/evanmiller620/ECE570Project/blob/main/FInalProjectImplementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Implementation of **Language Is Not All You Need: Aligning Perception with Language Models**

An extension of the KOSMOS model family from Microsoft Research, built on the pretrained `microsoft/kosmos-2-patch14-224` checkpoint.

In [None]:
!pip install datasets
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
import requests
from PIL import Image


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Load the pretrained KOSMOS model, then move its weights onto the selected device.
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
model = model.to(device)

# The processor from the same checkpoint prepares the text and image inputs for the model.
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

Using device: cuda


model.safetensors:  17%|#7        | 1.16G/6.66G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/191k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/32.0k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

In [None]:
prompt = "<grounding>An image of"

url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png"
# Bound the wait and raise on HTTP errors (404, 5xx, ...) so that a failed download is
# reported here instead of as a confusing PIL decoding error further down.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# The original Kosmos-2 demo saves the image first and then reloads it. For some images this
# gives a slightly different image input and changes the generation outputs.
image.save("new_image.jpg")
image = Image.open("new_image.jpg")

# Tokenize the prompt and preprocess the image, then move every tensor to the model's device.
inputs = processor(text=prompt, images=image, return_tensors="pt")
inputs = inputs.to(device)

generated_ids = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    image_embeds=None,
    image_embeds_position_mask=inputs["image_embeds_position_mask"],
    use_cache=True,
    max_new_tokens=128,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Specify `cleanup_and_extract=False` in order to see the raw model generation.
processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)

print(processed_text)
# `<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.`

# By default, the generated text is cleaned up and the entities are extracted.
processed_text, entities = processor.post_process_generation(generated_text)

print(processed_text)
# `An image of a snowman warming himself by a fire.`

print(entities)
# `[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]`

<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.
An image of a snowman warming himself by a fire.
[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForVision2Seq, AutoProcessor
import torch.nn.functional as F

class MultiModalAttention(nn.Module):
    """Joint self-attention over the concatenated text and image token sequences.

    Inputs and output are batch-first: (batch, tokens, embed_size).

    Args:
        embed_size: Feature width shared by the text and image tokens.
        num_heads: Number of attention heads; must divide `embed_size`.
    """

    def __init__(self, embed_size, num_heads):
        super().__init__()
        # `nn.MultiheadAttention` defaults to a (seq, batch, embed) layout. `forward` joins the
        # two modalities along dim=1, which is the token axis only for batch-first tensors, so
        # the layer must be batch-first as well; otherwise attention would run across the
        # samples of a batch instead of across the tokens of one sample.
        self.attention = nn.MultiheadAttention(
            embed_dim=embed_size, num_heads=num_heads, batch_first=True
        )

    def forward(self, text_features, image_features):
        """Attend over all text and image tokens of each sample.

        Args:
            text_features: Tensor of shape (batch, text_tokens, embed_size).
            image_features: Tensor of shape (batch, image_tokens, embed_size).

        Returns:
            Tensor of shape (batch, text_tokens + image_tokens, embed_size); text positions
            come first, followed by image positions.
        """
        # Combine features along the token axis for cross-modal attention
        combined_features = torch.cat((text_features, image_features), dim=1)
        # Self-attention: the combined sequence serves as query, key and value.
        attn_output, _ = self.attention(combined_features, combined_features, combined_features)
        return attn_output

class ExtendedKOSMOS(nn.Module):
    """Pretrained KOSMOS image features fused with a newly initialised text encoder.

    Images are encoded by the vision branch of the pretrained base model, texts by a fresh
    Transformer encoder. Both are projected to `embed_size`, fused by
    `MultiModalAttention`, normalised, and passed through a per-token classification head.

    All tensors are batch-first: (batch, tokens, features).

    Args:
        base_model_name: Checkpoint name passed to `AutoModelForVision2Seq.from_pretrained`.
        embed_size: Width of the shared text/image embedding space.
        num_heads: Number of attention heads in the text encoder and the fusion layer.
    """

    def __init__(self, base_model_name, embed_size=768, num_heads=8):
        super().__init__()
        self.base_model = AutoModelForVision2Seq.from_pretrained(base_model_name)

        # Embedding table so that `forward` can accept the integer token ids produced by the
        # processor; sized from the base model's own vocabulary.
        vocab_size = self.base_model.get_input_embeddings().num_embeddings
        self.token_embedding = nn.Embedding(vocab_size, embed_size)

        # Text encoder. batch_first=True matches the (batch, tokens, features) layout of the
        # image embeddings and of the dim=1 concatenation done during fusion.
        self.text_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads, batch_first=True),
            num_layers=6,
        )

        # Image embedding projection. 2048 is hard-coded; presumably the width of the projected
        # image embeddings of kosmos-2-patch14-224 -- TODO confirm / adjust for other checkpoints.
        self.image_projection = nn.Linear(2048, embed_size)

        # Text embedding projection
        self.text_projection = nn.Linear(embed_size, embed_size)

        # Multimodal attention layer
        self.multi_modal_attention = MultiModalAttention(embed_size, num_heads)

        # Classification head (optional)
        self.classification_head = nn.Linear(embed_size, 10)  # Example for 10 classes, adjust as needed

        # Layer normalization and dropout
        self.layer_norm = nn.LayerNorm(embed_size)
        self.dropout = nn.Dropout(p=0.1)

    def _encode_images(self, images):
        """Run only the image branch of the base model and return its image embeddings."""
        # The base model's forward() accepts neither an `images` keyword nor a call without
        # text inputs, and its output has no `last_hidden_state`, so the vision tower and the
        # image-to-text projection are called directly.
        # NOTE(review): this mirrors the image path of Hugging Face's Kosmos-2 forward pass and
        # relies on the submodule names `vision_model`, `vision_model.model.post_layernorm` and
        # `image_to_text_projection` -- confirm against the installed transformers version.
        vision_outputs = self.base_model.vision_model(pixel_values=images)
        image_embeds = self.base_model.vision_model.model.post_layernorm(vision_outputs[0])
        image_embeds = F.normalize(image_embeds, dim=-1)
        image_embeds, _ = self.base_model.image_to_text_projection(image_embeds)
        return image_embeds

    @staticmethod
    def _pool_tokens(embeddings):
        """Mean-pool (batch, tokens, dim) embeddings to (batch, dim); pass 2-D input through."""
        return embeddings.mean(dim=1) if embeddings.dim() == 3 else embeddings

    def forward(self, images, texts, attention_mask=None):
        """Encode one batch of images and texts and fuse them.

        Args:
            images: Pixel values as produced by the processor (`inputs["pixel_values"]`).
            texts: Either integer token ids of shape (batch, tokens), which are embedded with
                `token_embedding`, or a float tensor of shape (batch, tokens, embed_size) that
                is fed to the text encoder as is.
            attention_mask: Optional Hugging Face-style mask of shape (batch, tokens) in which
                non-zero marks a real token and zero marks padding.

        Returns:
            Tuple `(image_embeddings, text_embeddings, combined_features, class_logits)`. The
            first three have `embed_size` features per token; `class_logits` holds 10 logits
            for every token of the fused sequence.
        """
        # Process images through the vision branch of the base KOSMOS model
        image_embeddings = self.image_projection(self._encode_images(images))

        # Token ids cannot go into the Transformer encoder directly; embed them first.
        if not torch.is_floating_point(texts):
            texts = self.token_embedding(texts)

        # `src_key_padding_mask` uses the opposite convention to Hugging Face masks:
        # True means "padding, ignore this position".
        padding_mask = None
        if attention_mask is not None:
            padding_mask = attention_mask == 0

        # Process texts through the new text encoder
        text_outputs = self.text_encoder(texts, src_key_padding_mask=padding_mask)
        text_embeddings = self.text_projection(text_outputs)

        # Apply multi-modal attention
        combined_features = self.multi_modal_attention(text_embeddings, image_embeddings)

        # Layer normalization and dropout
        combined_features = self.layer_norm(combined_features)
        combined_features = self.dropout(combined_features)

        # Optional classification
        class_logits = self.classification_head(combined_features)

        return image_embeddings, text_embeddings, combined_features, class_logits

    def contrastive_loss(self, image_embeddings, text_embeddings, temperature=0.07):
        """Image-to-text contrastive loss over a batch of matching pairs.

        Row i of `image_embeddings` is treated as the positive for row i of `text_embeddings`;
        all other rows of the batch act as negatives.

        Args:
            image_embeddings: (batch, dim) embeddings, or (batch, tokens, dim) per-token
                embeddings as returned by `forward`, which are mean-pooled over tokens first.
            text_embeddings: Same as `image_embeddings`, for the text side.
            temperature: Divisor applied to the cosine similarities.

        Returns:
            Scalar tensor: mean cross-entropy of the similarity matrix against its diagonal.
        """
        # Per-token embeddings are reduced to one vector per sample; `.T` on a 3-D tensor would
        # reverse every axis and break the similarity matmul.
        image_embeddings = F.normalize(self._pool_tokens(image_embeddings), dim=-1)
        text_embeddings = F.normalize(self._pool_tokens(text_embeddings), dim=-1)

        # Calculate cosine similarity
        logits = torch.matmul(image_embeddings, text_embeddings.t()) / temperature

        # The matching text for image i sits at column i.
        labels = torch.arange(logits.size(0), device=logits.device)

        return F.cross_entropy(logits, labels)

# Initialize the extended model
extended_model = ExtendedKOSMOS("microsoft/kosmos-2-patch14-224").to(device)
# This cell only runs inference, so switch off dropout in the newly added layers.
extended_model.eval()

# Load the processor
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

# Example input preparation. The snowman picture from the KOSMOS-2 model card is used so the
# cell runs as is; replace the URL with your own image.
image_url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png"
# Bound the wait and raise on HTTP errors so a failed download is reported here instead of
# as a PIL decoding error.
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
texts = ["This is a description of the image."]  # Example text input

# Process inputs
inputs = processor(images=image, text=texts, return_tensors="pt").to(device)

# Forward pass. The processor's keys (`pixel_values`, `input_ids`, ...) do not match the
# parameter names of `ExtendedKOSMOS.forward`, so they are mapped explicitly rather than
# unpacked with `**inputs`.
with torch.no_grad():
    image_embeddings, text_embeddings, combined_features, class_logits = extended_model(
        images=inputs["pixel_values"],
        texts=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )

# Further processing (e.g., calculating contrastive loss, classification, etc.)


In [None]:
def contrastive_loss(image_embeddings, text_embeddings, temperature=0.07):
    """Image-to-text contrastive loss over a batch of matching pairs.

    Row i of `image_embeddings` is treated as the positive for row i of `text_embeddings`;
    all other rows of the batch act as negatives. Same computation as
    `ExtendedKOSMOS.contrastive_loss`, available as a plain function.

    Args:
        image_embeddings: (batch, dim) embeddings, or (batch, tokens, dim) per-token
            embeddings, which are mean-pooled over the token axis first.
        text_embeddings: Same as `image_embeddings`, for the text side.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: mean cross-entropy of the similarity matrix against its diagonal.
    """
    # Reduce per-token embeddings to one vector per sample; `.T` on a 3-D tensor would reverse
    # every axis and break the similarity matmul below.
    if image_embeddings.dim() == 3:
        image_embeddings = image_embeddings.mean(dim=1)
    if text_embeddings.dim() == 3:
        text_embeddings = text_embeddings.mean(dim=1)

    # Normalize the embeddings so the dot product below is a cosine similarity
    image_embeddings = nn.functional.normalize(image_embeddings, dim=-1)
    text_embeddings = nn.functional.normalize(text_embeddings, dim=-1)

    # Calculate cosine similarity
    logits = torch.matmul(image_embeddings, text_embeddings.t()) / temperature

    # The matching text for image i sits at column i.
    labels = torch.arange(logits.size(0), device=logits.device)

    # Calculate the loss
    return nn.functional.cross_entropy(logits, labels)