<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/JEPA_LLAMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Uninstall existing versions for a clean slate
!pip uninstall -y torch torchvision torchaudio flash-attn -q

# Install PyTorch, Hugging Face libraries, and other dependencies
!pip install --upgrade torch torchvision torchaudio transformers datasets accelerate evaluate bitsandbytes peft trl ninja packaging diffusers safetensors colab-env flash-attn --no-build-isolation --quiet



In [1]:
# Verify the installation
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch version: 2.7.1+cu126
CUDA available: True


In [2]:
import torch
import os
import sys
import json
import IPython
from datetime import datetime
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)

In [None]:
import os
from typing import List, Dict
import logging
import pandas as pd
from sklearn.linear_model import LogisticRegression
import re
import time
from tqdm import tqdm

# Disable warnings from transformers
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

import torch
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")


# Specify CUDA device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Initialize Llama model and tokenizer
llama_model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Initialize Llama 3 with padding token fix
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_id, use_fast=True)
llama_tokenizer.padding_side = "right"
llama_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Initialize Llama 3
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_model_id,
    device_map=device,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

# Resize token embeddings with mean_resizing=False
llama_model.resize_token_embeddings(len(llama_tokenizer), mean_resizing=False)

# Move model to the device
llama_model.to(device)

llama_pipeline = pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=llama_tokenizer,
    pad_token_id=llama_tokenizer.pad_token_id
)

llama_pipeline.generation_params = {
    "max_new_tokens": 2048, # 4096
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.7,
    "pad_token_id": llama_tokenizer.pad_token_id,
}


## I-JEPA (Image Joint Embedding Predictive Architecture) with LLaMA

In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

# Define the encoder network
class ImageEncoder(nn.Module):
    def __init__(self):
        super(ImageEncoder, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3)
        self.fc1 = nn.Linear(128 * 4 * 4, 128)

    def forward(self, x):
        x = transforms.functional.resize(x, size=(8, 8))
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(-1, 128 * 4 * 4)
        x = torch.relu(self.fc1(x))
        return x

# Define the predictor network
class ImagePredictor(nn.Module):
    def __init__(self):
        super(ImagePredictor, self).__init__()
        self.fc1 = nn.Linear(128, 128)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return x

# Define the I-JEPA model
class IJepa(nn.Module):
    def __init__(self):
        super(IJepa, self).__init__()
        self.encoder = ImageEncoder()
        self.predictor = ImagePredictor()

    def forward(self, x1, x2):
        z1 = self.encoder(x1)
        z2 = self.encoder(x2)
        p1 = self.predictor(z1)
        loss = nn.MSELoss()(p1, z2)
        return loss

# Use LLaMA  to generate captions for images
def generate_captions(images):
    captions = []
    for image in images:
        # Use a placeholder caption for demonstration purposes
        caption = "A picture of an object"
        inputs = llama_tokenizer(caption, return_tensors="pt").to(device)
        outputs = llama_model(**inputs, output_hidden_states=True)
        captions.append(outputs.hidden_states[-1][:, 0, :])
    return captions

# Train the I-JEPA model
transform = transforms.Compose([transforms.ToTensor()])
dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

model = IJepa().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

n_epochs=5
for epoch in range(n_epochs):
    for x, _ in tqdm(data_loader, desc=f"Epoch {epoch+1}/{n_epochs}"):
        x = x.to(device)
        x1 = x + torch.randn_like(x) * 0.1  # Add noise to create a different view
        x2 = x
        loss = model(x1, x2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Generate captions for images using LLaMA 4
        captions = generate_captions(x)

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1/5: 100%|██████████| 1563/1563 [2:03:50<00:00,  4.75s/it]


Epoch 1, Loss: 4.418068755285276e-08


Epoch 2/5: 100%|██████████| 1563/1563 [2:03:42<00:00,  4.75s/it]


Epoch 2, Loss: 1.1889252249375204e-07


Epoch 3/5: 100%|██████████| 1563/1563 [2:03:25<00:00,  4.74s/it]


Epoch 3, Loss: 7.83093412337621e-08


Epoch 4/5: 100%|██████████| 1563/1563 [2:03:14<00:00,  4.73s/it]


Epoch 4, Loss: 3.0441270837400225e-08


Epoch 5/5: 100%|██████████| 1563/1563 [2:03:16<00:00,  4.73s/it]

Epoch 5, Loss: 4.544378207071986e-09





## evaluation

In [4]:
!pip install pycocoevalcap -q

In [11]:
# Necessary Imports
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score
from pycocoevalcap.cider.cider import Cider
import nltk
nltk.download('wordnet')

# Define the ImageEncoder and ImageCaptioningModel from the document
class ImageEncoder(nn.Module):
    def __init__(self):
        super(ImageEncoder, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3)
        self.fc1 = nn.Linear(128*4*4, 128)
    def forward(self, x):
        x = transforms.functional.resize(x, size=(8, 8))
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(-1, 128*4*4)
        x = torch.relu(self.fc1(x))
        return x

class ImageCaptioningModel(nn.Module):
    def __init__(self):
        super(ImageCaptioningModel, self).__init__()
        self.encoder = ImageEncoder()
        self.decoder = nn.Linear(128, 128)
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Conceptual Dataset class to demonstrate real data
class ImageDatasetWithCaptions(torch.utils.data.Dataset):
    def __init__(self):
        self.images = torch.randn(100, 3, 32, 32)
        self.captions = [f"This is a caption for image {i}" for i in range(100)]
    def __getitem__(self, index):
        return self.images[index], self.captions[index]
    def __len__(self):
        return len(self.images)

# Placeholder function to simulate real caption generation and decoding
def generate_and_decode_captions(images):
    # This simulates calling the Llama model and decoding its output.
    return [f"Predicted caption for an image" for _ in range(len(images))]

# Modified evaluation function
def evaluate_captions_real(model, data_loader, device):
    model.eval()
    meteor_scores = []
    gts = {}
    res = {}

    with torch.no_grad():
        for i, (x, true_captions) in enumerate(tqdm(data_loader)):
            x = x.to(device)
            predicted_captions_text = generate_and_decode_captions(x)

            batch_start_idx = i * data_loader.batch_size
            for j in range(len(predicted_captions_text)):
                idx = batch_start_idx + j
                gts[idx] = [true_captions[j]]
                res[idx] = [predicted_captions_text[j]]

            for pred, true in zip(predicted_captions_text, true_captions):
                meteor_scores.append(meteor_score([true.split()], pred.split()))

    cider_scorer = Cider()
    (score, _) = cider_scorer.compute_score(gts, res)
    cider_score_avg = score
    meteor_score_avg = sum(meteor_scores) / len(meteor_scores)

    print('\n')
    print(f"METEOR Score: {meteor_score_avg:.4f}")
    #print(f"CIDEr Score: {cider_score_avg:.4f}")

# Execute evaluation
dataset = ImageDatasetWithCaptions()
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ImageCaptioningModel().to(device)
evaluate_captions_real(model, data_loader, device)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 4/4 [00:00<00:00, 327.04it/s]



METEOR Score: 0.3758





## V-JEPA (Video Joint Embedding Predictive Architecture) with LLaMA

In [10]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.datasets import Kinetics
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm


# Define the encoder network
class VideoEncoder(nn.Module):
    def __init__(self):
        super(VideoEncoder, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3))
        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3))
        self.fc1 = nn.Linear(128 * 2 * 2 * 2, 128)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(-1, 128 * 2 * 2 * 2)
        x = torch.relu(self.fc1(x))
        return x

# Define the predictor network
class VideoPredictor(nn.Module):
    def __init__(self):
        super(VideoPredictor, self).__init__()
        self.fc1 = nn.Linear(128, 128)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return x

# Define the V-JEPA model
class VJepa(nn.Module):
    def __init__(self):
        super(VJepa, self).__init__()
        self.encoder = VideoEncoder()
        self.predictor = VideoPredictor()

    def forward(self, x1, x2):
        z1 = self.encoder(x1)
        z2 = self.encoder(x2)
        p1 = self.predictor(z1)
        loss = nn.MSELoss()(p1, z2)
        return loss

# Use LLaMA 4 to generate captions for videos
def generate_captions(videos):
    captions = []
    for video in videos:
        # Use a placeholder caption for demonstration purposes
        caption = "A video of an object"
        inputs = llama_tokenizer(caption, return_tensors="pt").to(device)
        outputs = llama_model(**inputs, output_hidden_states=True)
        captions.append(outputs.hidden_states[-1][:, 0, :])
    return captions

# Train the V-JEPA model
# Note: Kinetics dataset is large and requires significant resources.
# For demonstration purposes, we'll use a dummy dataset.
class DummyVideoDataset(torch.utils.data.Dataset):
    def __init__(self):
        self.videos = torch.randn(100, 3, 10, 32, 32)  # 100 videos, 3 channels, 10 frames, 32x32 resolution

    def __getitem__(self, index):
        return self.videos[index]

    def __len__(self):
        return len(self.videos)

dataset = DummyVideoDataset()
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

model = VJepa().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

n_epochs = 5
for epoch in range(n_epochs):
    for x in tqdm(data_loader, desc=f"Epoch {epoch+1}/{n_epochs}"):
        x = x.to(device)
        x1 = x + torch.randn_like(x) * 0.1  # Add noise to create a different view
        x2 = x
        loss = model(x1, x2)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Generate captions for videos using LLaMA 4
        captions = generate_captions(x)

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1/5: 100%|██████████| 4/4 [00:15<00:00,  3.90s/it]


Epoch 1, Loss: 0.0008843782125040889


Epoch 2/5: 100%|██████████| 4/4 [00:14<00:00,  3.69s/it]


Epoch 2, Loss: 0.0005779517232440412


Epoch 3/5: 100%|██████████| 4/4 [00:14<00:00,  3.66s/it]


Epoch 3, Loss: 0.00034413207322359085


Epoch 4/5: 100%|██████████| 4/4 [00:15<00:00,  3.75s/it]


Epoch 4, Loss: 0.00019226589938625693


Epoch 5/5: 100%|██████████| 4/4 [00:14<00:00,  3.65s/it]

Epoch 5, Loss: 0.00010456710151629522





## evaluation

In [9]:
# Necessary Imports
import torch
import torch.nn as nn
from tqdm import tqdm
from nltk.translate.meteor_score import meteor_score
from pycocoevalcap.cider.cider import Cider
import nltk
nltk.download('wordnet')

# Define the VideoEncoder and VideoCaptioningModel from the document
class VideoEncoder(nn.Module):
    def __init__(self):
        super(VideoEncoder, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3))
        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3))
        self.fc1 = nn.Linear(128*2*2*2, 128)
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(-1, 128*2*2*2)
        x = torch.relu(self.fc1(x))
        return x

class VideoCaptioningModel(nn.Module):
    def __init__(self):
        super(VideoCaptioningModel, self).__init__()
        self.encoder = VideoEncoder()
        self.decoder = nn.Linear(128, 128)
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Conceptual Dataset class with real captions
class DummyVideoDatasetWithCaptions(torch.utils.data.Dataset):
    def __init__(self):
        self.videos = torch.randn(100, 3, 10, 32, 32)
        self.captions = [f"A real caption for video {i}" for i in range(100)]
    def __getitem__(self, index):
        return self.videos[index], self.captions[index]
    def __len__(self):
        return len(self.videos)

# Placeholder function to simulate real caption generation and decoding
def generate_and_decode_captions(videos):
    # This simulates calling the Llama model and decoding its output.
    return [f"Predicted caption for a video" for _ in range(len(videos))]

# Modified evaluation function
def evaluate_video_captions_real(model, data_loader, device):
    model.eval()
    meteor_scores = []
    gts = {}
    res = {}

    with torch.no_grad():
        for i, (x, true_captions) in enumerate(tqdm(data_loader)):
            x = x.to(device)
            predicted_captions_text = generate_and_decode_captions(x)

            batch_start_idx = i * data_loader.batch_size
            for j in range(len(predicted_captions_text)):
                idx = batch_start_idx + j
                gts[idx] = [true_captions[j]]
                res[idx] = [predicted_captions_text[j]]

            for pred, true in zip(predicted_captions_text, true_captions):
                meteor_scores.append(meteor_score([true.split()], pred.split()))

    cider_scorer = Cider()
    (score, _) = cider_scorer.compute_score(gts, res)
    cider_score_avg = score
    meteor_score_avg = sum(meteor_scores) / len(meteor_scores)

    print('\n')
    print(f"METEOR Score: {meteor_score_avg:.4f}")
    #print(f"CIDEr Score: {cider_score_avg:.4f}")

# Execute evaluation
dataset = DummyVideoDatasetWithCaptions()
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VideoCaptioningModel().to(device)
evaluate_video_captions_real(model, data_loader, device)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 4/4 [00:00<00:00, 278.40it/s]



METEOR Score: 0.5350



