In [1]:
import os
import re
import io
import torch
import random
import warnings
import numpy as np
import polars as pl
import pandas as pd
from torch import nn
from typing import cast
from pathlib import Path
from peft import LoraConfig
from huggingface_hub import notebook_login
from colpali_engine.loss import ColbertPairwiseCELoss
from colpali_engine.models import ColPali, ColPaliProcessor
from datasets import load_from_disk, Dataset, Features, Image, Value
from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer
from colpali_engine.utils.torch_utils import get_torch_device, tear_down_torch
from colpali_engine.collators.visual_retriever_collator import VisualRetrieverCollator
from transformers import BitsAndBytesConfig, TrainerCallback, TrainingArguments, EarlyStoppingCallback, logging

# `logging` here is transformers' logging module (see the import above):
# restrict it to errors, and silence Python warnings, to keep cell output short.
logging.set_verbosity_error()
warnings.simplefilter('ignore')
# Hugging Face Hub login. NOTE(review): new_session=False presumably reuses an
# existing login instead of prompting again (the recorded output reports the
# user is already logged in) - confirm against the huggingface_hub docs.
notebook_login(new_session=False)

User is already logged in.


In [2]:
# Make the project root the working directory so that the `src` package and the
# relative `data/` and `models/` paths used below resolve.
# The notebook sits one level below the root, so step up once - but only when
# the root's `src` directory is not already visible. Without this guard every
# re-run of the cell would climb one more directory level.
wd = os.getcwd() if os.path.isdir('src') else os.path.dirname(os.getcwd())
os.chdir(wd)
print(f'path: {wd}')

path: /home/dgarieck23/VLMs/tunnel_vision


In [3]:
from src.utils.utils import *

### Helpers

In [4]:
def print_trainable_parameters(model: nn.Module) -> None:
    '''
    Print the number of trainable parameters in the model.

    Args:
        model: module whose parameters are counted.

    Prints a single line with the trainable parameter count, the total
    parameter count and the trainable percentage. A model without any
    parameters is reported as 0.0% instead of raising ZeroDivisionError.
    '''
    all_param = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: a parameter-free module would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f'trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {trainable_pct}'
    )

### Leveraging GPU for Performance
To optimize performance, we'll use GPU acceleration if available.

In [5]:
# NOTE(review): get_device is not defined or imported by name in this notebook;
# it presumably comes from the star import of src.utils.utils. The recorded
# output shows it reports GPU availability and the GPU name.
device = get_device()

GPU is available
GPU name: NVIDIA GeForce RTX 4090 Laptop GPU


#### Set Seed
To improve reproducibility and comparability between runs, we fix the random seed of every library involved.

In [6]:
seed = 42

# Seed every random number generator in play, in this order: Python's `random`,
# NumPy, PyTorch (CPU), the current CUDA device, then all CUDA devices.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### Choose a quantization strategy

Quantization not only reduces the size of model weights but also reduces the memory consumption during inference. Below is an example of how memory usage changes with different quantization strategies:

- **fp16 (16-bit floating point)**: Each parameter requires 16 bits (2 bytes).
- **8-bit Quantization**: Each parameter requires 8 bits (1 byte).
- **4-bit Quantization**: Each parameter requires 4 bits (0.5 bytes).

#### Example:

Assume a model has **100 million parameters**.

| Precision         | Bits per Parameter | Total Memory Usage (MB) |
|-------------------|--------------------|-------------------------|
| **fp16 (16-bit)** | 16 bits (2 bytes)  | 100M * 2 bytes = 200 MB |
| **8-bit**         | 8 bits (1 byte)    | 100M * 1 byte = 100 MB  |
| **4-bit**         | 4 bits (0.5 bytes) | 100M * 0.5 bytes = 50 MB|

As seen from the table, moving from fp16 to 8-bit cuts the memory usage in half, and further reducing to 4-bit halves the memory usage again. Quantization thus plays a significant role in reducing the memory footprint of models, which is especially useful for deploying large models on devices with limited memory.

In [7]:
# Quantization strategy: '4bit', '8bit', or None to load without quantization.
quant_strat = '4bit'

# Quantized loading is only supported here on a CUDA GPU. Compare the device
# *type* rather than the exact string 'cuda:0', so that 'cuda' or 'cuda:1'
# (given as a string or a torch.device) are accepted as well.
if quant_strat and torch.device(device).type != 'cuda':
    raise ValueError('This notebook requires a CUDA GPU to use quantization.')

In [8]:
# Build the bitsandbytes configuration matching the selected strategy.
if quant_strat == '4bit':
    # 4-bit weights using the NF4 quantization type, computing in bfloat16
    bnb_config = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type='nf4',
        load_in_4bit=True,
    )
elif quant_strat == '8bit':
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
elif quant_strat is None:
    # no quantization requested
    bnb_config = None
else:
    raise ValueError(f'Invalid quantization strategy: {quant_strat}')

### Loading the pre-trained model

ColPali is an advanced retrieval model that combines two powerful components:
- **Col**: Derived from ColBERT (Contextualized Late Interaction over BERT), a retrieval method designed for efficient and accurate document search by embedding queries and document passages into high-dimensional spaces and applying a late interaction mechanism for matching.
- **Pali**: Refers to a Vision Language Model (VLM) named PaliGemma, which is used to process visual content such as images or screenshots from documents.

##### How ColPali Works

Instead of relying on complex text-based PDF parsing, ColPali simplifies the indexing process by using **screenshots** of PDF pages. These visual representations are embedded using the Vision Language Model (PaliGemma). When a query is provided at inference time, ColPali embeds the query and matches it to the most similar document pages using the **contextualized late interaction mechanism** introduced by ColBERT. This enables ColPali to efficiently retrieve the most relevant document pages based on the visual and contextual similarity of the content.


<div>
<p style="text-align: left;">Architecture of ColPali illustrated</p>
<img src="../misc/colpali_arch.png" width="400" height="300"/>
<p style="text-align: left;"><em>Source: https://x.com/helloiamleonie</em></p>
</div>

In [9]:
# pre-trained model name (with LoRA adapter); reused below to load both the
# model and the processor
model_name = 'vidore/colpali-v1.2'

# get the LoRA config from the pretrained model; in this notebook it is only
# read later to check that base_model_name_or_path is set
lora_config = LoraConfig.from_pretrained(model_name)

In [10]:
# load the model with the loaded pre-trained adapter
# - quantization_config: the bitsandbytes config built above (None = no quantization)
# - torch_dtype: bfloat16, the same dtype used as the 4-bit compute dtype
# - device_map: place the model on the device selected earlier
# cast() only informs type checkers; it does nothing at runtime.
model = cast(
    ColPali,
    ColPali.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map=device,
    ),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Fail early when the checkpoint came without an active adapter: the
# fine-tuning below unfreezes parameters by name (e.g. 'lora').
if not model.active_adapters():
    raise ValueError('No adapter found in the model.')

### Deciding the Fine-tuning strategy

When adapting a pre-trained model to a new task, we need to decide on the fine-tuning strategy. Two common approaches are fine-tuning the bias terms or using LoRA (Low-Rank Adaptation), both of which allow us to modify the model without adjusting all of its parameters.

- **Fine-tuning the bias** involves updating only the bias terms of the model, which are small, additional parameters added to each neuron. This method is lightweight and quick, as only a small portion of the model’s parameters are changed. It’s useful for slight adaptations to the model when working with a related task or dataset. However, this strategy has limited flexibility since only biases are adjusted, which may not be sufficient for more complex tasks.

- **LoRA (Low-Rank Adaptation)** is a more flexible fine-tuning approach that introduces small, trainable matrices (adapters) into specific layers of the model. These matrices enable significant adjustments without modifying the core weights of the pre-trained model. LoRA provides a balance between preserving the model’s original knowledge and allowing it to learn new tasks, while still being memory-efficient compared to full fine-tuning.

The choice of strategy depends on the specific task and resources. If minimal adjustment is required, bias fine-tuning is a quick and effective option. For tasks needing more model flexibility while maintaining the original pre-trained knowledge, LoRA provides a more powerful fine-tuning solution.

In [12]:
# Unfreeze the parameters selected for fine-tuning. Selection is a substring
# match on the parameter name.
params_name = 'lora' # either bias or lora

unfrozen = 0
for name, param in model.named_parameters():
    if params_name in name:
        param.requires_grad = True
        unfrozen += 1

# A selector that matches nothing (e.g. a typo) would otherwise go unnoticed
# until training; report it here with a clear message instead.
if unfrozen == 0:
    raise ValueError(f'No parameter name contains {params_name!r}; no parameters were unfrozen.')

print_trainable_parameters(model)

trainable params: 39,292,928 || all params: 1,766,287,216 || trainable%: 2.224605808390791


### Load the processor and the collator

In [13]:
# Sanity check on the adapter config. Note that the processor below is loaded
# from `model_name`, not from lora_config.base_model_name_or_path; this check
# only validates that the field is present.
if lora_config.base_model_name_or_path is None:
    raise ValueError('Base model name or path is required in the LoRA config.')

# cast() is for type checkers only.
processor = cast(
    ColPaliProcessor,
    ColPaliProcessor.from_pretrained(model_name),
)
# The collator is handed to the trainer as its data_collator further down.
collator = VisualRetrieverCollator(processor=processor)

### Load the dataset

In [14]:
# Path is relative to the working directory set at the top of the notebook.
# The recorded output below shows a DatasetDict with 'train' and 'test' splits.
dataset = load_from_disk('data/processed/annual reports')

In [15]:
# Display the splits, their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'query', 'answer'],
        num_rows: 3428
    })
    test: Dataset({
        features: ['image', 'query', 'answer'],
        num_rows: 1470
    })
})

### Define training args

In [16]:
# Directory used as the trainer's output_dir; created (with parents) if missing.
checkpoints_dir = Path('models/checkpoints')
checkpoints_dir.mkdir(exist_ok=True, parents=True)

In [17]:
# Training configuration.
# - Effective train batch size per device is 4 * 4 = 16
#   (per_device_train_batch_size * gradient_accumulation_steps).
# - Evaluation runs every 100 steps, logging every 20, checkpoints every 200;
#   only the most recent checkpoint is kept (save_total_limit=1).
# - report_to=[] disables external experiment trackers.
# - seed is passed explicitly so the Trainer uses the same seed as the
#   "Set Seed" cell; otherwise it falls back to its own default, and changing
#   the notebook's seed would have no effect on training.
training_args = TrainingArguments(
    output_dir=str(checkpoints_dir),
    hub_model_id=None,
    overwrite_output_dir=True,
    num_train_epochs=1.5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,
    eval_strategy='steps',
    save_steps=200,
    logging_steps=20,
    eval_steps=100,
    warmup_steps=100,
    learning_rate=5e-5,
    save_total_limit=1,
    report_to=[],
    seed=seed,
)

### Create the trainer

In [18]:
class EvaluateFirstStepCallback(TrainerCallback):
    '''
    Trigger an evaluation right after the very first training step.

    This adds an early point to the evaluation learning curve, ahead of the
    first regularly scheduled evaluation.
    '''

    def on_step_end(self, args, state, control, **kwargs):
        # Every step other than the first is left untouched.
        if state.global_step != 1:
            return
        control.should_evaluate = True

In [19]:
# Assemble the contrastive trainer: the 'train' split for optimization, the
# 'test' split for evaluation, the visual-retriever collator for batching and
# the ColBERT pairwise cross-entropy loss.
# NOTE(review): is_vision_model=True presumably tells the trainer that batches
# contain image inputs - confirm against colpali_engine.
trainer = ContrastiveTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    args=training_args,
    data_collator=collator,
    loss_func=ColbertPairwiseCELoss(),
    is_vision_model=True
)

In [20]:
# NOTE(review): presumably needed so the Trainer keeps the raw dataset columns
# (image / query) that the collator consumes - confirm.
trainer.args.remove_unused_columns = False
# Also evaluate after the first step (see EvaluateFirstStepCallback).
trainer.add_callback(EvaluateFirstStepCallback())

### Evaluate the model before training

In [None]:
# Baseline: evaluate on the trainer's eval_dataset (the 'test' split) before any training.
eval_results = trainer.evaluate()

### Fine-tune the model

Unfortunately, I ran out of memory while trying to fine-tune the model locally. To work around this limitation, I attempted fine-tuning only the bias parameters, but this did not yield significant results for my task. However, the good news is that you can still use this notebook to train your own model by following the steps outlined here. If you have access to more powerful hardware, you could explore more comprehensive fine-tuning strategies, such as using LoRA or even full fine-tuning, depending on your needs and resources.


<div>
<p style="text-align: left;">Large Language Models vs GPUs</p>
<img src="../misc/llm_vs_gpu.png" width="400" height="300"/>
<p style="text-align: left;"><em>Source: Tom & Jerry</em></p>
</div>


In [None]:
# train the model with the arguments, callbacks and datasets configured above
train_results = trainer.train()

In [29]:
# store the model adapter; the next section loads it back from this same path
trainer.save_model('models/colpali_ar_finetuned_4bit')

### Load your fine-tuned model

In [None]:
# Path written by trainer.save_model above.
adapter_name = 'models/colpali_ar_finetuned_4bit'

# Same loading arguments as the initial load, pointing at the fine-tuned
# adapter directory. This rebinds `model`.
# NOTE(review): `trainer` still references the previous model object, so both
# copies may stay resident on the GPU - confirm that memory allows this, or
# release the trainer first.
model = cast(
    ColPali,
    ColPali.from_pretrained(
        adapter_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map=device,
    ),
)