In [None]:
# Mount Google Drive so the repository and outputs persist between sessions
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import warnings


def resolve_hf_token():
    """Return the Hugging Face token without ever storing it in the notebook.

    Lookup order:
      1. the ``TOKEN_HUGGINGFACE`` environment variable, if already set;
      2. the Colab secret named ``HF_TOKEN`` (only when running inside Colab).

    Returns:
        The token string, or ``None`` when no source provides one.
    """
    token = os.environ.get('TOKEN_HUGGINGFACE')
    if token:
        return token
    try:
        # Only importable inside a Colab runtime.
        from google.colab import userdata
    except ImportError:
        return None
    # Errors from Colab (missing secret, access not granted) are left to
    # propagate so the problem is visible in the cell output.
    return userdata.get('HF_TOKEN')


# SECURITY: a literal token used to be hard-coded in this cell. Any token that
# was ever committed in a notebook must be considered leaked and revoked at
# https://huggingface.co/settings/tokens.
_hf_token = resolve_hf_token()
if _hf_token:
    os.environ['TOKEN_HUGGINGFACE'] = _hf_token
else:
    warnings.warn(
        "TOKEN_HUGGINGFACE is not set: define it in the environment or add an "
        "HF_TOKEN Colab secret before logging in to the Hugging Face Hub."
    )

# Directory (relative to the repository root) where datasets are downloaded.
os.environ['DOWNLOAD_DATA_DIR'] = 'app/datasets'

In [None]:
import os

# Definir la ruta al directorio donde deseas almacenar el repositorio en Google Drive
base_path = '/content/drive/MyDrive/github'
repo_name = 'multimodal-llm-finetuning'
repo_path = os.path.join(base_path, repo_name)
repo_url = 'https://github.com/fathooo/multimodal-llm-finetuning.git'
branch_name = 'develop'

# Crear el directorio base si no existe
if not os.path.exists(base_path):
    os.makedirs(base_path)
    print(f"Directorio {base_path} creado.")
else:
    print(f"Directorio {base_path} ya existe.")

import os

if not os.path.exists(repo_path):
    print("Repositorio no encontrado en Google Drive, clonando el repositorio...")
    # Clonar el repositorio si no existe y hacer checkout a la rama develop
    !git clone {repo_url} {repo_path}
    %cd {repo_path}
    !git checkout {branch_name}
    !git pull origin {branch_name}
else:
    print("Repositorio ya existe en Google Drive. Haciendo checkout a la rama develop...")
    %cd {repo_path}
    !git checkout {branch_name}

# Confirma que estás en la rama correcta
!git branch

# Cambiar al directorio del repositorio
os.chdir(repo_path)
print("Directorio actual:", os.getcwd())

Directorio /content/drive/MyDrive/github ya existe.
Repositorio ya existe en Google Drive. Haciendo checkout a la rama develop...
/content/drive/MyDrive/github/multimodal-llm-finetuning
Already on 'develop'
Your branch is up to date with 'origin/develop'.
* [32mdevelop[m
  main[m
Directorio actual: /content/drive/MyDrive/github/multimodal-llm-finetuning


In [None]:
# Celda para instalar torch y CUDA desde el índice extra
!pip install torch --extra-index-url https://download.pytorch.org/whl/cu118
# Celda para instalar dotenv
!pip install python-dotenv
# Celda para instalar colorama
!pip install colorama
# Celda para instalar transformers
!pip install transformers==4.44.0
# Celda para instalar datasets
!pip install datasets

!pip install -U bitsandbytes transformers accelerate
!pip install huggingface_hub
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install pyarrow

!pip freeze > requirements-colab.txt
print("Archivo requirements-colab.txt creado.")


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118
Collecting transformers
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.44.0
    Uninstalling transformers-4.44.0:
      Successfully uninstalled transformers-4.44.0
Successfully installed transformers-4.44.2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Archivo requirements-colab.txt creado.


In [None]:
# Example usage of the project's helper modules.
# `os` is imported by an earlier cell of this notebook.
from app.config.config import TOKEN_HUGGINGFACE, get_device, print_device_info
from app.data.download import download_datasets
from app.config.dataset_info import dataset_info_list
from app.utils.huggingface import hf_login
from app.utils.colorama_utils import initialize_colorama

# Start Colorama
initialize_colorama()

# Pick the device and print its details
device = get_device()
print_device_info()

# Download the configured datasets into the directory named by the environment
download_dir = os.getenv('DOWNLOAD_DATA_DIR')
download_datasets(dataset_info_list, download_dir)

# Log in to the Hugging Face Hub
hf_login(TOKEN_HUGGINGFACE)


PyTorch Version: 2.4.0+cu121
CUDA Version: 12.1
CUDA available: True
Current device: 0
alpaca_spanish.parquet already exists in app/datasets
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import Trainer, TrainingArguments, ChameleonProcessor, ChameleonForConditionalGeneration, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

MODEL_NAME = "facebook/chameleon-7b"
USE_PEFT = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# GPUs below compute capability 8.0 (for example the Tesla T4) have no native
# bfloat16 support, and training later in this notebook runs with fp16=True,
# so fall back to float16 there and keep bfloat16 on newer hardware.
if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

# 4-bit NF4 quantisation so the 7B checkpoint fits on a single GPU
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# device_map={"": 0} places the whole model on GPU 0
model = ChameleonForConditionalGeneration.from_pretrained(MODEL_NAME, quantization_config=quantization_config, device_map={"": 0})

if USE_PEFT:
    print("Preparando el modelo para el entrenamiento con PEFT (k-bit)")
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

print("Modelo cargado en el dispositivo")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Preparando el modelo para el entrenamiento con PEFT (k-bit)
Modelo cargado en el dispositivo


In [None]:
# Display the model's repr as the cell output
model

In [None]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    The line printed has the form
    ``trainable params: T || all params: A || trainable%: P``.
    A model without parameters reports ``0.0`` percent instead of raising
    ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        if type(param).__name__ == "Params4bit":
            # bitsandbytes packs two 4-bit weights into every byte of storage,
            # so numel() under-reports the logical size. Same correction PEFT
            # applies in its own parameter counter.
            element_size = getattr(param, "element_size", None)
            bytes_per_element = element_size() if callable(element_size) else 1
            count *= 2 * bytes_per_element
        all_param += count
        if param.requires_grad:
            trainable_params += count
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyper-parameters gathered in one mapping so they are easy to tweak
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["lm_head"],  # other parts of the model can be tried here
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
config = LoraConfig(**lora_settings)

# Wrap the model with the adapters and report how much of it will be trained
model = get_peft_model(model, config)
print_trainable_parameters(model)


trainable params: 557056 || all params: 3805644800 || trainable%: 0.014637624614887863


In [None]:
import pandas as pd
from datasets import Dataset
from colorama import Fore

def tokenize_function(examples, tokenizer, max_length=512):
    """Build causal-LM training examples from instruction/input/output columns.

    Each example becomes one sequence, prompt tokens followed by answer tokens
    (plus EOS when the tokenizer defines one), padded on the right to
    ``max_length``. ``labels`` mirrors ``input_ids`` but holds ``-100`` on the
    prompt and on padding, so the loss is computed on the answer only.

    The previous version tokenized prompt and answer as two separately padded
    sequences, which is the encoder-decoder layout: for a decoder-only model
    the labels did not line up with ``input_ids`` and padding was not masked.

    Args:
        examples: batch mapping with "instruction", "input" and "output" lists.
        tokenizer: callable tokenizer returning ``{"input_ids": [...]}`` and
            exposing ``pad_token_id`` / ``eos_token_id``.
        max_length: fixed length of every returned sequence.

    Returns:
        dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length``-long lists.

    Raises:
        ValueError: if the tokenizer defines neither a pad nor an EOS token.
    """
    prompts = [
        f"Instrucción: {instr}. Entrada: {inp}" if inp else f"Instrucción: {instr}"
        for instr, inp in zip(examples["instruction"], examples["input"])
    ]

    # Special tokens go on the prompt only; the answer is a continuation of it.
    prompt_ids = tokenizer(
        prompts, add_special_tokens=True, truncation=True, max_length=max_length
    )["input_ids"]
    answer_ids = tokenizer(
        list(examples["output"]), add_special_tokens=False, truncation=True, max_length=max_length
    )["input_ids"]

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
    if pad_id is None:
        raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id")

    ignore_index = -100  # label value skipped by the loss
    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for p_ids, a_ids in zip(prompt_ids, answer_ids):
        target = list(a_ids) + ([eos_id] if eos_id is not None else [])
        # A prompt that already fills max_length leaves no supervised tokens;
        # such an example is kept but contributes nothing to the loss.
        ids = (list(p_ids) + target)[:max_length]
        labels = ([ignore_index] * len(p_ids) + target)[:max_length]
        pad_len = max_length - len(ids)
        batch["input_ids"].append(ids + [pad_id] * pad_len)
        batch["attention_mask"].append([1] * len(ids) + [0] * pad_len)
        batch["labels"].append(labels + [ignore_index] * pad_len)
    return batch


def split_dataset(df, tokenizer, split_ratio=0.8):
    """Tokenize a DataFrame and cut it, in order, into train and validation parts.

    Args:
        df: DataFrame that must contain 'instruction', 'input' and 'output'.
        tokenizer: tokenizer forwarded to ``tokenize_function``.
        split_ratio: fraction of rows (taken from the start) used for training.

    Returns:
        ``(train_dataset, val_dataset)``, or ``(None, None)`` when a required
        column is missing.
    """
    required_columns = ('instruction', 'input', 'output')
    if any(column not in df.columns for column in required_columns):
        print_message("DataFrame does not contain the required columns for splitting", Fore.RED)
        return None, None

    def _tokenize_batch(batch):
        return tokenize_function(batch, tokenizer)

    tokenized = Dataset.from_pandas(df).map(_tokenize_batch, batched=True)

    # No shuffling: the first rows train, the remainder validates
    total = len(tokenized)
    cut = int(total * split_ratio)
    return tokenized.select(range(cut)), tokenized.select(range(cut, total))


In [None]:
from app.config.environments import ENV_DOWNLOAD_DATA_DIR
# NOTE(review): this rebinds MODEL_NAME, which the model-loading cell set to
# "facebook/chameleon-7b" — confirm both values name the same checkpoint.
from app.config.config import MODEL_NAME
from app.utils.colorama_utils import initialize_colorama, print_message
from app.data.preprocess import load_dataset
from colorama import Fore
from transformers import ChameleonProcessor, Trainer, TrainingArguments, DataCollatorForSeq2Seq


# Use the first configured dataset; `dataset_info_list` and `os` come from
# earlier cells of this notebook.
file_info = dataset_info_list[0]
file_path = os.path.join(ENV_DOWNLOAD_DATA_DIR, file_info['file_name'])
if not os.path.exists(file_path):
    print_message("File does not exist.", Fore.RED)
    # Stop here: continuing would only fail later, inside load_dataset, with
    # a less helpful error.
    raise FileNotFoundError(file_path)

# Initialize the processor to get its tokenizer
processor = ChameleonProcessor.from_pretrained(MODEL_NAME)
tokenizer = processor.tokenizer

# Load the local dataset file and split it 80/20 into train and validation
df = load_dataset(file_path, file_info['format'])
train_dataset, val_dataset = split_dataset(df, tokenizer, 0.8)

Some kwargs in processor config are unused and will not have any effect: image_token, image_seq_length. 


Preview of the first 3 rows of the dataset:
                    instruction                        input                                   output
            ¿Qué significa DNA?                              DNA significa ácido desoxirribonucleico.
¿Cuál es la capital de Francia?                                       La capital de Francia es París.
   Identifica el que no encaja. Twitter, Instagram, Telegram                                 Telegram



Map:   0%|          | 0/51942 [00:00<?, ? examples/s]



In [None]:
# Report how many examples ended up in each split
for split_label, split_data in (("training", train_dataset), ("validation", val_dataset)):
    print(f"Number of {split_label} samples: {len(split_data)}")


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq

if train_dataset and val_dataset:
    print("Dataset loaded successfully.")
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

    # Training configuration
    training_args = TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10000,
        learning_rate=1e-4,
        fp16=True,  # half-precision training
        logging_steps=10,
        # `eval_strategy` replaces the deprecated `evaluation_strategy`.
        eval_strategy="steps",
        # Without an explicit eval_steps the Trainer falls back to
        # logging_steps, i.e. a pass over the whole validation split every
        # 10 optimisation steps.
        eval_steps=500,
        save_steps=100,  # save a checkpoint every 100 steps
        output_dir="outputs",
        optim="paged_adamw_8bit"
    )

    # Collator that batches the examples and pads labels with the ignore index
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

    # Build the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,  # validated during training
        data_collator=data_collator,
        tokenizer=tokenizer
    )

    # Train the model
    trainer.train()
else:
    print_message("Failed to load the dataset for training. Please check if the file exists and is accessible.", Fore.RED)

Dataset loaded successfully.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


In [None]:
import torch

# Query CUDA availability once and reuse the answer for the device name
cuda_ok = torch.cuda.is_available()
print("CUDA disponible:", cuda_ok)
print("Número de GPUs disponibles:", torch.cuda.device_count())
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "No GPU available"
print("Nombre de la GPU:", gpu_name)


CUDA disponible: True
Número de GPUs disponibles: 1
Nombre de la GPU: Tesla T4
