In [2]:
# Mount Google Drive at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os

# Export the notebook's settings as process environment variables in one call.
# NOTE: the Hugging Face value is a placeholder; supply your own token locally
# and never commit a real one to the notebook.
os.environ.update({
    'TOKEN_HUGGINGFACE': '<Code get HuggingFace>',
    'DOWNLOAD_DATA_DIR': 'app/datasets',
})

In [4]:
import os

# Location of the repository on Google Drive, plus the remote and branch to use.
base_path = '/content/drive/MyDrive/github'
repo_name = 'multimodal-llm-finetuning'
repo_url = 'https://github.com/fathooo/multimodal-llm-finetuning.git'
branch_name = 'develop'
repo_path = os.path.join(base_path, repo_name)

# Ensure the base directory exists and report whether it had to be created.
if os.path.exists(base_path):
    print(f"Directorio {base_path} ya existe.")
else:
    os.makedirs(base_path)
    print(f"Directorio {base_path} creado.")

import os

if not os.path.exists(repo_path):
    print("Repositorio no encontrado en Google Drive, clonando el repositorio...")
    # Clonar el repositorio si no existe y hacer checkout a la rama develop
    !git clone {repo_url} {repo_path}
    %cd {repo_path}
    !git checkout {branch_name}
    !git pull origin {branch_name}
else:
    print("Repositorio ya existe en Google Drive. Haciendo checkout a la rama develop...")
    %cd {repo_path}
    !git checkout {branch_name}

# Confirma que estás en la rama correcta
!git branch

# Cambiar al directorio del repositorio
os.chdir(repo_path)
print("Directorio actual:", os.getcwd())

Directorio /content/drive/MyDrive/github ya existe.
Repositorio ya existe en Google Drive. Haciendo checkout a la rama develop...
/content/drive/MyDrive/github/multimodal-llm-finetuning
M	requirements.txt
Already on 'develop'
Your branch is up to date with 'origin/develop'.
* [32mdevelop[m
  main[m
Directorio actual: /content/drive/MyDrive/github/multimodal-llm-finetuning


In [8]:
# Celda para instalar torch y CUDA desde el índice extra
!pip install torch --extra-index-url https://download.pytorch.org/whl/cu118
# Celda para instalar dotenv
!pip install python-dotenv
# Celda para instalar colorama
!pip install colorama
# Celda para instalar transformers
!pip install transformers==4.44.0
# Celda para instalar datasets
!pip install datasets

!pip freeze > requirements-colab.txt
print("Archivo requirements-colab.txt creado.")


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118
Archivo requirements-colab.txt creado.


In [14]:
# Example usage of the project's helper modules.
from app.config.config import (
    print_device_info,
    get_device,
    TOKEN_HUGGINGFACE,
)
from app.data.download import download_datasets
from app.config.dataset_info import dataset_info_list
from app.utils.huggingface import hf_login

# Initialize Colorama
from app.utils.colorama_utils import initialize_colorama

initialize_colorama()

# Select the device and print the device information
device = get_device()
print_device_info()

# Download the datasets into the directory named by DOWNLOAD_DATA_DIR
download_dir = os.getenv('DOWNLOAD_DATA_DIR')
download_datasets(dataset_info_list, download_dir)

# Log in to Hugging Face
hf_login(TOKEN_HUGGINGFACE)


PyTorch Version: 2.3.1+cu121
CUDA Version: 12.1
CUDA available: True
Current device: 0
alpaca_spanish.parquet already exists in app/datasets
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [15]:
from app.config.environments import ENV_DOWNLOAD_DATA_DIR
from app.config.config import MODEL_NAME
from app.utils.colorama_utils import initialize_colorama, print_message
from app.data.preprocess import load_dataset, split_dataset
from colorama import Fore  # needed for the colored error message below
from transformers import ChameleonProcessor


# Locate the first dataset listed in dataset_info_list inside the download directory.
file_info = dataset_info_list[0]
file_path = os.path.join(ENV_DOWNLOAD_DATA_DIR, file_info['file_name'])

# Stop the cell when the file is missing: previously only a message was printed
# and execution continued into the processor load and load_dataset() with a
# path already known not to exist.
if not os.path.exists(file_path):
    print_message("File does not exist.", Fore.RED)
    raise FileNotFoundError(f"Dataset file not found: {file_path}")

# Initialize the processor to obtain its tokenizer
processor = ChameleonProcessor.from_pretrained(MODEL_NAME)
tokenizer = processor.tokenizer

# Load the local dataset and split it into training and validation sets
df = load_dataset(file_path, file_info['format'])
train_dataset, val_dataset = split_dataset(df, tokenizer, device)


Some kwargs in processor config are unused and will not have any effect: image_token, image_seq_length. 


Preview of the first 3 rows of the dataset:
                    instruction                        input                                   output
            ¿Qué significa DNA?                              DNA significa ácido desoxirribonucleico.
¿Cuál es la capital de Francia?                                       La capital de Francia es París.
   Identifica el que no encaja. Twitter, Instagram, Telegram                                 Telegram



In [16]:
# Report how many samples ended up in each split.
train_count = len(train_dataset)
print(f"Number of training samples: {train_count}")
val_count = len(val_dataset)
print(f"Number of validation samples: {val_count}")

Number of training samples: 41553
Number of validation samples: 10389


In [17]:
# Report whether both splits are available before moving on to training.
if not train_dataset or not val_dataset:
    print_message("Failed to load the dataset for training. Please check if the file exists and is accessible.", Fore.RED)
else:
    print("Dataset loaded successfully.")
    # Training is currently disabled; the intended next steps are kept for reference:
    # model = ChameleonForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16).to(device)
    # train_model(model, train_dataset, processor, device, LEARNING_RATE, MAX_STEPS_PER_EPOCH, EPOCHS)

Dataset loaded successfully.
