In [None]:

# Desinstalar versiones potencialmente incompatibles
!pip uninstall -y transformers
!pip uninstall -y accelerate
!pip install -q --upgrade pip

# Instalar versiones correctas y compatibles con MiniCPM-V-2_6
!pip install --no-cache-dir \
  torch==2.1.2 \
  torchvision==0.16.2 \
  transformers==4.40.0 \
  Pillow==10.1.0 \
  sentencepiece==0.1.99 \
  decord==0.6.0 \
  accelerate==0.30.1 \
  bitsandbytes==0.43.1 \
  peft==0.10.0

# Reinicia el entorno manualmente después de esto para evitar errores residuales:
# Menu: Entorno de ejecución → Reiniciar entorno


In [None]:
# Mount Google Drive: the images and the JSON annotation file are stored there.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Authenticate against the Hugging Face Hub.
#
# The access token is read from the HF_TOKEN environment variable instead of
# being pasted into the notebook, so it cannot leak through a shared or
# committed copy of this file. When the variable is not set, fall back to the
# interactive login prompt. Only one of the two login paths runs.
import os

from huggingface_hub import login, notebook_login

hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

In [None]:
# Importación de las librerías necesarias
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModel
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import json
import os

# Hub identifier shared by the model and its tokenizer.
MODEL_ID = 'openbmb/MiniCPM-V-2_6'

# Load the checkpoint in bfloat16 with SDPA attention, move it to the GPU and
# put it in evaluation mode.
model = AutoModel.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    attn_implementation='sdpa',
    torch_dtype=torch.bfloat16,
)
model = model.cuda()
model.eval()

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# LoRA configuration: rank-8 adapters (alpha 16, dropout 0.1, no bias terms)
# injected into the modules named "q_proj" and "v_proj".
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# prepare_model_for_kbit_training is intentionally NOT called: the model is
# loaded with torch_dtype=torch.bfloat16, not in 8-bit/4-bit. On such a model
# that helper upcasts the bf16 weights to fp32, which only inflates GPU memory
# use. get_peft_model leaves just the LoRA adapter weights trainable.
model = get_peft_model(model, peft_config)

# Load the dataset annotations. The file is iterated below as a sequence of
# records that each provide the keys "imgname", "query" and "label".
with open("/content/drive/MyDrive/train_human.json", encoding="utf-8") as f:
    data = json.load(f)

# Build the in-memory training examples: one dict per record holding the RGB
# image, the question and the expected answer.
image_dir = "/content/drive/MyDrive/png"
examples = []
missing_images = 0
for item in data:
    img_path = os.path.join(image_dir, item["imgname"])
    if not os.path.exists(img_path):
        # Records without an image are skipped, but counted so the omission
        # is visible instead of silent.
        missing_images += 1
        continue
    # convert() returns an independent, fully loaded image, so the underlying
    # file can be closed as soon as the with-block exits.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")
    examples.append({"image": image, "question": item["query"], "answer": item["label"]})

if missing_images:
    print(f"Skipped {missing_images} record(s) whose image was not found in {image_dir}")

# Training loop.
NUM_EPOCHS = 3

# Hand only the trainable parameters to the optimizer; frozen weights never
# receive gradients, so there is no reason for the optimizer to track them.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# The model was put in eval mode right after loading; switch to training mode
# so that dropout layers (including lora_dropout) are active while training.
model.train()
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for sample in examples:
        prompt = f"<image>\nQuestion: {sample['question']}\nAnswer:"
        msgs = [{"role": "user", "content": [sample["image"], sample["question"]]}]
        labels = tokenizer(sample["answer"], return_tensors="pt")["input_ids"].cuda()
        # TODO(review): the loop stops here. No forward pass, loss computation,
        # loss.backward(), optimizer.step() or optimizer.zero_grad() is
        # performed, so no weights are updated yet; `prompt`, `msgs`, `labels`
        # and `loss_fn` are prepared but never consumed.

# Restore evaluation mode so later inference cells run without dropout.
model.eval()

In [None]:
from PIL import Image
import torch

# Location of the external test image, loaded as an RGB picture.
image_path = "/content/PRUEBAS/engañosa6.png"
image = Image.open(image_path).convert("RGB")

# Prompt sent to the model together with the image. The literal is split
# across lines only for readability; the resulting string is unchanged.
question = (
    "First, describe what you see: axes, ranges, labels, colors, scale. "
    "Second, critically analyze each element: Are there truncated axes? "
    "Misleading colors? Is there a logarithmic scale without warning?"
    "Finally, based on the above, is the graph misleading? "
    "Detail which elements are misleading. "
)

# Single user turn whose content is the image followed by the question.
user_turn = {"role": "user", "content": [image, question]}
msgs = [user_turn]

# Run inference with gradient tracking disabled.
with torch.no_grad():
    response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)

# Show the question and the model's answer.
print("Pregunta:", question)
print("Respuesta del modelo:", response)
