In [1]:
import os

# Descargar y descomprimir el dataset CLEVR si no existe
if not os.path.exists('/home/gperaltag/mi_entorno/'):
    print("Descargando el dataset CLEVR...")
    # Descargar el archivo .zip en la carpeta /content/
    !wget -nc https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip -P /home/gperaltag/mi_entorno/

    print("Descomprimiendo el dataset...")
    # Descomprimir el archivo .zip en /content/
    !unzip -qn /home/gperaltag/mi_entorno/CLEVR_v1.0.zip -d /home/gperaltag/mi_entorno/

    print("Eliminando el archivo ZIP...")
    # Eliminar el archivo ZIP para liberar espacio
    os.remove('/home/gperaltag/mi_entorno/CLEVR_v1.0.zip')
else:
    print("El dataset CLEVR ya está disponible en /home/gperaltag/mi_entorno/CLEVR_v1.0.")


!wget -O CLEVR_sample_10_000.tar.gz "https://www.dropbox.com/scl/fi/l87paducv9fmmkiymjfaw/CLEVR_sample_10_000.tar.gz?rlkey=pulv7wktvocicadq6miv3rbbz&st=lmq0fikc&dl=1"

!mkdir -p data
!tar -xzvf CLEVR_sample_10_000.tar.gz -C data/

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_064565.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_065742.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_066629.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_066637.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_066682.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_067882.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_069109.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/images/trainA/CLEVR_trainA_069431.png
mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2/

In [2]:
import os
import json
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Deshabilitar cuDNN para evitar problemas con GRU
torch.backends.cudnn.enabled = False

# Definir las rutas exactas basadas en la estructura de archivos proporcionada
clevr_original_dir = '/home/gperaltag/mi_entorno/CLEVR_v1.0'
clevr_systematic_10k_dir = '/content/data/mnt/ialabnas/datasets/CLEVR_CoGenT_v1.0/CLEVR_sample_10_000/sample_2'

# ------------------- Función para Calcular max_length -------------------

def calculate_max_length(question_file):
    with open(question_file, 'r') as f:
        questions = json.load(f)['questions']
    return max(len(q['question']) for q in questions)

# ------------------- Definición del Modelo -------------------

class FiLMResBlock(nn.Module):
    def __init__(self, in_channels):
        super(FiLMResBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size=1)
        self.conv2 = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()

    def forward(self, x, gamma, beta):
        out = self.conv1(x)
        out = self.conv2(self.relu(out))
        gamma = gamma.unsqueeze(2).unsqueeze(3)
        beta = beta.unsqueeze(2).unsqueeze(3)
        out = gamma * out + beta
        return self.relu(out)

class FiLMNetwork(nn.Module):
    def __init__(self, num_classes, max_length, use_resnet=True):
        super(FiLMNetwork, self).__init__()
        resnet = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
        for param in resnet.parameters():
            param.requires_grad = False
        self.feature_extractor = nn.Sequential(
            resnet.conv1,
            resnet.bn1,
            resnet.relu,
            resnet.maxpool,
            resnet.layer1,
            resnet.layer2,
            resnet.layer3
        )
        self.reduce_channels = nn.Conv2d(1024 + 2, 128, kernel_size=1)
        self.resblocks = nn.ModuleList([FiLMResBlock(128) for _ in range(4)])
        self.word_embedding = nn.Embedding(256, 200)
        self.gru = nn.GRU(input_size=200, hidden_size=4096, batch_first=True)
        self.affine_gamma = nn.Linear(4096, 128)
        self.affine_beta = nn.Linear(4096, 128)
        self.classifier = nn.Sequential(
            nn.Conv2d(128, 512, kernel_size=1),
            nn.ReLU(),
            nn.AdaptiveMaxPool2d(1),
            nn.Flatten(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, num_classes)
        )

    def add_coordinates(self, features):
        batch_size, _, height, width = features.size()
        x_coords = torch.linspace(-1, 1, steps=width).repeat(batch_size, height, 1).unsqueeze(1)
        y_coords = torch.linspace(-1, 1, steps=height).repeat(batch_size, width, 1).unsqueeze(1).transpose(2, 3)
        coords = torch.cat([x_coords, y_coords], dim=1).to(features.device)
        return torch.cat([features, coords], dim=1)

    def forward(self, images, questions):
        features = self.feature_extractor(images)
        features = self.add_coordinates(features)
        features = self.reduce_channels(features)
        question_embedding = self.gru_forward(questions)
        gamma = self.affine_gamma(question_embedding)
        beta = self.affine_beta(question_embedding)
        for resblock in self.resblocks:
            features = resblock(features, gamma, beta)
        return self.classifier(features)

    def gru_forward(self, questions):
        embedded = self.word_embedding(questions)
        _, hidden = self.gru(embedded)
        return hidden.squeeze(0)

# ------------------- Definición del Dataset -------------------

class CLEVRDataset(Dataset):
    def __init__(self, image_dir, question_file, max_length, transform=None):
        self.image_dir = image_dir
        with open(question_file, 'r') as f:
            self.questions = json.load(f)['questions']
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        image_path = os.path.join(self.image_dir, question['image_filename'])

        # Verificar si el archivo existe antes de cargarlo
        if not os.path.exists(image_path):
            print(f"Advertencia: Imagen {image_path} no encontrada, omitiendo.")
            return None

        image = Image.open(image_path).convert('RGB')
        question_text = question['question'][:self.max_length]
        question_tensor = torch.tensor([ord(c) for c in question_text], dtype=torch.long)
        if question_tensor.size(0) < self.max_length:
            question_tensor = torch.nn.functional.pad(question_tensor, (0, self.max_length - question_tensor.size(0)))
        label = int(question['answer']) if question['answer'].isdigit() else 0
        if self.transform:
            image = self.transform(image)
        return image, question_tensor, label

def collate_fn(batch):
    # Filtrar entradas None (imágenes faltantes)
    batch = [item for item in batch if item is not None]
    images, questions, labels = zip(*batch)
    images = torch.stack(images)
    questions = torch.stack(questions)
    labels = torch.tensor(labels)
    return images, questions, labels

# ------------------- Función de Evaluación -------------------

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for images, questions, labels in tqdm(dataloader, desc="Evaluando"):
            images, questions, labels = images.to(device), questions.to(device), labels.to(device)
            outputs = model(images, questions)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds) * 100  # Convertir a porcentaje
    print(f"Pérdida media: {avg_loss:.4f}, Precisión: {accuracy:.2f}%")
    return avg_loss, accuracy

# ------------------- Configuración y Carga del Modelo -------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Calcular el max_length más grande entre los conjuntos de validación
max_length_original = calculate_max_length(os.path.join(clevr_original_dir, 'questions/CLEVR_val_questions.json'))
max_length_valA_10k = calculate_max_length(os.path.join(clevr_systematic_10k_dir, 'questions/CLEVR_valA_questions.json'))
max_length_valB_10k = calculate_max_length(os.path.join(clevr_systematic_10k_dir, 'questions/CLEVR_valB_questions.json'))

# Usar el mayor valor de max_length entre los conjuntos
max_length = max(max_length_original, max_length_valA_10k, max_length_valB_10k)
print(f"Max length usado en el modelo: {max_length}")

# Inicializar el modelo con el max_length calculado
model = FiLMNetwork(num_classes=28, max_length=max_length).to(device)

# Descargar y cargar los pesos del modelo finetuneado
checkpoint_path = "/home/gperaltag/mi_entorno/best_film_model_finetuning.pth"
!wget -O {checkpoint_path} "https://www.dropbox.com/scl/fi/ups40j3l3nrjpcg5ias3d/best_film_model_finetuning.pth?rlkey=8rsffbjpi7wsh75rgu3qfx8k4&st=pxq24ntj&dl=1"
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
print("Pesos del modelo cargados exitosamente.")

# ------------------- Preparación de los Datasets y Transformaciones -------------------

transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])

# Definir datasets con el max_length calculado y rutas ajustadas
val_data_original = CLEVRDataset(
    os.path.join(clevr_original_dir, 'images/val'),
    os.path.join(clevr_original_dir, 'questions/CLEVR_val_questions.json'),
    max_length=max_length,
    transform=transform
)

val_dataA_10k = CLEVRDataset(
    os.path.join(clevr_systematic_10k_dir, 'images/valA'),
    os.path.join(clevr_systematic_10k_dir, 'questions/CLEVR_valA_questions.json'),
    max_length=max_length,
    transform=transform
)

val_dataB_10k = CLEVRDataset(
    os.path.join(clevr_systematic_10k_dir, 'images/valB'),
    os.path.join(clevr_systematic_10k_dir, 'questions/CLEVR_valB_questions.json'),
    max_length=max_length,
    transform=transform
)

val_loader_original = DataLoader(val_data_original, batch_size=64, collate_fn=collate_fn, num_workers=32)
val_loaderA_10k = DataLoader(val_dataA_10k, batch_size=64, collate_fn=collate_fn, num_workers=32)
val_loaderB_10k = DataLoader(val_dataB_10k, batch_size=64, collate_fn=collate_fn, num_workers=32)

# ------------------- Evaluación -------------------

criterion = nn.CrossEntropyLoss()

print("Evaluación en el conjunto de validación original de CLEVR:")
evaluate(model, val_loader_original, criterion, device)

print("Evaluación en el conjunto de validación ValA (10k) de CLEVR fine-tuning:")
evaluate(model, val_loaderA_10k, criterion, device)

print("Evaluación en el conjunto de validación ValB (10k) de CLEVR fine-tuning:")
evaluate(model, val_loaderB_10k, criterion, device)


Max length usado en el modelo: 208


Downloading: "https://download.pytorch.org/models/resnet101-cd907fc2.pth" to /root/.cache/torch/hub/checkpoints/resnet101-cd907fc2.pth
100%|██████████| 171M/171M [00:00<00:00, 227MB/s]


--2024-11-11 18:39:22--  https://www.dropbox.com/scl/fi/ups40j3l3nrjpcg5ias3d/best_film_model_finetuning.pth?rlkey=8rsffbjpi7wsh75rgu3qfx8k4&st=pxq24ntj&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc50d9c2fcb77b4a2912862197f7.dl.dropboxusercontent.com/cd/0/inline/CeNlOa7ARKLFuQzV9R99CoVLyKo_jRwCxgeSIeZnbuT3xHnvm1L4zJVzbgHUVhQ69zq9iK3EXzbl_LxP1ofL2jvBoz6QJLSdWuCjgANrHejxO2AA6PKS_wJz3ONqeHGj3aqZqiPwT8gES4WBcFyj-R1B/file?dl=1# [following]
--2024-11-11 18:39:23--  https://uc50d9c2fcb77b4a2912862197f7.dl.dropboxusercontent.com/cd/0/inline/CeNlOa7ARKLFuQzV9R99CoVLyKo_jRwCxgeSIeZnbuT3xHnvm1L4zJVzbgHUVhQ69zq9iK3EXzbl_LxP1ofL2jvBoz6QJLSdWuCjgANrHejxO2AA6PKS_wJz3ONqeHGj3aqZqiPwT8gES4WBcFyj-R1B/file?dl=1
Resolving uc50d9c2fcb77b4a2912862197f7.dl.dropboxusercontent.com (uc50d9c2fcb77b4a2912862197f7.dl.

  model.load_state_dict(torch.load(checkpoint_path, map_location=device))


Pesos del modelo cargados exitosamente.




Evaluación en el conjunto de validación original de CLEVR:


Evaluando: 100%|██████████| 2344/2344 [13:28<00:00,  2.90it/s]


Pérdida media: 0.3187, Precisión: 85.65%
Evaluación en el conjunto de validación ValA (10k) de CLEVR fine-tuning:


Evaluando: 100%|██████████| 313/313 [01:51<00:00,  2.80it/s]


Pérdida media: 0.2923, Precisión: 87.70%
Evaluación en el conjunto de validación ValB (10k) de CLEVR fine-tuning:


Evaluando: 100%|██████████| 313/313 [01:51<00:00,  2.81it/s]

Pérdida media: 0.2937, Precisión: 87.69%





(0.29367180309070945, 87.685)