In [2]:
!pip install torch torchvision transformers scikit-learn pandas

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [4]:
!pip install kaggle
from google.colab import files
files.upload()
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c 'petfinder-adoption-prediction'



Saving kaggle.json to kaggle.json
rm: cannot remove '/root/.kaggle': No such file or directory
Downloading petfinder-adoption-prediction.zip to /content
100% 1.93G/1.94G [00:12<00:00, 225MB/s]
100% 1.94G/1.94G [00:12<00:00, 165MB/s]


In [5]:
import zipfile

# Archive fetched by the Kaggle CLI.
data_zip_path = "/content/petfinder-adoption-prediction.zip"

# Unpack every member into /content, the directory that holds the archive.
with zipfile.ZipFile(data_zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path="/content")

In [7]:
from google.colab import drive
import os

drive.mount('/content/drive')

# Make sure the output directory for processed data exists on Google Drive.
# exist_ok=True creates it when missing and does nothing when it is already
# there, so no separate existence check is needed.
processed_data_dir = '/content/drive/MyDrive/ProcessedData'
os.makedirs(processed_data_dir, exist_ok=True)

Mounted at /content/drive


In [6]:
import pandas as pd
from PIL import Image
from transformers import AutoModel, AutoImageProcessor
from sklearn.decomposition import KernelPCA
from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np
import os
from tqdm import tqdm
import torch.nn as nn
# Adaptive average pooling with a fixed 1 x 100 output; extract_features
# applies it to the model's last hidden state.
adaptive_avgpool = nn.AdaptiveAvgPool2d((1, 100))

# Load the PetFinder tabular data.
train_data = pd.read_csv('/content/train/train.csv')
test_data = pd.read_csv('/content/test/test.csv')

image_folder = "/content/train_images"
image_test = "/content/test_images"


def first_image_by_pet(image_names):
    """Map each pet id to the first of its image file names.

    The pet id is the part of the file name before the first '-'. The first
    name encountered for a pet wins, so pass the names in sorted order to get
    a deterministic choice.

    image_names: iterable of image file names.

    Returns a dict ``{pet_id: file_name}``.
    """
    first_by_pet = {}
    for image_name in image_names:
        # setdefault only stores the name when the pet has no entry yet.
        first_by_pet.setdefault(image_name.split('-')[0], image_name)
    return first_by_pet


# Sorted listings of the image directories.
images = sorted(os.listdir(image_folder))
test_images = sorted(os.listdir(image_test))

# Group images by PetID, keeping only the first photo of each pet.
first_image_by_pet_id = first_image_by_pet(images)
test_first_image_by_pet_id = first_image_by_pet(test_images)

# Add an image_path column holding the path of each pet's first photo (None when the pet has no photo).

def get_image_path(pet_id, folder, images_dict):
    """Return the path of a pet's image, or None when the pet has no entry.

    pet_id: key looked up in ``images_dict``.
    folder: directory joined in front of the image file name.
    images_dict: mapping from pet id to image file name.
    """
    if pet_id in images_dict:
        return os.path.join(folder, images_dict[pet_id])
    return None


# Attach the path of each pet's first photo; pets without a photo get None.
train_data['image_path'] = [
    get_image_path(pid, image_folder, first_image_by_pet_id)
    for pid in train_data['PetID']
]
test_data['image_path'] = [
    get_image_path(pid, image_test, test_first_image_by_pet_id)
    for pid in test_data['PetID']
]

# No debug subsetting is applied here: every row of both tables is kept
# unless it lacks an image.

# Drop the rows whose image_path is missing.
train_has_image = train_data['image_path'].notna()
test_has_image = test_data['image_path'].notna()
train_data = train_data.loc[train_has_image]
test_data = test_data.loc[test_has_image]


class PetFinderDataset(Dataset):
    """Dataset yielding the preprocessed image and the PetID of each row.

    dataframe: table with an 'image_path' and a 'PetID' column; rows are
        addressed by position (``iloc``).
    processor: callable invoked as ``processor(images=..., return_tensors="pt")``
        whose result exposes ``.items()`` over tensors.
    """

    def __init__(self, dataframe, processor):
        self.dataframe = dataframe
        self.processor = processor

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(inputs, pet_id)`` for the row at position ``idx``."""
        # Look the row up once and reuse it for both columns.
        row = self.dataframe.iloc[idx]
        # Close the file handle as soon as the pixels are decoded; convert()
        # returns a new image, so it stays usable after the with-block.
        with Image.open(row['image_path']) as img:
            image = img.convert("RGB")
        inputs = self.processor(images=image, return_tensors="pt")
        # Remove only the leading axis so the DataLoader can re-batch the items
        # (assumes the processor returns a batch axis of size 1 for a single
        # image — TODO confirm). A bare squeeze() would also drop any other
        # size-1 axis.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        return inputs, row['PetID']

# Checkpoint shared by the image processor and the model, and the loader batch size.
MODEL_NAME = 'facebook/dinov2-small'
BATCH_SIZE = 8

processor = AutoImageProcessor.from_pretrained(MODEL_NAME)

train_dataset = PetFinderDataset(train_data, processor)
test_dataset = PetFinderDataset(test_data, processor)

# shuffle=False: batches follow the row order of the dataframes.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pretrained model, move it to the device and switch it to eval mode.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

def extract_features(data_loader):
    """Run the model over ``data_loader`` and collect one feature row per item.

    Uses the module-level ``model``, ``device`` and ``adaptive_avgpool``.

    data_loader: iterable yielding ``(inputs, pet_ids)`` batches, where
        ``inputs`` is a dict of tensors passed to the model as keyword
        arguments and ``pet_ids`` is a sequence of ids for the batch.

    Returns ``(features, pet_ids)``: a 2-D NumPy array with one row per item,
    and the list of ids in the same order.

    Raises ValueError (from ``np.concatenate``) when the loader yields no batch.
    """
    features = []
    pet_ids = []
    with torch.no_grad():
        for inputs, pet_id in tqdm(data_loader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            # assumes last_hidden_state is (batch, tokens, hidden) — TODO confirm
            img_features = outputs.last_hidden_state
            # For a 3-D input the pool treats the first axis as channels and
            # returns (batch, 1, 100). Squeeze only that middle axis: a bare
            # squeeze() would also drop the batch axis when the final batch
            # holds a single image, and the resulting 1-D array could not be
            # concatenated with the 2-D arrays of the other batches.
            img_features = adaptive_avgpool(img_features).squeeze(1)
            img_features = img_features.cpu().numpy()
            features.append(img_features)
            pet_ids.extend(pet_id)
            # Release per-batch tensors and cached GPU memory.
            del inputs
            del outputs
            torch.cuda.empty_cache()
    features = np.concatenate(features, axis=0)
    return features, pet_ids

# Extract features for the training set.
train_features, train_pet_ids = extract_features(train_loader)




100%|██████████| 1832/1832 [04:29<00:00,  6.79it/s]


In [5]:
# Sanity check: print the shape of the extracted training feature matrix.
print(train_features.shape)

(10, 100)


In [7]:
# One column name per column of the training feature matrix.
train_feature_columns = [f'ImageFeature{i}' for i in range(train_features.shape[1])]

# Build the table with PetID as the first column, followed by the features.
train_features_df = pd.DataFrame(train_features, columns=train_feature_columns)
train_features_df.insert(0, 'PetID', train_pet_ids)

# Persist the table to Google Drive without the index column.
train_features_df.to_csv(
    '/content/drive/MyDrive/ProcessedData/train_reduced_features.csv',
    index=False,
)

print("Características de entrenamiento generadas y guardadas en 'train_reduced_features.csv'")

Características de entrenamiento generadas y guardadas en 'train_reduced_features.csv'


In [8]:
# Extract features for the test set.
test_features, test_pet_ids = extract_features(test_loader)

# One column name per column of the test feature matrix.
test_feature_columns = [f'ImageFeature{i}' for i in range(test_features.shape[1])]

# Build the table with PetID as the first column, followed by the features.
test_features_df = pd.DataFrame(test_features, columns=test_feature_columns)
test_features_df.insert(0, 'PetID', test_pet_ids)

# Persist the table to Google Drive without the index column.
test_features_df.to_csv(
    '/content/drive/MyDrive/ProcessedData/test_reduced_features.csv',
    index=False,
)

print("Características de prueba generadas y guardadas en 'test_reduced_features.csv'")

100%|██████████| 483/483 [00:59<00:00,  8.06it/s]


Características de prueba generadas y guardadas en 'test_reduced_features.csv'
