In [1]:
%cd ~/school/dl-2024-mp/

/home/nanahoshi/school/dl-2024-mp


In [42]:
import numpy as np
import torch

from torchvision.transforms import (
    CenterCrop,
    Compose,
    Lambda,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

from torch.utils.data import DataLoader


In [38]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [7]:
cifar_test_nolabels = unpickle('explore/cifar_test_nolabels.pkl')
cifar_test_nolabels

{b'data': array([[133, 136, 136, ..., 226, 225, 224],
        [160, 177, 176, ...,  89,  89,  88],
        [255, 255, 255, ..., 211, 213, 215],
        ...,
        [ 29,  29,  45, ..., 156, 155, 154],
        [124, 123, 126, ...,  49,  49,  51],
        [255, 255, 255, ..., 250, 251, 255]], dtype=uint8),
 b'ids': array([   0,    1,    2, ..., 9997, 9998, 9999])}

In [8]:
# list all ../OUTPUTS
import os
models = os.listdir('OUTPUTS')
models

['cifnet-18-tiny-lr0.01-bottleneck',
 'cifnet-18-apple--lr0.001--tbtest',
 'cifnet-18-cucumber-nope--lr0.001--d4_256',
 'cifnet-18-tiny-lr0.01-attention',
 'cifnet-18-tiny_attention--lr0.001--prenorm',
 'cifnet-18-banana--lr0.001--sigmoid_4d_128-128-64-64',
 'cifnet-18-cucumber--lr0.001--d4_256',
 'cifnet-18-cucumber--lr0.001--d4_256_nope',
 'cifnet-18-tiny-lr0.1-baseline']

In [9]:
cifar_test_nolabels[b'data'].shape

(10000, 3072)

In [43]:
cifar_test_nolabels[b'data']

array([[133, 136, 136, ..., 226, 225, 224],
       [160, 177, 176, ...,  89,  89,  88],
       [255, 255, 255, ..., 211, 213, 215],
       ...,
       [ 29,  29,  45, ..., 156, 155, 154],
       [124, 123, 126, ...,  49,  49,  51],
       [255, 255, 255, ..., 250, 251, 255]], dtype=uint8)

In [45]:
import pandas as pd

# Assuming 'data' can be represented in tabular form
df = pd.DataFrame(cifar_test_nolabels[b'data'])
csv_file_path = 'explore/cifar_test_nolabels.csv'
df.to_csv(csv_file_path, index=False)


## PREPROCESS

In [47]:
import torch
import torchvision

from transformers import AutoImageProcessor
from datasets import load_dataset

from models.cifnet import (
    CifNetForImageClassification,
    CifNetConfig,
)

def setup_model(model_name_or_path, ):
    model_config = CifNetConfig.from_pretrained(model_name_or_path)
    model = CifNetForImageClassification(model_config)
    # model = CifNetForImageClassification.from_pretrained(model_name_or_path, config=model_config)
    processor = AutoImageProcessor.from_pretrained(model_name_or_path)
    return model, processor

model, image_processor = setup_model('OUTPUTS/cifnet-18-cucumber--lr0.001--d4_256', )

In [48]:
# Preprocessing the datasets

# Define torchvision transforms to be applied to each image.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])
normalize = (
    Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
    if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std")
    else Lambda(lambda x: x)
)
train_transforms = Compose(
    [
        RandomResizedCrop(size),
        RandomHorizontalFlip(),
        ToTensor(),
        normalize,
    ]
)
val_transforms = Compose(
    [
        Resize(size),
        CenterCrop(size),
        ToTensor(),
        normalize,
    ]
)

dataset_name = "explore/cifar_test_nolabels.csv"
dataset = load_dataset(dataset_name)
max_train_samples = 1000
seed = 42
val_transforms = Compose(
    [
        Resize(size),
        CenterCrop(size),
        ToTensor(),
        normalize,
    ]
)

# Set the validation transforms
test_dataset = dataset
per_device_test_batch_size = 4
num_workers = 4
# DataLoaders creation:
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example[label_column_name] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

# test_dataloader = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=per_device_test_batch_size, num_workers=num_workers,)
test_dataloader = DataLoader(test_dataset, batch_size=per_device_test_batch_size, num_workers=num_workers,)


DatasetNotFoundError: Dataset 'explore/cifar_test_nolabels.csv' doesn't exist on the Hub or cannot be accessed. If the dataset is private or gated, make sure to log in with `huggingface-cli login` or visit the dataset page at https://huggingface.co/datasets/explore/cifar_test_nolabels.csv to ask for access.

In [22]:
from models.cifnet import (
    CifNetForImageClassification,
    CifNetConfig,
)
model_path = os.path.join('OUTPUTS', models[1])
model = CifNetForImageClassification.from_pretrained(model_path)

# Move model to testuation mode
model.test()

# If using a GPU
model = model.to('cuda')
preprocessed_test_data = preprocessed_test_data.to('cuda')

# Make predictions
with torch.no_grad():
    predictions = model(preprocessed_test_data)

# Process predictions as needed (e.g., applying softmax to get probabilities)

In [25]:
predictions = predictions.logits.argmax(dim=-1)

In [27]:
predictions.shape

torch.Size([10000])

In [33]:
# to csv
import pandas as pd

# ID,Labels

# Create a DataFrame with the predictions
df = pd.DataFrame({
    'ID': range(len(predictions)),
    'Labels': predictions.cpu().numpy()
})

df.to_csv('predictions.csv', index=False)


In [35]:
# see result distribution 
df['Labels'].value_counts()


Labels
1    4610
9    3886
0     451
6     275
3     266
5     250
2      84
8      80
7      52
4      46
Name: count, dtype: int64