#  VIT Model

In [1]:
import json
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTForImageClassification
from sklearn.model_selection import train_test_split

# Step 1: Load Data from JSON
with open('C:\\Users\\hp\\Downloads\\DataVIT.json') as f:
    data = json.load(f)

# Flatten the image data
image_data = []
for item in data:
    for img in item['images']:
        image_data.append({'url': img['url'], 'label': item['interests'][0]})

# Collect paths to all .pt files
preprocessed_files = sorted([f'C:\\Users\\hp\\Downloads\\preprocessed_images_combined\\{filename}' 
                             for filename in os.listdir('C:\\Users\\hp\\Downloads\\preprocessed_images_combined/') 
                             if filename.endswith('.pt')])

# Create a mapping from filename to preprocessed file path
preprocessed_file_map = {os.path.basename(file): file for file in preprocessed_files}

# Step 2: Split Data into Training and Test Sets
train_data, test_data = train_test_split(image_data, test_size=0.2, random_state=42)

# Filter the train data to include only entries that have a corresponding preprocessed file
filtered_train_data = [entry for entry in train_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Filter the test data to include only entries that have a corresponding preprocessed file
filtered_test_data = [entry for entry in test_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Verify the length of filtered_train_data, filtered_test_data and preprocessed_files
print(f"Number of filtered train data entries: {len(filtered_train_data)}")
print(f"Number of filtered test data entries: {len(filtered_test_data)}")
print(f"Number of preprocessed files: {len(preprocessed_files)}")

# Map the preprocessed files to train and test data
preprocessed_train_files = [preprocessed_file_map[os.path.basename(entry['url']).replace('.jpg', '.pt')] for entry in filtered_train_data]
preprocessed_test_files = [preprocessed_file_map[os.path.basename(entry['url']).replace('.jpg', '.pt')] for entry in filtered_test_data]

# Step 3: Create the Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, data, preprocessed_files, transform=None):
        self.data = data
        self.preprocessed_files = preprocessed_files
        self.transform = transform

        # Ensure the lengths match
        assert len(self.data) == len(self.preprocessed_files), (
            f"Data length {len(self.data)} does not match number of preprocessed files {len(self.preprocessed_files)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        preprocessed_image = torch.load(self.preprocessed_files[idx])  # Load the preprocessed tensor from file
        label = self.data[idx]['label']
        if self.transform:
            preprocessed_image = self.transform(preprocessed_image)
        return preprocessed_image, label

# Define transformations (if any additional transformations are needed)
transform = transforms.Compose([
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Create datasets
train_dataset = CustomDataset(filtered_train_data, preprocessed_train_files, transform=transform)
test_dataset = CustomDataset(filtered_test_data, preprocessed_test_files, transform=transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 4: Create the ViT Model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=len(set([item['label'] for item in filtered_train_data])),
    ignore_mismatched_sizes=True
)

# Step 5: Train the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 10 == 9:  # Print every 10 batches
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {running_loss / 10:.4f}')
            running_loss = 0.0
    print(f'Epoch {epoch+1} finished.')

# Step 6: Evaluate the Model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        if total % 100 == 0:  # Print every 100 images
            print(f'Progress: {total}/{len(test_loader.dataset)} images evaluated.')

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy:.2f}%')


  warn(


Number of filtered train data entries: 11856
Number of filtered test data entries: 2975
Number of preprocessed files: 14640




config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/5], Batch [10/741], Loss: 2.5350
Epoch [1/5], Batch [20/741], Loss: 2.3148
Epoch [1/5], Batch [30/741], Loss: 2.2133
Epoch [1/5], Batch [40/741], Loss: 2.2502
Epoch [1/5], Batch [50/741], Loss: 2.0798
Epoch [1/5], Batch [60/741], Loss: 2.1455
Epoch [1/5], Batch [70/741], Loss: 2.0068
Epoch [1/5], Batch [80/741], Loss: 1.9502
Epoch [1/5], Batch [90/741], Loss: 1.8795
Epoch [1/5], Batch [100/741], Loss: 1.9982
Epoch [1/5], Batch [110/741], Loss: 1.9482
Epoch [1/5], Batch [120/741], Loss: 1.9646
Epoch [1/5], Batch [130/741], Loss: 1.7429
Epoch [1/5], Batch [140/741], Loss: 2.0422
Epoch [1/5], Batch [150/741], Loss: 1.8537
Epoch [1/5], Batch [160/741], Loss: 1.9221
Epoch [1/5], Batch [170/741], Loss: 1.6791
Epoch [1/5], Batch [180/741], Loss: 1.8701
Epoch [1/5], Batch [190/741], Loss: 1.7824
Epoch [1/5], Batch [200/741], Loss: 1.7401
Epoch [1/5], Batch [210/741], Loss: 1.7004
Epoch [1/5], Batch [220/741], Loss: 1.6815
Epoch [1/5], Batch [230/741], Loss: 1.6987
Epoch [1/5], Batch [

In [3]:
from sklearn.metrics import classification_report
# Step 7: Display the Classification Report
print(classification_report(all_labels, all_predictions, target_names=list(set(item['label'] for item in filtered_train_data))))

NameError: name 'all_labels' is not defined

# VIT Amelioration

In [4]:
import json
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTForImageClassification
from sklearn.model_selection import train_test_split

# Step 1: Load Data from JSON
with open('C:\\Users\\hp\\Downloads\\DataVIT.json') as f:
    data = json.load(f)

# Flatten the image data
image_data = []
for item in data:
    for img in item['images']:
        image_data.append({'url': img['url'], 'label': item['interests'][0]})

# Collect paths to all .pt files
preprocessed_files = sorted([f'C:\\Users\\hp\\Downloads\\preprocessed_images_combined\\{filename}' 
                             for filename in os.listdir('C:\\Users\\hp\\Downloads\\preprocessed_images_combined') 
                             if filename.endswith('.pt')])

# Create a mapping from filename to preprocessed file path
preprocessed_file_map = {os.path.basename(file): file for file in preprocessed_files}

# Step 2: Split Data into Training and Test Sets
train_data, test_data = train_test_split(image_data, test_size=0.2, random_state=42)

# Filter the train data to include only entries that have a corresponding preprocessed file
filtered_train_data = [entry for entry in train_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Filter the test data to include only entries that have a corresponding preprocessed file
filtered_test_data = [entry for entry in test_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Verify the length of filtered_train_data, filtered_test_data and preprocessed_files
print(f"Number of filtered train data entries: {len(filtered_train_data)}")
print(f"Number of filtered test data entries: {len(filtered_test_data)}")
print(f"Number of preprocessed files: {len(preprocessed_files)}")

# Map the preprocessed files to train and test data
preprocessed_train_files = [preprocessed_file_map[os.path.basename(entry['url']).replace('.jpg', '.pt')] for entry in filtered_train_data]
preprocessed_test_files = [preprocessed_file_map[os.path.basename(entry['url']).replace('.jpg', '.pt')] for entry in filtered_test_data]

# Step 3: Create the Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, data, preprocessed_files, transform=None):
        self.data = data
        self.preprocessed_files = preprocessed_files
        self.transform = transform

        # Ensure the lengths match
        assert len(self.data) == len(self.preprocessed_files), (
            f"Data length {len(self.data)} does not match number of preprocessed files {len(self.preprocessed_files)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        preprocessed_image = torch.load(self.preprocessed_files[idx])  # Load the preprocessed tensor from file
        label = self.data[idx]['label']
        if self.transform:
            preprocessed_image = self.transform(preprocessed_image)
        return preprocessed_image, label

# Define transformations (including data augmentation)
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Create datasets
train_dataset = CustomDataset(filtered_train_data, preprocessed_train_files, transform=transform)
test_dataset = CustomDataset(filtered_test_data, preprocessed_test_files, transform=transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 4: Create the ViT Model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=len(set([item['label'] for item in filtered_train_data])),
    ignore_mismatched_sizes=True
)

# Step 5: Train the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 10 == 9:  # Print every 10 batches
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {running_loss / 10:.4f}')
            running_loss = 0.0
    print(f'Epoch {epoch+1} finished.')

# Step 6: Evaluate the Model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        if total % 100 == 0:  # Print every 100 images
            print(f'Progress: {total}/{len(test_loader.dataset)} images evaluated.')

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy:.2f}%')


Number of filtered train data entries: 11856
Number of filtered test data entries: 2975
Number of preprocessed files: 14640


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Batch [10/741], Loss: 2.6641
Epoch [1/10], Batch [20/741], Loss: 2.4907
Epoch [1/10], Batch [30/741], Loss: 2.4401
Epoch [1/10], Batch [40/741], Loss: 2.3358
Epoch [1/10], Batch [50/741], Loss: 2.3505
Epoch [1/10], Batch [60/741], Loss: 2.2187
Epoch [1/10], Batch [70/741], Loss: 2.2616
Epoch [1/10], Batch [80/741], Loss: 2.2518
Epoch [1/10], Batch [90/741], Loss: 2.1958
Epoch [1/10], Batch [100/741], Loss: 2.0777
Epoch [1/10], Batch [110/741], Loss: 2.1421
Epoch [1/10], Batch [120/741], Loss: 2.1205
Epoch [1/10], Batch [130/741], Loss: 2.1175
Epoch [1/10], Batch [140/741], Loss: 1.8167
Epoch [1/10], Batch [150/741], Loss: 2.1877
Epoch [1/10], Batch [160/741], Loss: 2.1421
Epoch [1/10], Batch [170/741], Loss: 1.9674
Epoch [1/10], Batch [180/741], Loss: 2.0756
Epoch [1/10], Batch [190/741], Loss: 2.0118
Epoch [1/10], Batch [200/741], Loss: 1.9082
Epoch [1/10], Batch [210/741], Loss: 1.9278
Epoch [1/10], Batch [220/741], Loss: 1.8974
Epoch [1/10], Batch [230/741], Loss: 1.94

In [5]:
import json
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTForImageClassification
from sklearn.model_selection import train_test_split

# Step 1: Load Data from JSON
with open('C:\\Users\\hp\\Downloads\\DataVIT.json') as f:
    data = json.load(f)

# Flatten the image data
image_data = []
for item in data:
    for img in item['images']:
        image_data.append({'url': img['url'], 'label': item['interests'][0]})

# Collect paths to all .pt files
preprocessed_files = sorted([f'C:\\Users\\hp\\Downloads\\preprocessed_images_combined\\{filename}' 
                             for filename in os.listdir('C:\\Users\\hp\\Downloads\\preprocessed_images_combined') 
                             if filename.endswith('.pt')])

# Create a mapping from filename to preprocessed file path
preprocessed_file_map = {os.path.basename(file): file for file in preprocessed_files}

# Step 2: Split Data into Training and Test Sets
train_data, test_data = train_test_split(image_data, test_size=0.2, random_state=42)

# Filter the train data to include only entries that have a corresponding preprocessed file
filtered_train_data = [entry for entry in train_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Filter the test data to include only entries that have a corresponding preprocessed file
filtered_test_data = [entry for entry in test_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Define transformations (without data augmentation)
transform = transforms.Compose([
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Create datasets
train_dataset = CustomDataset(filtered_train_data, preprocessed_train_files, transform=transform)
test_dataset = CustomDataset(filtered_test_data, preprocessed_test_files, transform=transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 4: Create the ViT Model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=len(set([item['label'] for item in filtered_train_data])),
    ignore_mismatched_sizes=True
)

# Step 5: Train the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 10 == 9:  # Print every 10 batches
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {running_loss / 10:.4f}')
            running_loss = 0.0
    print(f'Epoch {epoch+1} finished.')

# Step 6: Evaluate the Model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        if total % 100 == 0:  # Print every 100 images
            print(f'Progress: {total}/{len(test_loader.dataset)} images evaluated.')

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy:.2f}%')


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Batch [10/741], Loss: 2.5717
Epoch [1/10], Batch [20/741], Loss: 2.4114
Epoch [1/10], Batch [30/741], Loss: 2.4064
Epoch [1/10], Batch [40/741], Loss: 2.3562
Epoch [1/10], Batch [50/741], Loss: 2.3055
Epoch [1/10], Batch [60/741], Loss: 2.3425
Epoch [1/10], Batch [70/741], Loss: 2.2487
Epoch [1/10], Batch [80/741], Loss: 2.2641
Epoch [1/10], Batch [90/741], Loss: 2.1757
Epoch [1/10], Batch [100/741], Loss: 2.1765
Epoch [1/10], Batch [110/741], Loss: 2.1063
Epoch [1/10], Batch [120/741], Loss: 2.1868
Epoch [1/10], Batch [130/741], Loss: 2.0708
Epoch [1/10], Batch [140/741], Loss: 2.0361
Epoch [1/10], Batch [150/741], Loss: 2.0284
Epoch [1/10], Batch [160/741], Loss: 1.9470
Epoch [1/10], Batch [170/741], Loss: 2.0512
Epoch [1/10], Batch [180/741], Loss: 2.1432
Epoch [1/10], Batch [190/741], Loss: 2.0070
Epoch [1/10], Batch [200/741], Loss: 1.9716
Epoch [1/10], Batch [210/741], Loss: 1.8476
Epoch [1/10], Batch [220/741], Loss: 1.8850
Epoch [1/10], Batch [230/741], Loss: 1.99

In [6]:
import json
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import ViTForImageClassification
from sklearn.model_selection import train_test_split

# Step 1: Load Data from JSON
with open('C:\\Users\\hp\\Downloads\\DataVIT.json') as f:
    data = json.load(f)

# Flatten the image data
image_data = []
for item in data:
    for img in item['images']:
        image_data.append({'url': img['url'], 'label': item['interests'][0]})

# Collect paths to all .pt files
preprocessed_files = sorted([f'C:\\Users\\hp\\Downloads\\preprocessed_images_combined\\{filename}' 
                             for filename in os.listdir('C:\\Users\\hp\\Downloads\\preprocessed_images_combined/') 
                             if filename.endswith('.pt')])

# Create a mapping from filename to preprocessed file path
preprocessed_file_map = {os.path.basename(file): file for file in preprocessed_files}

# Step 2: Split Data into Training and Test Sets
train_data, test_data = train_test_split(image_data, test_size=0.2, random_state=42)

# Filter the train data to include only entries that have a corresponding preprocessed file
filtered_train_data = [entry for entry in train_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Filter the test data to include only entries that have a corresponding preprocessed file
filtered_test_data = [entry for entry in test_data if os.path.basename(entry['url']).replace('.jpg', '.pt') in preprocessed_file_map]

# Verify the length of filtered_train_data, filtered_test_data and preprocessed_files
print(f"Number of filtered train data entries: {len(filtered_train_data)}")
print(f"Number of filtered test data entries: {len(filtered_test_data)}")
print(f"Number of preprocessed files: {len(preprocessed_files)}")

# Map the preprocessed files to train and test data
preprocessed_train_files = [preprocessed_file_map[os.path.basename(entry['url']).replace('.jpg', '.pt')] for entry in filtered_train_data]
preprocessed_test_files = [preprocessed_file_map[os.path.basename(entry['url']).replace('.jpg', '.pt')] for entry in filtered_test_data]

# Step 3: Create the Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, data, preprocessed_files, transform=None):
        self.data = data
        self.preprocessed_files = preprocessed_files
        self.transform = transform

        # Ensure the lengths match
        assert len(self.data) == len(self.preprocessed_files), (
            f"Data length {len(self.data)} does not match number of preprocessed files {len(self.preprocessed_files)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        preprocessed_image = torch.load(self.preprocessed_files[idx])  # Load the preprocessed tensor from file
        label = self.data[idx]['label']
        if self.transform:
            preprocessed_image = self.transform(preprocessed_image)
        return preprocessed_image, label

# Define transformations (if any additional transformations are needed)
transform = transforms.Compose([
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Create datasets
train_dataset = CustomDataset(filtered_train_data, preprocessed_train_files, transform=transform)
test_dataset = CustomDataset(filtered_test_data, preprocessed_test_files, transform=transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 4: Create the ViT Model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=len(set([item['label'] for item in filtered_train_data])),
    ignore_mismatched_sizes=True
)

# Step 5: Train the Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 10 == 9:  # Print every 10 batches
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{len(train_loader)}], Loss: {running_loss / 10:.4f}')
            running_loss = 0.0
    print(f'Epoch {epoch+1} finished.')

# Step 6: Evaluate the Model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).logits
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        if total % 100 == 0:  # Print every 100 images
            print(f'Progress: {total}/{len(test_loader.dataset)} images evaluated.')

accuracy = 100 * correct / total
print(f'Accuracy: {accuracy:.2f}%')


Number of filtered train data entries: 11856
Number of filtered test data entries: 2975
Number of preprocessed files: 14640


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [1/10], Batch [10/741], Loss: 2.5790
Epoch [1/10], Batch [20/741], Loss: 2.2073
Epoch [1/10], Batch [30/741], Loss: 2.1986
Epoch [1/10], Batch [40/741], Loss: 2.1167
Epoch [1/10], Batch [50/741], Loss: 2.0829
Epoch [1/10], Batch [60/741], Loss: 2.1577
Epoch [1/10], Batch [70/741], Loss: 2.0347
Epoch [1/10], Batch [80/741], Loss: 1.9986
Epoch [1/10], Batch [90/741], Loss: 2.0597
Epoch [1/10], Batch [100/741], Loss: 1.8853
Epoch [1/10], Batch [110/741], Loss: 1.8176
Epoch [1/10], Batch [120/741], Loss: 2.0313
Epoch [1/10], Batch [130/741], Loss: 1.9176
Epoch [1/10], Batch [140/741], Loss: 2.0088
Epoch [1/10], Batch [150/741], Loss: 1.8199
Epoch [1/10], Batch [160/741], Loss: 1.7769
Epoch [1/10], Batch [170/741], Loss: 1.7643
Epoch [1/10], Batch [180/741], Loss: 2.0189
Epoch [1/10], Batch [190/741], Loss: 1.7140
Epoch [1/10], Batch [200/741], Loss: 1.7912
Epoch [1/10], Batch [210/741], Loss: 1.8924
Epoch [1/10], Batch [220/741], Loss: 1.9192
Epoch [1/10], Batch [230/741], Loss: 1.79

In [None]:
import tensorflow as tf
from tensorflow.keras import layers as L

# Define hyperparameters
learning_rate = 0.001
weight_decay = 0.0001
num_epochs = 1
image_size = 224  # Adjust based on your actual image size
patch_size = 7
num_patches = (image_size // patch_size) ** 2
projection_dim = 64
num_heads = 4
transformer_units = [projection_dim * 2, projection_dim]
transformer_layers = 8
mlp_head_units = [56, 28]
n_classes = 10  # Adjust based on your actual number of classes

# Define MLP
def mlp(x, hidden_units, dropout_rate):
    for units in hidden_units:
        x = L.Dense(units, activation=tf.nn.gelu)(x)
        x = L.Dropout(dropout_rate)(x)
    return x

# Define Patch Creation Layer
class Patches(L.Layer):
    def __init__(self, patch_size):
        super(Patches, self).__init__()
        self.patch_size = patch_size

    def call(self, images):
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding='VALID',
        )
        patch_dims = patches.shape[-1]
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

# Define Patch Encoding Layer
class PatchEncoder(L.Layer):
    def __init__(self, num_patches, projection_dim):
        super(PatchEncoder, self).__init__()
        self.num_patches = num_patches
        self.projection = L.Dense(units=projection_dim)
        self.position_embedding = L.Embedding(input_dim=num_patches, output_dim=projection_dim)

    def call(self, patch):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

# Build the ViT model
def vision_transformer():
    inputs = L.Input(shape=(image_size, image_size, 3))
    
    # Create patches
    patches = Patches(patch_size)(inputs)
    
    # Encode patches
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)
    
    # Create multiple layers of the Transformer block
    for _ in range(transformer_layers):
        x1 = L.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = L.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim, dropout=0.1)(x1, x1)
        x2 = L.Add()([attention_output, encoded_patches])
        x3 = L.LayerNormalization(epsilon=1e-6)(x2)
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        encoded_patches = L.Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor
    representation = L.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = L.Flatten()(representation)
    representation = L.Dropout(0.5)(representation)
    
    # Add MLP
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
    
    # Classify outputs
    logits = L.Dense(n_classes)(features)
    
    # Create the model
    model = tf.keras.Model(inputs=inputs, outputs=logits)
    
    return model

# Set learning rate schedule and optimizer
decay_steps = len(dataset) // 32  # Adjust based on your batch size
initial_learning_rate = learning_rate
lr_decayed_fn = tf.keras.experimental.CosineDecay(initial_learning_rate, decay_steps)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_decayed_fn)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Compile the model
model = vision_transformer()
model.compile(optimizer=optimizer, 
              loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1), 
              metrics=['accuracy'])

# Define callbacks
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5, mode='max', restore_best_weights=True, verbose=1)
checkpointer = tf.keras.callbacks.ModelCheckpoint(filepath='./model.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True, mode='max')
callbacks = [earlystopping, lr_scheduler, checkpointer]

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
train_dataset = dataset.take(train_size)
valid_dataset = dataset.skip(train_size)

# Train the model
model.fit(train_dataset, validation_data=valid_dataset, epochs=num_epochs, callbacks=callbacks)

# Evaluate the model
print('Training results')
model.evaluate(train_dataset)
print('Validation results')
model.evaluate(valid_dataset)
