In [9]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import torch
from torch import nn

from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from transformers import ViTConfig, ViTModel

from PIL import Image
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
data_dir = "/home/ishan/Projects/Flask/Dataset"

filepath_list, feature_list = [], []

with open(os.path.join(data_dir, "labels-map-proj-v3.txt")) as f:
    lines = f.readlines()
    for each_line in lines:
        line_list = each_line.split(" ")
        filepath_list.append(os.path.join(data_dir, "map-proj-v3", line_list[0]))
        feature_list.append(int(line_list[1]))

data_dict = {"filepath": filepath_list, "label": feature_list}
data_df = pd.DataFrame(data_dict)

data_df = data_df[~data_df["filepath"].str.contains(".DS_Store")]

print(f"Number of samples: {len(data_df)}")
print(f"Unique labels: {data_df['label'].nunique()}")

data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_df, temp_df = train_test_split(
    data_df,
    test_size=0.3,
    stratify=data_df["label"],
    random_state=42
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


Number of samples: 73031
Unique labels: 8
Training set size: 51121
Validation set size: 10955
Test set size: 10955


In [4]:
class CustomDataset(Dataset):

  def __init__(self, data_df):

    self.images = []
    self.labels = []
    self.transform = transforms.ToTensor()

    for _, row in data_df.iterrows():
      image_path = row["filepath"]
      self.images.append(image_path)
      self.labels.append(row["label"])

  def __len__(self):
      return len(self.images)

  def __getitem__(self, idx):
      image_path = self.images[idx]
      label = self.labels[idx]
      filename = image_path.split("/")[-1]

      if not os.path.exists(image_path):
         print(image_path)
      image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
      image = np.stack((image,)*3, axis=-1)

      return image, label, filename.split("/")[-1]

In [5]:
epochs = 1
batch_size = 1
learning_rate = 2e-5

train_dataset = CustomDataset(train_df)
val_dataset = CustomDataset(val_df)
test_dataset = CustomDataset(test_df)

n_cpu = os.cpu_count()
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=n_cpu, drop_last=True)
val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=n_cpu, drop_last=True)
test_dl =  DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=n_cpu)

In [6]:
class Classifier(nn.Module):

    def __init__(self, config, num_labels):

        super(Classifier, self).__init__()
        self.vit = ViTModel(config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.vit.config.hidden_size, num_labels)

    def forward(self, inputs):

        outputs = self.vit(inputs)
        output = self.dropout(outputs.last_hidden_state[:, 0])
        logits = self.classifier(output)
        logits = torch.nn.functional.softmax(logits, dim=1)
        return logits


# Define Models
config = ViTConfig(
    image_size=224,
    patch_size=16,
    num_classes=8,
    num_channels=1,
    embed_dim=1024,
    depth=24,
    num_heads=16,
    mlp_ratio=4,
    num_attention_heads=16,
    hidden_size=1024,
    num_layers=24
)

model = Classifier(config, 8).to(device)
n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Number of params (M): %.2f' % (n_parameters / 1.e6))

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

criterion = nn.CrossEntropyLoss()
  
if torch.cuda.is_available():
    model.cuda()

Number of params (M): 127.50


In [7]:
class ViTForImageClassification(nn.Module):

    def __init__(self, num_labels=3):

        super(ViTForImageClassification, self).__init__()
        self.vit = ViTModel.from_pretrained('google/vit-large-patch16-224-in21k')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.vit.config.hidden_size, num_labels)

    def forward(self, pixel_values):

        outputs = self.vit(pixel_values=pixel_values)
        output = self.dropout(outputs.last_hidden_state[:, 0])
        logits = self.classifier(output)
        logits = torch.nn.functional.softmax(logits, dim=1)
        return logits

In [None]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),         
    transforms.Normalize(mean=[0.5], std=[0.5])  
])

ground_truth, prediction = [] ,[]

model = torch.load("vit_imagenet_hirise_high.pth", map_location=torch.device('cpu'))

model.eval()

with torch.no_grad():

    for input, target in zip(test_df['filepath'], test_df['label']):

        image = Image.open(input).convert('RGB')

        input_tensor = transform(image).unsqueeze(0)  

        output = model(input_tensor)

        label = output.cpu().detach().numpy()
        prediction.append(np.argmax(label))
        ground_truth.append(target)

In [11]:
print("Accuracy:", accuracy_score(ground_truth, prediction))
print("Precision:", precision_score(ground_truth, prediction, average="weighted"))
print("Recall:", recall_score(ground_truth, prediction, average="weighted"))
print("F1-Score:", f1_score(ground_truth, prediction, average="weighted"))

Accuracy: 0.8311571364250337
Precision: 0.6908221854302622
Recall: 0.8311571364250337
F1-Score: 0.7545198297716315


  _warn_prf(average, modifier, msg_start, len(result))
