In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import sys
sys.path.append('/content/drive/MyDrive/dl-proj')

In [22]:
import kagglehub
import os
import torch
import pandas as pd
import matplotlib.pyplot as plt
import shutil

from sklearn.model_selection import train_test_split
from torchvision import transforms
from torch.utils.data import DataLoader
from helper_utils import ImageDataset, SplitImageDataset
from model import ResNetModel
from model_utils import train_model

from clip_utils import predict_image

from transformers import CLIPProcessor, CLIPModel
from PIL import Image

In [23]:
# Download data if it does not exist already
data_path = "./data/"
image_path = "./data/Images"
csv_path = "./data/Dataframes"

#define source path
source_path = kagglehub.dataset_download("amaralibey/gsv-cities")
print(source_path)

#check for cuda or mps
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

/root/.cache/kagglehub/datasets/amaralibey/gsv-cities/versions/1


In [24]:
#define destination path
destination_path = "./data"

# Ensure the destination directory exists
os.makedirs(destination_path, exist_ok=True)

# Copy all contents from source to destination
for item in os.listdir(source_path):
    source_item = os.path.join(source_path, item)
    destination_item = os.path.join(destination_path, item)

    shutil.move(source_item, destination_item)

print(f"All files have been copied to {destination_path}")


All files have been copied to ./data


In [25]:
image_path = "./data/Images"     # This was the path to my images, might need to remove

dataset = ImageDataset(image_path)

data, labels = dataset.image_paths, dataset.labels

#Split the dataset into training, validation, and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.2, random_state=7 # 80% train, 20% test
)

#Split the test set into validation and testing sets (10% each of overall)
val_size = 0.5
val_data, test_data, val_labels, test_labels = train_test_split(
    test_data, test_labels, test_size=val_size, random_state=7
)

Loading images from: Bangkok
Loading images from: TRT
Loading images from: OSL
Loading images from: PRG
Loading images from: LosAngeles
Loading images from: Chicago
Loading images from: WashingtonDC
Loading images from: Phoenix
Loading images from: Melbourne
Loading images from: Boston
Loading images from: Brussels
Loading images from: Madrid
Loading images from: Barcelona
Loading images from: BuenosAires
Loading images from: Minneapolis
Loading images from: London
Loading images from: Medellin
Loading images from: Lisbon
Loading images from: Rome
Loading images from: Miami
Loading images from: MexicoCity
Loading images from: PRS
Loading images from: Osaka


In [26]:
# Split into three datasets, apply transformations to train dataset
train_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ColorJitter(brightness=.15,contrast=.15,saturation=.05,hue=.05),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])


train_dataset = SplitImageDataset(train_data, train_labels, transform = train_transform)
val_dataset = SplitImageDataset(val_data, val_labels)
test_dataset = SplitImageDataset(test_data, test_labels)


In [27]:
# Build data loader for three datasets
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [28]:
# Define model

num_classes = len(dataset.label_map)
model = ResNetModel(num_classes=num_classes).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [29]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=device)

In [30]:
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

def make_preds(image_path, model, processor, class_names):
  text_inputs = processor(text=class_names, return_tensors="pt", padding=True)

  predicted_labels = []

  for image_path in tqdm(test_data):
      predicted_label, _ = predict_image(
          image_path=image_path,
          model=model,
          processor=processor,
          text_inputs=text_inputs,
          class_names=class_names
      )
      predicted_labels.append(predicted_label)        # will return countries

  label_to_index = {name: idx for idx, name in enumerate(class_names)}
  predicted_labels_idx = [label_to_index[label] for label in predicted_labels]

  print(test_labels)

  accuracy = accuracy_score(test_labels, predicted_labels_idx)
  precision = precision_score(test_labels, predicted_labels_idx, average="weighted")
  recall = recall_score(test_labels, predicted_labels_idx, average="weighted")
  f1 = f1_score(test_labels, predicted_labels_idx, average="weighted")

  print(f"Accuracy: {accuracy * 100:.2f}%")
  print(f"Precision: {precision:.2f}")
  print(f"Recall: {recall:.2f}")
  print(f"F1 Score: {f1:.2f}")

In [31]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

locations = ["Bangkok, Thailand", "Toronto, Canada", "Oslo, Norway", "Prague, Czech Republic", "Los Angeles, United States", "Chicago, United States", "Washington DC, United States", "Phoenix, United States", "Melbourne, Australia", "Boston, United States", "Brussels, Belgium", "Madrid, Spain", "Barcelona, Spain", "Buenos Aires, Argentina", "Minneapolis, United States", "London, United Kingdom", "Medellin, Colombia", "Lisbon, Portugal", "Rome, Italy", "Miami, United States", "Mexico City, Mexico", "Paris, France", "Osaka, Japan"]
countries = ["Thailand", "Canada", "Norway", "Czech Republic", "United States", "United States", "United States", "United States", "Australia", "United States", "Belgium", "Spain", "Spain", "Argentina", "United States", "United Kingdom", "Colombia", "Portugal", "Italy", "United States", "Mexico", "France", "Japan"]
cities = ["Bangkok", "Toronto", "Oslo", "Prague", "Los Angeles", "Chicago", "Washington DC", "Phoenix", "Melbourne", "Boston", "Brussels", "Madrid", "Barcelona", "Buenos Aires", "Minneapolis", "London", "Medellin", "Lisbon", "Rome", "Miami", "Mexico City", "Paris", "Osaka"]


In [14]:
make_preds(image_path, model, processor, locations)

100%|██████████| 2300/2300 [20:10<00:00,  1.90it/s]

[17, 10, 11, 22, 2, 4, 6, 22, 11, 8, 15, 12, 0, 16, 12, 20, 9, 22, 8, 17, 0, 17, 0, 21, 14, 6, 22, 4, 19, 7, 10, 22, 16, 13, 21, 10, 0, 20, 14, 11, 6, 15, 0, 1, 0, 16, 17, 5, 3, 2, 4, 7, 21, 16, 15, 3, 20, 14, 13, 5, 1, 13, 22, 3, 12, 12, 6, 0, 16, 9, 21, 14, 8, 20, 13, 20, 12, 6, 10, 17, 0, 0, 9, 6, 18, 13, 13, 13, 17, 20, 10, 17, 11, 7, 19, 18, 0, 12, 15, 3, 9, 15, 17, 9, 15, 9, 12, 14, 7, 20, 18, 4, 9, 17, 11, 12, 14, 8, 3, 4, 0, 21, 21, 22, 1, 16, 18, 22, 7, 4, 20, 3, 9, 17, 13, 9, 1, 5, 18, 12, 0, 19, 17, 8, 9, 11, 1, 13, 18, 22, 16, 12, 20, 20, 16, 19, 22, 11, 9, 2, 2, 3, 8, 8, 11, 12, 12, 21, 4, 22, 11, 19, 22, 10, 18, 8, 6, 14, 5, 19, 20, 17, 9, 0, 20, 7, 4, 21, 20, 12, 9, 11, 16, 6, 20, 13, 20, 16, 18, 7, 14, 20, 12, 21, 3, 19, 9, 1, 13, 19, 4, 10, 11, 13, 10, 16, 2, 4, 17, 0, 6, 19, 5, 12, 1, 12, 18, 7, 8, 21, 20, 8, 1, 22, 2, 10, 6, 18, 3, 16, 5, 14, 15, 7, 21, 21, 18, 9, 17, 20, 22, 11, 21, 18, 20, 0, 5, 1, 10, 5, 7, 3, 10, 11, 5, 18, 17, 3, 14, 6, 3, 16, 19, 6, 4, 11, 8, 2




In [15]:
make_preds(image_path, model, processor, countries)

100%|██████████| 2300/2300 [15:57<00:00,  2.40it/s]

[17, 10, 11, 22, 2, 4, 6, 22, 11, 8, 15, 12, 0, 16, 12, 20, 9, 22, 8, 17, 0, 17, 0, 21, 14, 6, 22, 4, 19, 7, 10, 22, 16, 13, 21, 10, 0, 20, 14, 11, 6, 15, 0, 1, 0, 16, 17, 5, 3, 2, 4, 7, 21, 16, 15, 3, 20, 14, 13, 5, 1, 13, 22, 3, 12, 12, 6, 0, 16, 9, 21, 14, 8, 20, 13, 20, 12, 6, 10, 17, 0, 0, 9, 6, 18, 13, 13, 13, 17, 20, 10, 17, 11, 7, 19, 18, 0, 12, 15, 3, 9, 15, 17, 9, 15, 9, 12, 14, 7, 20, 18, 4, 9, 17, 11, 12, 14, 8, 3, 4, 0, 21, 21, 22, 1, 16, 18, 22, 7, 4, 20, 3, 9, 17, 13, 9, 1, 5, 18, 12, 0, 19, 17, 8, 9, 11, 1, 13, 18, 22, 16, 12, 20, 20, 16, 19, 22, 11, 9, 2, 2, 3, 8, 8, 11, 12, 12, 21, 4, 22, 11, 19, 22, 10, 18, 8, 6, 14, 5, 19, 20, 17, 9, 0, 20, 7, 4, 21, 20, 12, 9, 11, 16, 6, 20, 13, 20, 16, 18, 7, 14, 20, 12, 21, 3, 19, 9, 1, 13, 19, 4, 10, 11, 13, 10, 16, 2, 4, 17, 0, 6, 19, 5, 12, 1, 12, 18, 7, 8, 21, 20, 8, 1, 22, 2, 10, 6, 18, 3, 16, 5, 14, 15, 7, 21, 21, 18, 9, 17, 20, 22, 11, 21, 18, 20, 0, 5, 1, 10, 5, 7, 3, 10, 11, 5, 18, 17, 3, 14, 6, 3, 16, 19, 6, 4, 11, 8, 2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
make_preds(image_path, model, processor, cities)

100%|██████████| 2300/2300 [15:41<00:00,  2.44it/s]

[17, 10, 11, 22, 2, 4, 6, 22, 11, 8, 15, 12, 0, 16, 12, 20, 9, 22, 8, 17, 0, 17, 0, 21, 14, 6, 22, 4, 19, 7, 10, 22, 16, 13, 21, 10, 0, 20, 14, 11, 6, 15, 0, 1, 0, 16, 17, 5, 3, 2, 4, 7, 21, 16, 15, 3, 20, 14, 13, 5, 1, 13, 22, 3, 12, 12, 6, 0, 16, 9, 21, 14, 8, 20, 13, 20, 12, 6, 10, 17, 0, 0, 9, 6, 18, 13, 13, 13, 17, 20, 10, 17, 11, 7, 19, 18, 0, 12, 15, 3, 9, 15, 17, 9, 15, 9, 12, 14, 7, 20, 18, 4, 9, 17, 11, 12, 14, 8, 3, 4, 0, 21, 21, 22, 1, 16, 18, 22, 7, 4, 20, 3, 9, 17, 13, 9, 1, 5, 18, 12, 0, 19, 17, 8, 9, 11, 1, 13, 18, 22, 16, 12, 20, 20, 16, 19, 22, 11, 9, 2, 2, 3, 8, 8, 11, 12, 12, 21, 4, 22, 11, 19, 22, 10, 18, 8, 6, 14, 5, 19, 20, 17, 9, 0, 20, 7, 4, 21, 20, 12, 9, 11, 16, 6, 20, 13, 20, 16, 18, 7, 14, 20, 12, 21, 3, 19, 9, 1, 13, 19, 4, 10, 11, 13, 10, 16, 2, 4, 17, 0, 6, 19, 5, 12, 1, 12, 18, 7, 8, 21, 20, 8, 1, 22, 2, 10, 6, 18, 3, 16, 5, 14, 15, 7, 21, 21, 18, 9, 17, 20, 22, 11, 21, 18, 20, 0, 5, 1, 10, 5, 7, 3, 10, 11, 5, 18, 17, 3, 14, 6, 3, 16, 19, 6, 4, 11, 8, 2




In [34]:
countries = list({loc.split(", ")[1] for loc in locations})  # Extract unique countries
city_to_country_index = {i: countries.index(loc.split(", ")[1]) for i, loc in enumerate(locations)}


text_inputs = processor(text=countries, return_tensors="pt", padding=True)

predicted_labels = []

for image_path in tqdm(test_data):
    predicted_label, _ = predict_image(
        image_path=image_path,
        model=model,
        processor=processor,
        text_inputs=text_inputs,
        class_names=countries
    )
    predicted_labels.append(predicted_label)        # will return countries

label_to_index = {name: idx for idx, name in enumerate(countries)}
predicted_labels_idx = [label_to_index[label] for label in predicted_labels]
test_labels = [city_to_country_index[city_ind] for city_ind in test_labels if city_ind in city_to_country_index]



print(test_labels)

accuracy = accuracy_score(test_labels, predicted_labels_idx)
precision = precision_score(test_labels, predicted_labels_idx, average="weighted")
recall = recall_score(test_labels, predicted_labels_idx, average="weighted")
f1 = f1_score(test_labels, predicted_labels_idx, average="weighted")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

100%|██████████| 2300/2300 [14:21<00:00,  2.67it/s]

[5, 6, 2, 12, 13, 14, 14, 12, 2, 3, 9, 2, 8, 0, 2, 1, 14, 12, 3, 5, 8, 5, 8, 11, 14, 14, 12, 14, 14, 14, 6, 12, 0, 4, 11, 6, 8, 1, 14, 2, 14, 9, 8, 7, 8, 0, 5, 14, 15, 13, 14, 14, 11, 0, 9, 15, 1, 14, 4, 14, 7, 4, 12, 15, 2, 2, 14, 8, 0, 14, 11, 14, 3, 1, 4, 1, 2, 14, 6, 5, 8, 8, 14, 14, 10, 4, 4, 4, 5, 1, 6, 5, 2, 14, 14, 10, 8, 2, 9, 15, 14, 9, 5, 14, 9, 14, 2, 14, 14, 1, 10, 14, 14, 5, 2, 2, 14, 3, 15, 14, 8, 11, 11, 12, 7, 0, 10, 12, 14, 14, 1, 15, 14, 5, 4, 14, 7, 14, 10, 2, 8, 14, 5, 3, 14, 2, 7, 4, 10, 12, 0, 2, 1, 1, 0, 14, 12, 2, 14, 13, 13, 15, 3, 3, 2, 2, 2, 11, 14, 12, 2, 14, 12, 6, 10, 3, 14, 14, 14, 14, 1, 5, 14, 8, 1, 14, 14, 11, 1, 2, 14, 2, 0, 14, 1, 4, 1, 0, 10, 14, 14, 1, 2, 11, 15, 14, 14, 7, 4, 14, 14, 6, 2, 4, 6, 0, 13, 14, 5, 8, 14, 14, 14, 2, 7, 2, 10, 14, 3, 11, 1, 3, 7, 12, 13, 6, 14, 10, 15, 0, 14, 14, 9, 14, 11, 11, 10, 14, 5, 1, 12, 2, 11, 10, 1, 8, 14, 7, 6, 14, 14, 15, 6, 2, 14, 10, 5, 15, 14, 14, 15, 0, 14, 14, 14, 2, 3, 1, 9, 12, 0, 2, 12, 8, 7, 14, 6, 




In [17]:
# Fine-tuning CLIP

import torch
from torch.optim import AdamW
from torch.nn.functional import cosine_similarity
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define constants
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-2
num_epochs = 10
temperature = 0.07

# Load the model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training Loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for images, labels in train_loader:  # Ensure train_loader provides images and numeric labels
        images = images.to(device)
        labels = labels.to(device)

        # Preprocess images and labels
        pixel_values = processor(images=images, return_tensors="pt").pixel_values.to(device)
        text_inputs = processor(
            text=[locations[label] for label in labels.cpu().numpy()],
            return_tensors="pt",
            padding=True
        ).input_ids.to(device)    # tensor containing city, country names converted to int tokens

        # Forward pass
        outputs = model(pixel_values=pixel_values, input_ids=text_inputs)
        image_embeddings = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
        text_embeddings = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)

        # Compute similarity
        similarity = torch.matmul(image_embeddings, text_embeddings.T) # / temperature

        # Define ground truth and compute loss
        true_labels = torch.arange(similarity.size(0)).to(device)  # Diagonal targets
        loss_fn = torch.nn.CrossEntropyLoss()
        loss = loss_fn(similarity, true_labels)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader):.4f}")



# Validation Loop
model.eval()
val_loss = 0
all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)

        pixel_values = processor(images=images, return_tensors="pt").pixel_values.to(device)
        text_inputs = processor(
            text=[locations[label] for label in labels.cpu().numpy()],
            return_tensors="pt",
            padding=True
        ).input_ids.to(device)

        outputs = model(pixel_values=pixel_values, input_ids=text_inputs)
        image_embeddings = outputs.image_embeds / outputs.image_embeds.norm(dim=-1, keepdim=True)
        text_embeddings = outputs.text_embeds / outputs.text_embeds.norm(dim=-1, keepdim=True)

        similarity = torch.matmul(image_embeddings, text_embeddings.T) / temperature
        true_labels = torch.arange(similarity.size(0)).to(device)
        loss = torch.nn.CrossEntropyLoss()(similarity, true_labels)
        val_loss += loss.item()

        predicted_indices = similarity.argmax(dim=1)
        all_preds.extend(predicted_indices.cpu().numpy())
        all_labels.extend(true_labels.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average="weighted")
recall = recall_score(all_labels, all_preds, average="weighted")
f1 = f1_score(all_labels, all_preds, average="weighted")

print(f"Validation Loss: {val_loss / len(val_loader):.4f}")
print(f"Validation Accuracy: {accuracy * 100:.2f}%")
print(f"Validation Precision: {precision:.2f}")
print(f"Validation Recall: {recall:.2f}")
print(f"Validation F1 Score: {f1:.2f}")


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


Epoch 1/10, Loss: 3.4657
Epoch 2/10, Loss: 3.4657
Epoch 3/10, Loss: 3.4657


KeyboardInterrupt: 