In [8]:
import os
from copy import deepcopy

from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms

import numpy as np

import custom_model

from PIL import Image

# autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
# transform for images
transform = transforms.Compose([
    transforms.Resize((2048, 1024)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Get the data
all_data = []
folder_paths=[r"D:\Unity\AITX_PanLoc\Assets\data\21.34938_-12.25565_folder"]
for folder_path in folder_paths:
    file_paths = os.listdir(folder_path)

    base_image = next((os.path.join(folder_path, f) for f in file_paths if "_1." in f and f.endswith(".jpg")), None)
    base_text = next((os.path.join(folder_path, f) for f in file_paths if "_1." in f and f.endswith(".txt")), None)

    if not (base_image and base_text):
        print(f"Base image or text not found in {folder_path}")
        continue

    with open(base_text, 'r') as f:
        base_position = [float(x) for x in f.read().split("(")[1].split(")")[0].split(",")]

    for file_path in file_paths:
        if "_1." in file_path or not file_path.endswith(".jpg"):
            continue

        image_path = os.path.join(folder_path, file_path)
        txt_path = os.path.join(folder_path, f"{os.path.splitext(file_path)[0]}.txt")

        if os.path.exists(txt_path):
            with open(txt_path, 'r') as f:
                location = [float(x) for x in f.read().split("(")[1].split(")")[0].split(",")]
            relative_x = location[0] - base_position[0]
            relative_y = location[2] - base_position[2]
            
            max_distance = 50
            normalized_x = np.clip(relative_x / max_distance, -1, 1)
            normalized_y = np.clip(relative_y / max_distance, -1, 1)
            
            all_data.append((base_image, image_path, normalized_x, normalized_y, 1.0))

# Turn the paths into tensors of the image
for i, data in enumerate(all_data):
    base_image=Image.open(data[0])
    base_image=transform(base_image).unsqueeze(0)
    current_image=Image.open(data[1])
    current_image=transform(current_image).unsqueeze(0)
    position=(data[2], data[4])
    all_data[i]=(base_image, current_image, position)

In [47]:
class ImagePairDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        base_img, current_img, position = self.data[idx]
        return base_img.squeeze(0), current_img.squeeze(0), torch.tensor(position)

dataset = ImagePairDataset(all_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [29]:
model = custom_model.ImagePositionPredictor()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

In [58]:
def custom_loss(pred, target):
    # convert everything to float
    pred = pred.float()
    target = target.float()
    mse_loss = F.mse_loss(pred[:, :1], target[:, :1])
    # bce_loss = F.binary_cross_entropy(pred[:, 1], target[:, 1])
    return mse_loss#+ bce_loss

In [59]:
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for base_img, current_img, position in dataloader:
        base_img, current_img, position = base_img.to(device), current_img.to(device), position.to(device)

        optimizer.zero_grad()
        output = model(base_img, current_img)
        loss = custom_loss(output, position)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    scheduler.step(avg_loss)

tensor([[-0.0273,  0.0559],
        [-0.0430,  0.0506],
        [-0.0451,  0.0522]], device='cuda:0', grad_fn=<SliceBackward0>)
tensor([[-0.8784,  1.0000],
        [ 0.1934,  1.0000],
        [-0.6916,  1.0000]], device='cuda:0', dtype=torch.float64)
<class 'torch.Tensor'>
<class 'torch.Tensor'>
Epoch 1/3, Loss: 0.3994
tensor([[-0.3911, -0.0596],
        [-0.3612, -0.0597],
        [-0.3533, -0.0577]], device='cuda:0', grad_fn=<SliceBackward0>)
tensor([[-0.8784,  1.0000],
        [ 0.1934,  1.0000],
        [-0.6916,  1.0000]], device='cuda:0', dtype=torch.float64)
<class 'torch.Tensor'>
<class 'torch.Tensor'>
Epoch 2/3, Loss: 0.2199
tensor([[-0.5921, -0.1350],
        [-0.6048, -0.1373],
        [-0.6552, -0.1501]], device='cuda:0', grad_fn=<SliceBackward0>)
tensor([[-0.6916,  1.0000],
        [ 0.1934,  1.0000],
        [-0.8784,  1.0000]], device='cuda:0', dtype=torch.float64)
<class 'torch.Tensor'>
<class 'torch.Tensor'>
Epoch 3/3, Loss: 0.2323
