In [41]:
from PIL import Image
import kagglehub
import numpy as np
import os
import random

def train_test_split(path, split=0.75, max=None):
    items = [f"{path}/{i}" for i in os.listdir(path) if "png" in i]
    items = items[:max] if max is not None else items
    random.shuffle(items)
    split_index = int(len(items) * split)
    train, test = items[:split_index], items[split_index:]
    return train, test


def csv_loader(path):
    coords = []
    with open(path) as f:
        lines = f.readlines()

    for line in lines:
        lat, lng = line.strip().split(",")
        coords.append([float(lat), float(lng)])

    return coords


# 10,000 images, zero-indexed.png
path = kagglehub.dataset_download("paulchambaz/google-street-view")
path = f"{path}/dataset"

coords = csv_loader(f"{path}/coords.csv")

print(f"{len(coords)} coordinates loaded (lat, lng)")

train, test = train_test_split(path, split=0.75, max=100)
print(f"{len(train)} images for train, {len(test)} images for test")

10000 coordinates loaded (lat, lng)
75 images for train, 25 images for test


In [42]:
# I call the inputs (incoming nodes) 'x', and outgoing (nodes that I affect) 'o'

class Layer():
    compiled = False
    len = None
    x = None
    o = None
    
    def compile(self, inputs: int = None):
        if self.len is None:
            if inputs is None:
                raise Exception("The layer has an undefined size")
            self.len = inputs
        self.compiled = True

    def forward(self, x):
        raise Exception("Forward not defined")

    def backward(self, dl):
        raise Exception("Backward not defined")
    
    def loss(self, lr):
        pass

    def __len__(self):
        return self.len
        
# ReLU and LeakyReLU together
class ReLU(Layer):
    def __init__(self, c=0):
        self.c = c

    def forward(self, x):
        self.x = x
        return np.where(x > 0, x, self.c * x)


class Sigmoid(Layer):
    def forward(self, x):
        self.x = x
        self.o = 1 / (1 + np.exp(-x))
        return self.o
    
    def backward(self, dl):
        return dl * self.o * (1 - self.o)


class PatchLinear(Layer):
    def __init__(self, outputs, wpn):
        self.len = outputs
        self.wpn = wpn

    def compile(self, _=None):
        if _ is not None:
            raise Exception("This layer must be the first layer")
            
        self.weights = np.random.randn(self.len, self.wpn)  # N x Inputs
        self.bias = np.random.randn(self.len)  # N x 1
        self.compiled = True

    def forward(self, x):
        self.x = x
        self.o = []
        for patch, weights in zip(x, self.weights):
            self.o.append(patch.T @ weights)  # dot product, returns float
        return np.array(self.o) + self.bias

    def backward(self, dl):
        self.dW = np.zeros_like(self.weights)
        self.db = np.zeros_like(self.bias)
        dx = np.zeros_like(self.x) 

        for i, (patch, w) in enumerate(zip(self.x, self.weights)):
            self.dW[i] = dl[i] * patch
            self.db[i] = dl[i]
            dx[i] = dl[i] * w
        return dx

    def loss(self, lr):
        self.weights -= lr * self.dW
        self.bias -= lr * self.db

    def __repr__(self):
        return f"Patch Linear layer, with {self.wpn} weights per neuron and {self.len} outputs"


class Dense(Layer):
    def __init__(self, outputs):
        self.len = outputs

    def compile(self, inputs):
        if inputs is None and self.inputs is None:
            raise Exception("This layer has an undefined size")
        self.inputs = inputs
        self.weights = np.random.randn(self.len, inputs) # N x Inputs
        self.bias = np.random.randn(self.len) # N x 1
        self.compiled = True

    def forward(self, x):
        self.x = x
        self.o = self.weights @ x + self.bias
        return self.o
            
    def backward(self, dl):
        self.db = dl # x 1
        self.dW = np.outer(dl, self.x) # (N) x (Inputs)
        return self.weights.T @ dl
    
    def loss(self, lr):
        self.weights -= lr * self.dW
        self.bias -= lr * self.db

        
    def __repr__(self):
        return f"Dense layer, with {self.inputs} inputs and {self.len} outputs"

class Softmax(Layer):
    def forward(self, x):
        self.x = x
        sum_exp = np.exp(x).sum()
        self.o = np.exp(x) / sum_exp
        return self.o

    def backward(self, dl):
        dq = np.zeros_like(self.o)
        for i in range(len(dl)):
            for j in range(len(dl)):
                dq[i] += dl[j] * (self.o[i] * (1 - self.o[i])) if i == j else -(self.o[i] * self.o[j])

        return dq

In [43]:
from typing import List

class Model:
    def __init__(self):
        self.layers: List[Layer] = []

    def sigmoid(self):
        self.layers.append(Sigmoid())
        return self
    
    def relu(self):
        self.layers.append(ReLU())
        return self

    def leaky_relu(self, c: float):
        self.layers.append(ReLU(c=c))
        return self
    
    def patch_linear(self, outputs, wpn):
        self.layers.append(PatchLinear(outputs, wpn))
        return self

    def dense(self, outputs):
        self.layers.append(Dense(outputs))
        return self

    def softmax(self):
        self.layers.append(Softmax())
        return self
    
    def compile(self):
        inputs = None
        for layer in self.layers:
            layer.compile(inputs)
            inputs = len(layer)

    def train(self, data, correct, lr=0.001, epochs=100):
        for epoch in range(epochs):
            lowest_loss = float("inf")
            for inputs, target in zip(data, correct):
                if len(self.layers) == 0:
                    raise Exception("No layers to train")

                if inputs.shape[0] != len(self.layers[0]):
                    raise Exception("Data/input-layer shape mismatch")

                # forward pass
                for layer in self.layers:
                    if not layer.compiled:
                        raise Exception("Please compile your layers!")
                    inputs = layer.forward(inputs)

                if inputs.shape != target.shape:
                    raise Exception("Output-layer/target shape mismatch")

                loss = -(target * np.log(inputs)).sum()
                if loss < lowest_loss:
                    lowest_loss = loss

                # backwards pass
                dl = -target / inputs
                for index in range(len(self.layers) - 1, -1, -1):
                    layer = self.layers[index]
                    dl = layer.backward(dl)
                    layer.loss(lr)
            
            print(f"Epoch {epoch + 1}/{epochs}; best loss:", lowest_loss)

    def log(self):
        for layer in self.layers:
            print(layer)



In [44]:
OUT_H, OUT_W = 16, 32

model = (
    Model()
    .patch_linear(400, wpn=3072)  # 20 x 20 patches, 3072 weights per neuron
    .sigmoid()
    .dense(320)
    .sigmoid()
    .dense(OUT_H * OUT_W) # broadcast back up to heatmap coordinates
    .sigmoid()
    .softmax()
)

model.compile()
model.log()

Patch Linear layer, with 3072 weights per neuron and 400 outputs
<__main__.Sigmoid object at 0x117e3acf0>
Dense layer, with 400 inputs and 320 outputs
<__main__.Sigmoid object at 0x117e3a750>
Dense layer, with 320 inputs and 512 outputs
<__main__.Sigmoid object at 0x117e3aff0>
<__main__.Softmax object at 0x117e3b440>


In [None]:
def load_imgs(images):
    return np.array([np.asarray(Image.open(img)) for img in images])


def load_correct(images, height, width, sigma=1.0):
    images = [image.split("/")[-1] for image in images]
    indices = [int(image.split(".")[0]) for image in images]
    positions = [coords[index] for index in indices]
    correct = []
    for lat, lng in positions:
        array = np.zeros((height, width))
        lat_index = int((lat + 90) / 180 * height)
        lat_index = min(height - 1, lat_index)
        lon_index = int((lng + 180) / 360 * width)
        lon_index = min(width - 1, lon_index)
        for y in range(height):
            for x in range(width):
                # gaussian distribution for heatmap
                dist = (x - lon_index)**2 + (y - lat_index)**2
                array[y, x] = np.exp(-dist / (2 * sigma**2))
        correct.append(array.flatten())
    return np.array(correct)

# Need data in the right shape
# Incoming: [N, 640, 640, 3] 
# Returns: -> [N, 3 x 640 x 640]
def format_frames(frames, patch_size=32):
    N, H, W, C  = frames.shape
    num_patches_h = H // patch_size
    num_patches_w = W // patch_size
    # this took SO much experimentation, but you need to do this to ensure the patches are preserved
    frames = frames.reshape(N, num_patches_h, patch_size, num_patches_w, patch_size, 3)
    frames = frames.transpose(0, 1, 3, 2, 4, 5)
    frames = frames.reshape(N, -1, patch_size, C)
    frames = frames.transpose(0, 3, 1, 2)
    frames = frames.reshape(N, num_patches_h * num_patches_w, -1)
    return frames / 255.0  # normalize to 0-1


train_imgs = load_imgs(train)
train_correct = load_correct(train, OUT_H, OUT_W, sigma=5)
f_frames = format_frames(train_imgs)

model.train(data=f_frames, correct=train_correct, lr=0.01, epochs=100)

Epoch 1/100; best loss: 584.1142595204557
Epoch 2/100; best loss: 581.9623109291438
Epoch 3/100; best loss: 581.9889681924067
Epoch 4/100; best loss: 581.920544916958
Epoch 5/100; best loss: 581.9235405885946
Epoch 6/100; best loss: 582.5073297172912
Epoch 7/100; best loss: 582.6686586881954
Epoch 8/100; best loss: 582.8624524037465
