In [None]:
from typing import TypedDict
import csv
import kagglehub
import numpy as np
import os
import random
import re

class Sample(TypedDict):
    path: str
    lat: float
    lon: float
    image: np.ndarray
    target: np.ndarray

def coordinates_from_path(filepath):
    _, country, filename = filepath.split("/")[-3:]
    parts = re.split(r'[\s_d]|h|\.jpg', filename.replace("n", "-"))
    lat_deg, lat_dec, lon_deg, lon_dec, _, heading, _ = parts
    lat = float(f"{lat_deg}.{lat_dec}")
    lon = float(f"{lon_deg}.{lon_dec}")
    heading = int(heading)
    return (country, lat, lon, heading)

def load_images(path, output_csv="coordinates.csv"):
    items = []
    coordinates_dict = {}
    
    for country in next(os.walk(path))[1]:
        for image in os.listdir(f"{path}/{country}"):
            if "jpg" not in image: continue
            filepath = f"{path}/{country}/{image}"
            items.append(filepath)
            coords = coordinates_from_path(filepath)
            _, lat, lon, _ = coords
            coordinates_dict[filepath] = (lat, lon)
    
    random.shuffle(items)
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['country', 'lat', 'lon', 'heading'])
        for filepath in items:
            lat, lon = coordinates_dict[filepath]
            country = filepath.split("/")[-2]
            writer.writerow([country, lat, lon, 0])

    return items

def train_test_split(paths, split=0.75, max=None):
    paths = paths[:max] if max is not None else paths
    split_index = int(len(paths) * split)
    train, test = paths[:split_index], paths[split_index:]
    return train, test


# 10,000 images, zero-indexed.png
path = kagglehub.dataset_download("sylshaw/streetview-by-country")
# print(path)
path = f"{path}/streetview_images"
images = load_images(path)
train, test = train_test_split(images, split=0.8)

print(f"{len(train)} images for train, {len(test)} images for test, {len(images)} total!")

# The coordinates for the output-frame (how high resolution should the probability distribution be?)
OUT_H, OUT_W = 32, 64
EPSILON = 1e-12 # value for ensuring model does not hit 0

16000 images for train, 4000 images for test, 107269 total!


In [7]:
from typing import Literal
# I call the inputs (incoming nodes) 'x', and outgoing (nodes that I affect) 'o'

class Layer():
    compiled = False
    len = None
    x = None
    o = None
    mode = "train"
    
    def compile(self, inputs: int = None):
        if self.len is None:
            if inputs is None:
                raise Exception("The layer has an undefined size")
            self.len = inputs
        self.compiled = True
    
    def loss(self, lr):
        pass

    def set_mode(self, mode: Literal["train", "test"]):
        self.mode = mode

    def __len__(self):
        return self.len
        
# ReLU and LeakyReLU together
class ReLU(Layer):
    def __init__(self, c=0):
        self.c = c

    def forward(self, x):
        self.x = x
        self.o = np.where(x > 0, x, self.c * x)
        return self.o
    
    def backward(self, dl):
        return dl * np.where(self.x > 0, 1, self.c)

    def __repr__(self):
        return f"ReLU layer, with c={self.c}"


class Sigmoid(Layer):
    def forward(self, x):
        self.x = x
        self.o = 1 / (1 + np.exp(-x))
        return self.o
    
    def backward(self, dl):
        return dl * self.o * (1 - self.o)
    
    def __repr__(self):
        return "Sigmoid layer"


class PatchLinear(Layer):
    def __init__(self, outputs, wpn, decay):
        self.len = outputs
        self.wpn = wpn
        self.decay = decay

    def compile(self, _=None):
        if _ is not None:
            raise Exception("This layer must be the first layer")
            
        self.weights = np.random.randn(self.len, self.wpn) * (1 / np.sqrt(self.wpn)) # N x Inputs
        self.bias = np.zeros(self.len)  # N x 1
        self.compiled = True

    def forward(self, x):
        self.x = x
        self.o = []
        for patch, weights in zip(x, self.weights):
            self.o.append(patch.T @ weights)  # dot product, returns float
        return np.array(self.o) + self.bias

    def backward(self, dl):
        self.dW = np.zeros_like(self.weights)
        self.db = np.zeros_like(self.bias)
        dx = np.zeros_like(self.x) 

        for i, (patch, w) in enumerate(zip(self.x, self.weights)):
            self.dW[i] = dl[i] * patch
            self.db[i] = dl[i]
            dx[i] = dl[i] * w
        return dx

    def loss(self, lr):
        self.bias -= lr * self.db
        self.weights -= lr * (self.dW + self.decay * self.weights)

    def __repr__(self):
        return f"Patch Linear layer, with {self.wpn} weights per neuron and {self.len} outputs"


class Dense(Layer):
    def __init__(self, outputs, decay):
        self.len = outputs
        self.decay = decay

    def compile(self, inputs):
        if inputs is None and self.inputs is None:
            raise Exception("This layer has an undefined size")
        self.inputs = inputs
        self.weights = np.random.randn(self.len, inputs) / np.sqrt(inputs) # N x Inputs
        self.bias = np.zeros(self.len) # N x 1
        self.compiled = True

    def forward(self, x):
        self.x = x
        self.o = self.weights @ x + self.bias
        return self.o
            
    def backward(self, dl):
        self.dW = np.outer(dl, self.x) # (N) x (Inputs)
        self.db = dl # x 1
        return self.weights.T @ dl
    
    def loss(self, lr):
        self.bias -= lr * self.db
        self.weights -= lr * (self.dW + self.decay * self.weights)

        
    def __repr__(self):
        return f"Dense layer, with {self.inputs} inputs and {self.len} outputs"

class Softmax(Layer):
    def forward(self, x):
        self.x = x
        x_shift = x - np.max(x)
        exp = np.exp(x_shift)
        self.o = exp / np.sum(exp)
        return self.o

    def backward(self, dl):
        return self.o * (dl - np.dot(dl, self.o))

    def __repr__(self):
        return "Softmax layer"

class Dropout(Layer):
    def __init__(self, p=0.5):
        self.p = p
        self.mask = None

    def forward(self, x):
        if self.mode == "test":
            return x
        self.mask = np.random.binomial(n=1, p=1-self.p, size=x.shape)
        self.mask = self.mask * (1 / (1 - self.p))
        self.o = self.mask
        return self.o

    def backward(self, dl):
        return dl * self.mask

    def __repr__(self):
        return f"Dropout layer, with p={self.p}"


In [None]:
from typing import List
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image
from datetime import datetime

class Model:
    def __init__(self, decay=1e-5):
        self.layers: List[Layer] = []
        self.decay = decay

    def sigmoid(self):
        self.layers.append(Sigmoid())
        return self
    
    def relu(self):
        self.layers.append(ReLU())
        return self

    def leaky_relu(self, c: float):
        self.layers.append(ReLU(c=c))
        return self
    
    def patch_linear(self, outputs, wpn):
        self.layers.append(PatchLinear(outputs, wpn, self.decay))
        return self

    def dense(self, outputs):
        self.layers.append(Dense(outputs, self.decay))
        return self

    def softmax(self):
        self.layers.append(Softmax())
        return self
    
    def dropout(self, p):
        self.layers.append(Dropout(p))
        return self
    
    def compile(self):
        inputs = None
        for layer in self.layers:
            layer.compile(inputs)
            inputs = len(layer)

    def set_mode(self, mode: Literal["train", "test"]):
        for layer in self.layers:
            layer.set_mode(mode)

    def infer(self, test_gen, test_count):
        self.set_mode("test")
        random_indices = np.random.randint(0, test_count, size=10)
        loss_list = []
        l2_loss_list = []
        show_samples = []
        
        for index, sample in enumerate(tqdm(test_gen(), total=test_count, desc="Testing on novel images:")):
            inputs = sample["image"]
            target = sample["target"]
            for layer in self.layers:
                inputs = layer.forward(inputs)

            diff = inputs - target
            loss = np.mean(np.square(diff)) 
            loss_list.append(loss)

            l2_loss = 0
            for layer in self.layers:
                if hasattr(layer, "weights"):
                    l2_loss += 0.5 * self.decay * np.sum(layer.weights ** 2)
            l2_loss_list.append(l2_loss)

            if index in random_indices:
                show_samples.append({**sample, "output": inputs, "loss": loss})

        for show_sample in show_samples:
            flat_index = show_sample["output"].argmax()
            lat_index = flat_index // OUT_W
            lon_index = flat_index % OUT_W 
            guess_lat = ((lat_index + 0.5) / OUT_H) * 180 - 90
            guess_lng = ((lon_index + 0.5) / OUT_W) * 360 - 180

            plt.imshow(show_sample["target"].reshape(OUT_H, OUT_W), cmap='hot', origin='lower')
            print(f"MODEL AVERAGE LOSS (NOVEL): {np.mean(loss_list)}")
            print(f"TARGET CORRECT (lat: {show_sample['lat']}, lng: {show_sample['lon']})")
            plt.title(f"TARGET CORRECT")
            plt.show()
            print(f"MODEL GUESS (guess lat: {guess_lat}, guess lng: {guess_lng})")
            plt.title(f"MODEL GUESS (loss: {show_sample['loss']})")
            plt.imshow(show_sample["output"].reshape(OUT_H, OUT_W), cmap='hot', origin='lower')
            plt.show()
            plt.title(f"REAL IMAGE")
            plt.imshow(Image.open(show_sample["path"]))
            plt.show()

        plt.hist(loss_list, bins=30)
        plt.xlabel("Loss")
        plt.ylabel("Frequency")
        plt.title("NOVEL IMAGE LOSS HISTOGRAM")
        plt.show()
        plt.hist(l2_loss_list, bins=30)
        plt.xlabel("L2 Loss")
        plt.ylabel("Frequency")
        plt.title("NOVEL IMAGE L2 LOSS HISTOGRAM")
        plt.show()

                
    def train_epoch(self, train_gen, lr=1e-3):
        self.set_mode("train")
        
        loss_list = []
        l2_loss_list = []
        count = 0
        for sample in train_gen:
            inputs = sample["image"]
            target = sample["target"]

            if len(self.layers) == 0:
                raise Exception("No layers to train")

            if inputs.shape[0] != len(self.layers[0]):
                raise Exception("Data/input-layer shape mismatch")

            # forward pass
            for layer in self.layers:
                if not layer.compiled:
                    raise Exception("Please compile your layers!")
                inputs = layer.forward(inputs)

            if inputs.shape != target.shape:
                raise Exception("Output-layer/target shape mismatch")
                
            diff = inputs - target
            loss = np.mean(np.square(diff)) # MSE 
            loss_list.append(loss)

            # L2 regularization
            l2_loss = 0
            for layer in self.layers:
                if hasattr(layer, "weights"):
                    l2_loss += 0.5 * self.decay * np.sum(layer.weights ** 2)
            l2_loss_list.append(l2_loss)

            if count % 1000 == 0:
                print(f"Current model loss: {loss}")
                print(f"Current model l2 loss: {l2_loss}")
            count += 1
            
            # backwards pass
            N = inputs.shape[0]
            dl = 2 * (inputs - target) / N
            for index in range(len(self.layers) - 1, -1, -1):
                layer = self.layers[index]
                dl = layer.backward(dl)

            # update loss on weights
            for layer in self.layers:
                layer.loss(lr)

        return loss_list, l2_loss_list


    def log(self):
        for layer in self.layers:
            print(layer)
    
    def save(self, filepath):
        state = {}
        for i, layer in enumerate(self.layers):
            if hasattr(layer, 'weights') and hasattr(layer, 'bias'):
                state[i] = {'weights': layer.weights, 'bias': layer.bias}
        np.savez(filepath, state=state)
    
    def load(self, filepath):
        data = np.load(filepath, allow_pickle=True)
        state = data['state'].item()
        for i, layer in enumerate(self.layers):
            if hasattr(layer, 'weights') and hasattr(layer, 'bias'):
                layer.weights = state[i]['weights']
                layer.bias = state[i]['bias']

In [9]:
model = (
    Model()
    .patch_linear(400, wpn=3072)  # 20 x 20 patches, 3072 weights per neuron
    .leaky_relu(0.01)
    .dense(640)
    .leaky_relu(0.01)
    .dropout(0.2)
    .dense(640)
    .leaky_relu(0.01)
    .dropout(0.2)
    .dense(1200)
    .leaky_relu(0.01)
    .dropout(0.3)
    .dense(OUT_H * OUT_W) # broadcast back up to heatmap coordinates
    .softmax()
)

model.compile()
model.log()

Patch Linear layer, with 3072 weights per neuron and 400 outputs
ReLU layer, with c=0.01
Dense layer, with 400 inputs and 640 outputs
ReLU layer, with c=0.01
Dropout layer, with p=0.2
Dense layer, with 640 inputs and 640 outputs
ReLU layer, with c=0.01
Dropout layer, with p=0.2
Dense layer, with 640 inputs and 1200 outputs
ReLU layer, with c=0.01
Dropout layer, with p=0.3
Dense layer, with 1200 inputs and 2048 outputs
Softmax layer


In [None]:
def create_target(lat, lon, height, width, sigma=1.0):
    array = np.zeros((height, width))
    lat_index = int((lat + 90) / 180 * height)
    lat_index = min(height - 1, lat_index)
    lon_index = int((lon + 180) / 360 * width)
    lon_index = min(width - 1, lon_index)
    for y in range(height):
        for x in range(width):
            # gaussian distribution for heatmap
            dist = (x - lon_index)**2 + (y - lat_index)**2
            array[y, x] = np.exp(-dist / (2 * sigma**2))
    array = array.flatten()
    normalized = array / array.sum()
    normalized = np.clip(normalized, 1e-12, None)
    normalized = normalized / normalized.sum()
    return normalized

def load_samples(images, height, width, sigma=1.0):
    # Count valid samples without loading images
    count = sum(1 for img_path in images if coordinates_from_path(img_path) is not None)
    
    def gen():
        for img_path in images:
            coords = coordinates_from_path(img_path)
            if not coords:
                continue
            _, lat, lon, _ = coords
            image = Image.open(img_path)
            target = create_target(lat, lon, height, width, sigma)
            yield Sample(
                path=img_path,
                lat=lat,
                lon=lon,
                image=format_frame(np.asarray(image)),
                target=target
            )
    
    return gen, count

# Need data in the right shape
# Incoming: [640, 640, 3] 
# Returns: -> [3 x 640 x 640]
def format_frame(frame, patch_size=32):
    H, W, C  = frame.shape
    num_patches_h = H // patch_size
    num_patches_w = W // patch_size
    # this took SO much experimentation, but you need to do this to ensure the patches are preserved
    frame = frame.reshape(num_patches_h, patch_size, num_patches_w, patch_size, 3)
    frame = frame.transpose(0, 2, 1, 3, 4)
    frame = frame.reshape(-1, patch_size, C)
    frame = frame.transpose(2, 0, 1)
    frame = frame.reshape(num_patches_h * num_patches_w, -1)
    return frame / 255.0  # normalize to 0-1


SIGMA = 1.4
train_gen, train_count = load_samples(train, OUT_H, OUT_W, sigma=SIGMA)
test_gen, test_count = load_samples(test, OUT_H, OUT_W, sigma=SIGMA)

epochs = 6
lrs = [1e-2, 1e-2, 1e-3, 1e-3, 1e-4, 1e-4]
for epoch, lr in zip(range(epochs), lrs):
    loss_list, l2_loss_list = model.train_epoch(tqdm(train_gen(), total=train_count, desc=f"Epoch {epoch + 1}/{epochs}"), lr=1e-2)
    print(f"Epoch {epoch + 1}/{epochs}; avg loss: {np.mean(loss_list)}, avg l2 loss: {np.mean(l2_loss_list)}")
    plt.hist(loss_list, bins=60)
    plt.xlabel("Loss")
    plt.ylabel("Frequency")
    plt.title(f"EPOCH {epoch + 1} TRAIN LOSS HISTOGRAM")
    plt.show()
    plt.hist(loss_list, bins=60)
    plt.xlabel("L2 Loss")
    plt.ylabel("Frequency")
    plt.title(f"EPOCH {epoch + 1} TRAIN L2 LOSS HISTOGRAM")
    plt.show()
    model.infer(test_gen, test_count)
    model.save(f'models/model_{OUT_H}x{OUT_W}_e{epoch + 1}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.npz')

Epoch 1/6:   0%|          | 3/16000 [00:00<11:49, 22.56it/s]

Current model loss: 2.0443429382652322e-05
Current model l2 loss: 0.024645926194287675


Epoch 1/6:   6%|▋         | 1005/16000 [00:47<11:27, 21.82it/s]

Current model loss: 2.032025842810546e-05
Current model l2 loss: 0.024640996813368128


Epoch 1/6:  13%|█▎        | 2003/16000 [01:36<08:51, 26.31it/s]

Current model loss: 2.025014654400129e-05
Current model l2 loss: 0.024636068417175312


Epoch 1/6:  16%|█▋        | 2640/16000 [02:07<09:12, 24.20it/s]

In [None]:
# train_gen, train_count = load_samples(train, OUT_H, OUT_W, sigma=2)
# model.load('models/model_32x64_e1_2025-11-21_23-50-11.npz')
# model.infer(train_gen, train_count)