In [None]:
!pip install matplotlib requests python-dotenv pillow opencv-python

# Download dataset
## Requires the API_KEY environment variable (a Google Street View Static API key) to be set; it may be provided through a .env file

In [None]:
import concurrent.futures
import io
import json
import logging
import os
import sys
import time
from threading import Semaphore

import matplotlib
import matplotlib.pyplot as plt
import requests
from dotenv import load_dotenv
from PIL import Image

# Upper bound on the number of worker threads used per cell by download_images
THREADS = 32
# Value sent as the `size` request parameter (presumably "<width>x<height>" in pixels).
# NOTE(review): the crop step later in this file trims images to 224x224 to remove the
# watermark at the bottom, which is presumably why the height requested here is larger.
IMAGE_SIZE = "224x246"
COORDINATES_PRECISION = 5  # Number of decimals to keep for the coordinates. 5 decimals is about 1 meter precision
# Root directory for downloaded images; main() creates one sub-directory per cell
OUTPUT_DIR = "./streetview_images/"
# Input JSON with candidate locations (format documented in load_locations_from_file)
LOCATIONS_FILE = "./data/locations.json"
# Where save_metadata() writes the metadata of the downloaded images
METADATA_DESTINATION = "./data/metadata.json"
# Scatter plot of the sampled locations written by visualize_generated_locations()
VISUALIZATION_FILE = "./data/visualization.png"
LOGS_FILE = "./data/logs.log"
# Target number of locations per cell; split between the cell's cities in
# proportion to their share of locations (at least one per city)
IMAGES_PER_CELL = 400

# Shared state updated by the worker threads.
# Metadata of every downloaded image, keyed by a coordinate string
downloaded_images = {}
# Number of locations skipped because an image with the same key already existed
collisions = {"exact": 0, "close": 0}
# Failure counters, one per failure reason
fails = {"request_failed": 0, "download_failed": 0, "no_image": 0, "no_lat_lng": 0}

# Load variables from a .env file (if any) into the environment before reading the key
load_dotenv()
api_key = os.getenv("API_KEY")

# Semaphore() starts with a counter of 1, so it is used here as a mutex
# around updates to the shared state above
semaphore = Semaphore()


def main():
    """
    Run the whole download pipeline.

    Steps: configure logging, load and sample the locations, plot them,
    download the images cell by cell, write the metadata file and log a
    summary (totals, collisions, failures, images per cell).

    Raises:
        RuntimeError: if the API_KEY environment variable is not set.
    """
    from collections import Counter

    # logging.FileHandler does not create missing directories
    log_dir = os.path.dirname(LOGS_FILE)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.FileHandler(LOGS_FILE, encoding='utf-8'), logging.StreamHandler(sys.stdout)],
    )

    # Fail fast: without a key every single request would be rejected
    if not api_key:
        logging.error("API_KEY environment variable is not set")
        raise RuntimeError("API_KEY environment variable is not set")

    logging.info('Download images started')

    locations = load_locations_from_file(LOCATIONS_FILE)

    visualize_generated_locations(
        [(location["lat"], location["lng"]) for cell in locations.values() for location in cell]
    )

    # perf_counter is monotonic, so the measured duration is immune to clock changes
    start = time.perf_counter()
    # Use a distinct loop variable so the dict being iterated is not rebound
    for cell_name, cell_locations in locations.items():
        logging.info(f"Downloading images for {cell_name}...")
        output_dir = os.path.join(OUTPUT_DIR, cell_name)
        os.makedirs(output_dir, exist_ok=True)
        download_images(cell_locations, output_dir)
    end = time.perf_counter()
    logging.info(f"Images downloaded in {end - start} seconds")
    logging.info(f"Downloaded images: {len(downloaded_images)}")
    logging.info(f"Collisions: {collisions}")
    logging.info(f"Fails: {fails}")
    save_metadata(METADATA_DESTINATION)

    # Count number of downloaded images in each cell
    downloaded_images_per_cell = Counter(image["cell"] for image in downloaded_images.values())

    # Print number of downloaded images in each cell
    logging.info("Number of downloaded images in each cell:")
    for cell, count in downloaded_images_per_cell.items():
        logging.info(f"{cell}: {count}")


def load_locations_from_file(file_path):
    """
    Load candidate locations from a JSON file and sample them per municipality.

    File format (each "location" is stored as [longitude, latitude]):
    [
        {
            "location": [
                10.0307112,
                55.9739344
            ],
            "municipality": "Odder Kommune",
            "city": "Odder C"
        },
        ...
    ]

    Entries whose municipality is empty or the string "None" are skipped.
    Each municipality is one "cell". Within a cell every city receives a share
    of IMAGES_PER_CELL proportional to its number of locations, but at least
    one and at most all of its locations.

    :param file_path: path of the JSON file to read
    :return: dict mapping cell name to a list of
             {"cell": ..., "lat": ..., "lng": ...} dicts
    """
    # Municipality and city names contain non-ASCII characters, so do not rely
    # on the platform default encoding
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # cell name -> city name -> list of locations
    cities_by_cell = {}
    for item in data:
        cell_name = item['municipality']
        if not cell_name or cell_name == "None":
            continue  # Skip if no municipality
        location = {"cell": cell_name, "lat": item['location'][1], "lng": item['location'][0]}
        cities_by_cell.setdefault(cell_name, {}).setdefault(item['city'], []).append(location)

    sampled_locations = {}
    total_locations = 0
    for cell_name, cities in cities_by_cell.items():
        cell_count = sum(len(city_locations) for city_locations in cities.values())
        cell_sample = []
        for city_locations in cities.values():
            city_count = len(city_locations)
            ratio = city_count / cell_count
            # At least one image per city, never more than the city has
            number_of_locations_to_sample = min(max(round(ratio * IMAGES_PER_CELL), 1), city_count)
            cell_sample += sample_locations(city_locations, number_of_locations_to_sample)

        sampled_locations[cell_name] = cell_sample
        total_locations += len(cell_sample)
        logging.info(f"{cell_name}: {len(cell_sample)} locations sampled")

    logging.info(f"Total locations sampled: {total_locations}")
    return sampled_locations


def sample_locations(locations, number_of_locations_to_sample):
    """
    Uniformly sample locations from a list of locations

    Elements are picked at evenly spaced indices, so the result keeps the
    original order and is deterministic.

    :param locations: list to sample from
    :param number_of_locations_to_sample: how many elements to return
    :return: a new list; empty when the requested number is zero or negative,
             a copy of the input when at least as many elements as available
             are requested
    """
    # Guard against division by zero below
    if number_of_locations_to_sample <= 0:
        return []
    if number_of_locations_to_sample >= len(locations):
        # Return a copy so callers cannot mutate the input through the result
        return list(locations)
    step = len(locations) / number_of_locations_to_sample
    return [locations[int(i * step)] for i in range(number_of_locations_to_sample)]


def visualize_generated_locations(points, polygon=None):
    """
    Plot the given locations, save the plot to VISUALIZATION_FILE and show it.

    :param points: iterable of (lat, lng) tuples
    :param polygon: optional list of (lat, lng) tuples drawn as a closed outline
    """
    # Non-interactive backend: the figure is only rendered to PNG
    matplotlib.use('Agg')

    lats = [point[0] for point in points]  # Latitude goes on the y axis
    lngs = [point[1] for point in points]  # Longitude goes on the x axis

    # Create a plot to visualize the points and the polygon
    fig = plt.figure(figsize=(16, 12))
    try:
        plt.scatter(lngs, lats, c='blue', s=1, alpha=0.1, label='Images locations')
        if polygon:
            y_polygon = [point[0] for point in polygon]
            x_polygon = [point[1] for point in polygon]
            # Repeat the first vertex to close the outline
            plt.plot(x_polygon + [x_polygon[0]], y_polygon + [y_polygon[0]], c='red', label='Region')
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.title('Coordinates of images')
        plt.legend()
        plt.grid(True)

        # Render once and reuse the bytes for both the file and the preview
        fig_io = io.BytesIO()
        plt.savefig(fig_io, format='png', dpi=300)
        png_bytes = fig_io.getvalue()
    finally:
        # pyplot keeps figures alive until they are closed explicitly
        plt.close(fig)

    # Save to file first so the image is kept even if no viewer is available
    output_dir = os.path.dirname(VISUALIZATION_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(VISUALIZATION_FILE, 'wb') as f:
        f.write(png_bytes)

    # Display the saved figure
    Image.open(io.BytesIO(png_bytes)).show()


def download_images(locations, output_dir):
    """
    Download the images for all given locations using a thread pool.

    An exception raised for one location is logged and does not stop the
    remaining downloads.

    :param locations: list of location dicts (see load_locations_from_file)
    :param output_dir: directory the images are written to
    """
    # ThreadPoolExecutor rejects max_workers=0, so there is nothing to do for an empty cell
    if not locations:
        return

    with concurrent.futures.ThreadPoolExecutor(max_workers=min(THREADS, len(locations))) as executor:
        futures = {
            executor.submit(verify_and_download_image, location, output_dir): location
            for location in locations
        }
        # Collect the results so that exceptions raised in workers are not silently dropped
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception:
                logging.exception(f"Unexpected error while processing location {futures[future]}")


def verify_and_download_image(location, output_dir):
    """
    Look up the panorama nearest to a location and download it once.

    The coordinates reported by the metadata endpoint (not the requested ones)
    identify the image. The entry in downloaded_images is reserved under the
    lock before the download starts, so two threads resolving to the same
    panorama never download it twice; the reservation is dropped again if the
    download fails.

    :param location: dict with "lat", "lng" and "cell"
    :param output_dir: directory the image is written to
    """
    lat, lng, pano_id = check_street_view_image_existence(lat_lng_to_string(location['lat'], location['lng']))
    if lat is None or lng is None:
        return

    cell_name = location['cell']
    # Rounded key: panoramas closer than the configured precision count as the same image
    key = lat_lng_to_key(lat, lng)

    # "with" releases the lock even if logging or the dict update raises
    with semaphore:
        if is_collision(lat, lng, cell_name):
            return
        downloaded_images[key] = {
            "lat": lat,
            "lng": lng,
            "pano_id": pano_id,
            "heading": 0,
            "cell": cell_name
        }

    downloaded = False
    try:
        downloaded = download_street_view_image(lat, lng, IMAGE_SIZE, 0, output_dir)
    finally:
        if not downloaded:
            # No file was written, so the metadata must not list this image
            with semaphore:
                downloaded_images.pop(key, None)


def is_collision(lat, lng, cell_name):
    """
    Check if an image for the given location already exists

    Must be called while holding the module-level semaphore because it reads
    downloaded_images and updates the collisions counters.

    :return: True if an image with the same rounded key is already registered
    """
    key = lat_lng_to_key(lat, lng)
    existing = downloaded_images.get(key)
    if existing is None:
        return False

    # Same key with identical coordinates is the same panorama; otherwise it
    # is a different one within the rounding distance
    if existing["lat"] == lat and existing["lng"] == lng:
        collisions["exact"] += 1
    else:
        collisions["close"] += 1
    logging.warning(
        f"Image for {key} already downloaded ({existing}); new image: lat={lat}, lng={lng}, cell={cell_name}")
    return True


def check_street_view_image_existence(location, timeout=30):
    """
    Ask the Street View metadata endpoint whether imagery exists for a location.

    Every failure is logged and counted in the matching fails counter.

    :param location: "lat,lng" string
    :param timeout: seconds to wait for the HTTP request
    :return: (lat, lng, pano_id) of the panorama that was found, or
             (None, None, None) if there is none or the request failed
    """
    base_url = "https://maps.googleapis.com/maps/api/streetview/metadata"
    params = {
        "location": location,
        "key": api_key,
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Log only the exception type: its message can contain the URL including the API key
        logging.warning(f"Request failed. Error {type(exc).__name__}; image: {location}")
        with semaphore:
            fails["request_failed"] += 1
        return None, None, None  # Request was not successful or failed

    if response.status_code != 200:
        logging.warning(f"Request failed. Status code {response.status_code}; image: {location}")
        with semaphore:
            fails["request_failed"] += 1
        return None, None, None  # Request was not successful or failed

    try:
        metadata = response.json()
    except ValueError:
        logging.warning(f"Request failed. Invalid JSON in response; image: {location}")
        with semaphore:
            fails["request_failed"] += 1
        return None, None, None  # Response body could not be parsed

    # Check the "status" field in the metadata
    metadata_loc = metadata.get("location")
    if metadata.get("status") != "OK" or not metadata_loc:
        logging.warning(f"No image found. Status code {metadata.get('status')}; image: {location}")
        with semaphore:
            fails["no_image"] += 1
        return None, None, None  # Street View image does not exist

    lat = metadata_loc.get("lat")
    lng = metadata_loc.get("lng")
    # Compare with None: 0.0 is a valid coordinate
    if lat is None or lng is None:
        logging.warning(f"No latitude or longitude found in metadata ({metadata}); image: {location}")
        with semaphore:
            fails["no_lat_lng"] += 1
        return None, None, None  # Street View image does not exist

    return lat, lng, metadata.get("pano_id")


def download_street_view_image(lat, lng, size, heading=0, output_dir="./", timeout=30):
    """
    Download one Street View image and store it as "<lat>,<lng>.jpg".

    :param lat: latitude of the panorama
    :param lng: longitude of the panorama
    :param size: value of the `size` request parameter, e.g. "224x246"
    :param heading: value of the `heading` request parameter
    :param output_dir: directory the image is written to
    :param timeout: seconds to wait for the HTTP request
    :return: True if the image was saved, False otherwise (the failure is
             logged and counted in fails["download_failed"])
    """
    base_url = "https://maps.googleapis.com/maps/api/streetview"
    location = lat_lng_to_string(lat, lng)

    # Define the parameters for the request
    params = {
        "location": location,
        "size": size,
        "heading": heading,
        "pitch": 0,
        "fov": 90,
        "source": "outdoor",
        # Lower-case literal as documented by the API; a Python bool would be sent as "True"
        "return_error_code": "true",
        "key": api_key,
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Log only the exception type: its message can contain the URL including the API key
        logging.warning(f"Image download failed. Error {type(exc).__name__}; image: {location}")
        with semaphore:
            fails["download_failed"] += 1
        return False

    if response.status_code != 200:
        # Do not log response.url: it contains the API key
        logging.warning(f"Image download failed. Status code {response.status_code}; image: {location}")
        with semaphore:
            fails["download_failed"] += 1
        return False

    # Save the image to the specified directory
    filepath = os.path.join(output_dir, location + ".jpg")
    with open(filepath, "wb") as f:
        f.write(response.content)
    return True


def lat_lng_to_string(lat, lng):
    """Format a coordinate pair as "lat,lng" without rounding."""
    return "{},{}".format(lat, lng)


def lat_lng_to_key(lat, lng):
    """Format a coordinate pair as "lat,lng", each rounded to COORDINATES_PRECISION decimals."""
    rounded_lat = round(lat, COORDINATES_PRECISION)
    rounded_lng = round(lng, COORDINATES_PRECISION)
    return "{},{}".format(rounded_lat, rounded_lng)


def save_metadata(output_file):
    """
    Write the metadata of all downloaded images to a JSON file.

    :param output_file: path of the JSON file; missing parent directories are created
    """
    # Create the output directory if it does not exist. dirname is "" for a
    # bare file name, and os.makedirs("") would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(downloaded_images, f)


# Run the download pipeline only when this code is executed as the main module
if __name__ == "__main__":
    main()

## Remove "not found" images (empty images returned for locations without available imagery)

In [None]:
"""
Deletes images saying "Sorry, we have no imagery here" and updates the metadata file accordingly.
"""
import json
import os
import shutil

# Paths and settings used by the clean-up script below

# Metadata JSON file that is read and then rewritten
METADATA_DESTINATION = "./data/metadata.json"
# Directory tree that is scanned for downloaded images
IMAGES_PATH = 'streetview_images/'
# Removed images are copied here first, keeping their relative paths
BACKUP_PATH = 'streetview_images_backup/'
# Number of decimals kept by lat_lng_to_key when building metadata keys
COORDINATES_PRECISION = 5


def lat_lng_to_key(lat, lng):
    """Build the "lat,lng" key with both values rounded to COORDINATES_PRECISION decimals."""
    rounded = (round(lat, COORDINATES_PRECISION), round(lng, COORDINATES_PRECISION))
    return "{},{}".format(*rounded)


print("Deleting images...")

# Empty images have specific size. It never happened for us that another image had the same size - it is rare enough that we can ignore it
EMPTY_IMAGE_SIZE = 3810

# Open metadata file
with open(METADATA_DESTINATION, 'r') as f:
    metadata = json.load(f)

for root, dirs, files in os.walk(IMAGES_PATH):
    for file in files:
        if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        image_path = os.path.join(root, file)
        if os.path.getsize(image_path) != EMPTY_IMAGE_SIZE:
            continue

        # Save an image copy to a backup folder
        output_path = os.path.join(BACKUP_PATH, os.path.relpath(image_path, IMAGES_PATH))
        # Make dirs if they don't exist
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        shutil.copy(image_path, output_path)

        os.remove(image_path)

        # The file name is "<lat>,<lng>". The metadata may be keyed either by
        # that exact string or by the rounded key, so try both instead of
        # failing with KeyError.
        lat_lng = os.path.splitext(file)[0]
        candidate_keys = [lat_lng]
        parts = lat_lng.split(",")
        try:
            candidate_keys.append(lat_lng_to_key(float(parts[0]), float(parts[1])))
        except (ValueError, IndexError):
            pass  # File name is not a coordinate pair; only the exact name can match

        for key in candidate_keys:
            if key in metadata:
                del metadata[key]
                break
        else:
            print(f"No metadata entry found for {image_path}")

# Save the updated metadata file
with open(METADATA_DESTINATION, 'w') as f:
    json.dump(metadata, f, indent=4)



print("Deleting completed.")

## Crop images

In [None]:
"""
Script cropping downloaded images to 224x224 pixels. Only the bottom part is removed as it contains a watermark.
"""

import os
import cv2
import numpy

# Directory tree containing the downloaded images
path = 'streetview_images/'
# Destination of the cropped images; it equals the source, so images are overwritten in place
path_cropped = 'streetview_images/'

# Size of the images after cropping, in pixels
target_width = 224
target_height = 224

# Create the directory if it does not exist (exist_ok avoids a check-then-create race)
os.makedirs(path_cropped, exist_ok=True)


print("Cropping and resizing images...")

for root, dirs, files in os.walk(path):
    for file in files:
        if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        # Construct the full path to the image
        image_path = os.path.join(root, file)

        # Read the raw bytes with numpy and decode them in memory: paths may
        # contain special characters that cv2.imread does not support.
        # numpy.fromfile also closes the file again.
        raw_data = numpy.fromfile(image_path, dtype=numpy.uint8)
        image = cv2.imdecode(raw_data, cv2.IMREAD_UNCHANGED)
        if image is None:
            continue  # Not a decodable image

        # Get the dimensions of the image. Only the first two entries are
        # used because grayscale images have no channel axis.
        height, width = image.shape[:2]

        # Keep the top target_height rows: the removed bottom part contains the watermark
        cropped_image = image[:min(height, target_height), :width]

        # Resize the cropped image to the target size
        cropped_image = cv2.resize(cropped_image, (target_width, target_height))

        # Construct the output path maintaining the same directory structure
        output_path = os.path.join(path_cropped, os.path.relpath(image_path, path))
        # Make dirs if they don't exist
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Save the cropped image (may contain special characters and cv2 does not support that) Do not use cv2.imwrite
        encoded_ok, encoded_image = cv2.imencode(os.path.splitext(output_path)[1], cropped_image)
        if not encoded_ok:
            print(f"Could not encode {output_path}")
            continue
        encoded_image.tofile(output_path)

print("Cropping and resizing completed.")
